diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu index efc6dbbd6a..47504f621e 100644 --- a/lib/gpu/cmm_cut_gpu_kernel.cu +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -75,6 +75,10 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #endif +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { return j >> SBBITS & 3; } + __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, @@ -114,12 +118,9 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, for ( ; nbor> SBBITS & 3; } + __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, @@ -139,14 +143,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, int j=*nbor; numtyp factor_lj, factor_coul; - if (j < nall) { - factor_lj = (numtyp)1.0; - factor_coul = (numtyp)0.0; - } else { - factor_lj = sp_lj[j/nall]; - factor_coul = (numtyp)1.0-sp_lj[j/nall+4]; - j %= nall; - } + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; @@ -282,14 +282,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, int j=*nbor; numtyp factor_lj, factor_coul; - if (j < nall) { - factor_lj = (numtyp)1.0; - factor_coul = (numtyp)0.0; - } else { - factor_lj = sp_lj[j/nall]; - factor_coul = (numtyp)1.0-sp_lj[j/nall+4]; - j %= nall; - } + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + numtyp4 jx=fetch_pos(j,x_); //x_[j]; int mtype=itype+jx.w; diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu index 47dad50753..6ba6eaedca 100644 --- a/lib/gpu/crml_gpu_kernel.cu +++ b/lib/gpu/crml_gpu_kernel.cu @@ -93,6 +93,10 @@ __inline float fetch_q(const int& i, const float *q) #endif +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { return j >> SBBITS & 3; } + __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, @@ -142,14 +146,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, int j=*nbor; numtyp factor_lj, factor_coul; - if (j < nall) { - factor_lj = (numtyp)1.0; - factor_coul = (numtyp)0.0; - } else { - factor_lj = sp_lj[j/nall]; - factor_coul = (numtyp)1.0-sp_lj[j/nall+4]; - j %= nall; - } + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; @@ -287,14 +287,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, int j=*nbor; numtyp factor_lj, factor_coul; - if (j < nall) { - factor_lj = (numtyp)1.0; - factor_coul = (numtyp)0.0; - } else { - factor_lj = sp_lj[j/nall]; - factor_coul = (numtyp)1.0-sp_lj[j/nall+4]; - j %= nall; - } + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; @@ -390,4 +386,3 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, } #endif - diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu index 347e6ede35..b8d06ec6da 100644 --- a/lib/gpu/gb_gpu_kernel.cu +++ b/lib/gpu/gb_gpu_kernel.cu @@ -22,6 +22,10 @@ #include "gb_gpu_extra.h" #endif +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { return j >> SBBITS & 3; } + __inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, numtyp ans[9]) { @@ -142,12 +146,9 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q, for ( ; nbor> SBBITS & 3; } + __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q, __global numtyp4* shape,__global numtyp4* well, __global numtyp *gum, __global numtyp2* sig_eps, @@ -68,12 +72,9 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q, for ( ; nbor> SBBITS & 3; } + __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, @@ -114,12 +118,9 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, for ( ; nbor> SBBITS & 3; } + __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, @@ -114,12 +118,9 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, for ( ; nbor> SBBITS & 3; } + __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, @@ -131,14 +135,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, int j=*nbor; numtyp factor_lj, factor_coul; - if (j < nall) { - factor_lj = (numtyp)1.0; - factor_coul = (numtyp)1.0; - } else { - factor_lj = sp_lj[j/nall]; - factor_coul = sp_lj[j/nall+4]; - j %= nall; - } + factor_lj = sp_lj[sbmask(j)]; + factor_coul = sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; @@ -260,14 +260,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, int j=*nbor; numtyp factor_lj, factor_coul; - if (j < nall) { - factor_lj = (numtyp)1.0; - factor_coul = (numtyp)1.0; - } else { - factor_lj = sp_lj[j/nall]; - factor_coul = sp_lj[j/nall+4]; - j %= nall; - } + factor_lj = sp_lj[sbmask(j)]; + factor_coul = sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + numtyp4 jx=fetch_pos(j,x_); //x_[j]; int mtype=itype+jx.w; diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index be2ae069e4..a0b27f0259 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -93,6 +93,10 @@ __inline float fetch_q(const int& i, const float *q) #endif +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { return j >> SBBITS & 3; } + __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, @@ -139,14 +143,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, int j=*nbor; numtyp factor_lj, factor_coul; - if (j < nall) { - factor_lj = (numtyp)1.0; - factor_coul = (numtyp)0.0; - } else { - factor_lj = sp_lj[j/nall]; - factor_coul = (numtyp)1.0-sp_lj[j/nall+4]; - j %= nall; - } + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; @@ -274,14 +274,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, int j=*nbor; numtyp factor_lj, factor_coul; - if (j < nall) { - factor_lj = (numtyp)1.0; - factor_coul = (numtyp)0.0; - } else { - factor_lj = sp_lj[j/nall]; - factor_coul = (numtyp)1.0-sp_lj[j/nall+4]; - j %= nall; - } + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + numtyp4 jx=fetch_pos(j,x_); //x_[j]; int mtype=itype+jx.w; diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu index a08a26800a..bcf41c0050 100644 --- a/lib/gpu/pair_gpu_build_kernel.cu +++ b/lib/gpu/pair_gpu_build_kernel.cu @@ -57,6 +57,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define CELL_BLOCK_SIZE 64 #define BLOCK_2D 8 +#define SBBITS 30 + __kernel void transpose(int *out, int *in, int columns_in, int rows_in) { __local float block[BLOCK_2D][BLOCK_2D+1]; @@ -279,16 +281,16 @@ __kernel void kernel_special(__global int *dev_nbor, int offset=ii; for (int i=0; i=n1) - nbor+=nall; + which++; if (i>=n2) - nbor+=nall; + which++; + nbor=nbor ^ (which << SBBITS); + *list=nbor; } offset+=nt; } - if (nbor>=nall) - *list=nbor; } } // if ii }