Replacing mul24 with fast_mul in device code.
This commit is contained in:
@ -147,7 +147,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
@ -184,7 +184,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
|
||||
@ -216,7 +216,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
|
||||
@ -25,7 +25,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
|
||||
#define atom_info(t_per_atom, ii, tid, offset) \
|
||||
tid=THREAD_ID_X; \
|
||||
offset=tid & (t_per_atom-1); \
|
||||
ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
|
||||
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
|
||||
|
||||
#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
|
||||
i, numj, stride, list_end, nbor) \
|
||||
@ -34,9 +34,9 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
|
||||
nbor+=nbor_stride; \
|
||||
numj=*nbor; \
|
||||
nbor+=nbor_stride; \
|
||||
list_end=nbor+mul24(nbor_stride,numj); \
|
||||
nbor+=mul24(offset,nbor_stride); \
|
||||
stride=mul24(t_per_atom,nbor_stride);
|
||||
list_end=nbor+fast_mul(nbor_stride,numj); \
|
||||
nbor+=fast_mul(offset,nbor_stride); \
|
||||
stride=fast_mul(t_per_atom,nbor_stride);
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
|
||||
eflag, vflag, ans, engv) \
|
||||
|
||||
@ -37,12 +37,12 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
__global int *list_end=nbor+mul24(numj,nbor_pitch);
|
||||
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
|
||||
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24(iw,ntypes);
|
||||
int itype=fast_mul(iw,ntypes);
|
||||
int newj=0;
|
||||
for ( ; nbor<list_end; nbor+=nbor_pitch) {
|
||||
int j=*nbor;
|
||||
@ -90,7 +90,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
|
||||
cutsq[ii]=cut_form[ii].x;
|
||||
form[ii]=cut_form[ii].y;
|
||||
}
|
||||
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
|
||||
ii+=fast_mul((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
@ -99,12 +99,12 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
__global int *list_end=nbor+mul24(numj,nbor_pitch);
|
||||
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
|
||||
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
int newj=0;
|
||||
for ( ; nbor<list_end; nbor+=nbor_pitch) {
|
||||
|
||||
@ -196,7 +196,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
kappa[1]*=r;
|
||||
kappa[2]*=r;
|
||||
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
int mtype=fast_mul(ntypes,itype)+jtype;
|
||||
numtyp sigma = sig_eps[mtype].x;
|
||||
numtyp epsilon = sig_eps[mtype].y;
|
||||
numtyp varrho = sigma/(h12+gum[0]*sigma);
|
||||
|
||||
@ -131,7 +131,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
kappa[1]*=r;
|
||||
kappa[2]*=r;
|
||||
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
int mtype=fast_mul(ntypes,itype)+jtype;
|
||||
numtyp sigma = sig_eps[mtype].x;
|
||||
numtyp epsilon = sig_eps[mtype].y;
|
||||
numtyp varrho = sigma/(h12+gum[0]*sigma);
|
||||
@ -357,7 +357,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
@ -139,7 +139,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
@ -140,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
@ -178,7 +178,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
|
||||
@ -171,7 +171,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
|
||||
@ -176,7 +176,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
|
||||
@ -142,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
@ -140,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
@ -21,7 +21,7 @@ __kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
|
||||
const int inum, const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int offset=tid & (t_per_atom-1);
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
|
||||
int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_nbor+ii+inum;
|
||||
@ -30,8 +30,8 @@ __kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
|
||||
__global int *list=dev_ij+*nbor;
|
||||
__global int *list_end=list+numj;
|
||||
list+=offset;
|
||||
nbor+=mul24(ii,t_per_atom-1)+offset;
|
||||
int stride=mul24(t_per_atom,inum);
|
||||
nbor+=fast_mul(ii,t_per_atom-1)+offset;
|
||||
int stride=fast_mul(t_per_atom,inum);
|
||||
|
||||
for ( ; list<list_end; list++) {
|
||||
*nbor=*list;
|
||||
|
||||
@ -217,7 +217,7 @@ __kernel void kernel_special(__global int *dev_nbor,
|
||||
__global int *nspecial, __global int *special,
|
||||
int inum, int nt, int max_nbors, int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom;
|
||||
int offset=tid & (t_per_atom-1);
|
||||
|
||||
@ -234,15 +234,15 @@ __kernel void kernel_special(__global int *dev_nbor,
|
||||
stride=inum;
|
||||
list=dev_nbor+stride+ii;
|
||||
numj=*list;
|
||||
list+=stride+mul24(ii,t_per_atom-1);
|
||||
stride=mul24(inum,t_per_atom);
|
||||
list_end=list+mul24(int(numj/t_per_atom),stride)+(numj & (t_per_atom-1));
|
||||
list+=stride+fast_mul(ii,t_per_atom-1);
|
||||
stride=fast_mul(inum,t_per_atom);
|
||||
list_end=list+fast_mul(int(numj/t_per_atom),stride)+(numj & (t_per_atom-1));
|
||||
list+=offset;
|
||||
} else {
|
||||
stride=1;
|
||||
list=host_nbor_list+(ii-inum)*max_nbors;
|
||||
numj=host_numj[ii-inum];
|
||||
list_end=list+mul24(numj,stride);
|
||||
list_end=list+fast_mul(numj,stride);
|
||||
}
|
||||
|
||||
for ( ; list<list_end; list+=stride) {
|
||||
|
||||
@ -53,7 +53,7 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
|
||||
|
||||
// Resequence the atom indices to avoid collisions during atomic ops
|
||||
int nthreads=GLOBAL_SIZE_X;
|
||||
ii=mul24(ii,PPPM_BLOCK_1D);
|
||||
ii=fast_mul(ii,PPPM_BLOCK_1D);
|
||||
ii-=(ii/nthreads)*(nthreads-1);
|
||||
|
||||
int nx,ny,nz;
|
||||
@ -130,21 +130,21 @@ __kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
|
||||
y_stop-=ny-nlocal_y+1;
|
||||
if (nz>=nlocal_z)
|
||||
z_stop-=nz-nlocal_z+1;
|
||||
int z_stride=mul24(nlocal_x,nlocal_y);
|
||||
int z_stride=fast_mul(nlocal_x,nlocal_y);
|
||||
|
||||
int loop_count=npts_x/PENCIL_SIZE+1;
|
||||
int nx=fid;
|
||||
int pt=mul24(nz,mul24(npts_y,npts_x))+mul24(ny,npts_x)+nx;
|
||||
int pt=fast_mul(nz,fast_mul(npts_y,npts_x))+fast_mul(ny,npts_x)+nx;
|
||||
for (int i=0 ; i<loop_count; i++) {
|
||||
for (int n=0; n<order; n++)
|
||||
ans[n][tid]=(grdtyp)0.0;
|
||||
if (nx<nlocal_x && nz<npts_z) {
|
||||
int z_pos=mul24(nz+z_start-order_m_1,z_stride);
|
||||
int z_pos=fast_mul(nz+z_start-order_m_1,z_stride);
|
||||
for (int m=z_start; m<z_stop; m++) {
|
||||
int y_pos=mul24(ny+y_start-order_m_1,nlocal_x);
|
||||
int y_pos=fast_mul(ny+y_start-order_m_1,nlocal_x);
|
||||
for (int l=y_start; l<y_stop; l++) {
|
||||
int pos=z_pos+y_pos+nx;
|
||||
int natoms=mul24(counts[pos],atom_stride);
|
||||
int natoms=fast_mul(counts[pos],atom_stride);
|
||||
for (int row=pos; row<natoms; row+=atom_stride) {
|
||||
grdtyp4 delta=atoms[row];
|
||||
|
||||
@ -240,13 +240,13 @@ __kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
|
||||
}
|
||||
}
|
||||
|
||||
int mz=mul24(nz,npts_yx)+nx;
|
||||
int mz=fast_mul(nz,npts_yx)+nx;
|
||||
for (int n=0; n<order; n++) {
|
||||
grdtyp rho1d_2=(grdtyp)0.0;
|
||||
for (int k=order2+n; k>=n; k-=order)
|
||||
rho1d_2=rho_coeff[k]+rho1d_2*dz;
|
||||
grdtyp z0=qs*rho1d_2;
|
||||
int my=mz+mul24(ny,npts_x);
|
||||
int my=mz+fast_mul(ny,npts_x);
|
||||
for (int m=0; m<order; m++) {
|
||||
grdtyp y0=z0*rho1d_1[m][tid];
|
||||
for (int l=0; l<order; l++) {
|
||||
|
||||
@ -105,10 +105,10 @@ ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; }
|
||||
#endif
|
||||
|
||||
#if (__CUDA_ARCH__ < 200)
|
||||
#define mul24 __mul24
|
||||
#define fast_mul __mul24
|
||||
#define MEM_THREADS 16
|
||||
#else
|
||||
#define mul24(X,Y) (X)*(Y)
|
||||
// fast_mul(X,Y): integer multiply used for index arithmetic in device code.
// On this preprocessor branch it expands to a plain multiply. The whole
// expansion is parenthesized so that it behaves like a single value (as a
// function-call form does) when it appears next to /, %, or unary operators.
#define fast_mul(X,Y) ((X)*(Y))
|
||||
#define MEM_THREADS 32
|
||||
#endif
|
||||
|
||||
@ -144,6 +144,7 @@ typedef struct _double4 double4;
|
||||
|
||||
#ifdef GENERIC_OCL
|
||||
|
||||
#define fast_mul mul24
|
||||
#define GLOBAL_ID_X get_global_id(0)
|
||||
#define THREAD_ID_X get_local_id(0)
|
||||
#define BLOCK_ID_X get_group_id(0)
|
||||
|
||||
@ -197,7 +197,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
gpu_times3(aTe1,a1,temp);
|
||||
|
||||
numtyp sigma, epsilon;
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
int mtype=fast_mul(ntypes,itype)+jtype;
|
||||
sigma = sig_eps[mtype].x;
|
||||
epsilon = sig_eps[mtype].y*factor_lj;
|
||||
|
||||
|
||||
@ -100,7 +100,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
|
||||
rhat[2] = r[2]*rnorm;
|
||||
|
||||
numtyp sigma, epsilon;
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
int mtype=fast_mul(ntypes,itype)+jtype;
|
||||
sigma = sig_eps[mtype].x;
|
||||
epsilon = sig_eps[mtype].y*factor_lj;
|
||||
|
||||
@ -335,7 +335,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
rhat[2] = r[2]*rnorm;
|
||||
|
||||
numtyp sigma, epsilon;
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
int mtype=fast_mul(ntypes,itype)+jtype;
|
||||
sigma = sig_eps[mtype].x;
|
||||
epsilon = sig_eps[mtype].y*factor_lj;
|
||||
|
||||
@ -577,7 +577,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
@ -8,6 +8,6 @@ mkdir /tmp/cpp5678/lgpu
|
||||
foreach file ( $files )
|
||||
# /bin/cp $file /tmp/cpp5678/$file:t:t
|
||||
# ------ Sed Replace
|
||||
sed -i 's/atom->dev_engv/ans->dev_engv/g' $file
|
||||
sed -i.bak 's/atom->dev_engv/ans->dev_engv/g' $file
|
||||
end
|
||||
|
||||
|
||||
Reference in New Issue
Block a user