Replacing mul24 with fast_mul in device code.

2011-10-23 18:04:42 -04:00
parent b0ee11cadd
commit cab77720de
21 changed files with 46 additions and 45 deletions
--- a/lib/gpu/cg_cmm.cu
+++ b/lib/gpu/cg_cmm.cu
@ -147,7 +147,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {
--- a/lib/gpu/cg_cmm_long.cu
+++ b/lib/gpu/cg_cmm_long.cu
@ -184,7 +184,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;
--- a/lib/gpu/cg_cmm_msm.cu
+++ b/lib/gpu/cg_cmm_msm.cu
@ -216,7 +216,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;
--- a/lib/gpu/ellipsoid_extra.h
+++ b/lib/gpu/ellipsoid_extra.h
@ -25,7 +25,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 #define atom_info(t_per_atom, ii, tid, offset)                               \
  tid=THREAD_ID_X;                                                           \
  offset=tid & (t_per_atom-1);                                               \
-  ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
+  ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;

 #define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset,           \
                    i, numj, stride, list_end, nbor)                         \
@ -34,9 +34,9 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
    nbor+=nbor_stride;                                                       \
    numj=*nbor;                                                              \
    nbor+=nbor_stride;                                                       \
-    list_end=nbor+mul24(nbor_stride,numj);                                   \
-    nbor+=mul24(offset,nbor_stride);                                         \
-    stride=mul24(t_per_atom,nbor_stride);
+    list_end=nbor+fast_mul(nbor_stride,numj);                                   \
+    nbor+=fast_mul(offset,nbor_stride);                                         \
+    stride=fast_mul(t_per_atom,nbor_stride);

 #define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
                      eflag, vflag, ans, engv)                              \
--- a/lib/gpu/ellipsoid_nbor.cu
+++ b/lib/gpu/ellipsoid_nbor.cu
@ -37,12 +37,12 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+    __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
  
    numtyp4 ix=x_[i];
    int iw=ix.w;
-    int itype=mul24(iw,ntypes);
+    int itype=fast_mul(iw,ntypes);
    int newj=0;  
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
      int j=*nbor;
@ -90,7 +90,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
    cutsq[ii]=cut_form[ii].x;
    form[ii]=cut_form[ii].y;
  }
-  ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
+  ii+=fast_mul((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
  __syncthreads();

  if (ii<inum) {
@ -99,12 +99,12 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+    __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
  
    numtyp4 ix=x_[i];
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    int newj=0;  
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
--- a/lib/gpu/gayberne.cu
+++ b/lib/gpu/gayberne.cu
@ -196,7 +196,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
              kappa[1]*=r;
              kappa[2]*=r;
          
-              int mtype=mul24(ntypes,itype)+jtype;
+              int mtype=fast_mul(ntypes,itype)+jtype;
              numtyp sigma = sig_eps[mtype].x;
              numtyp epsilon = sig_eps[mtype].y;
              numtyp varrho = sigma/(h12+gum[0]*sigma);
--- a/lib/gpu/gayberne_lj.cu
+++ b/lib/gpu/gayberne_lj.cu
@ -131,7 +131,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
              kappa[1]*=r;
              kappa[2]*=r;
          
-              int mtype=mul24(ntypes,itype)+jtype;
+              int mtype=fast_mul(ntypes,itype)+jtype;
              numtyp sigma = sig_eps[mtype].x;
              numtyp epsilon = sig_eps[mtype].y;
              numtyp varrho = sigma/(h12+gum[0]*sigma);
@ -357,7 +357,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,

    numtyp4 ix=x_[i];
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {
--- a/lib/gpu/lj.cu
+++ b/lib/gpu/lj.cu
@ -139,7 +139,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {
--- a/lib/gpu/lj96.cu
+++ b/lib/gpu/lj96.cu
@ -140,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {
--- a/lib/gpu/lj_class2_long.cu
+++ b/lib/gpu/lj_class2_long.cu
@ -178,7 +178,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;
--- a/lib/gpu/lj_coul.cu
+++ b/lib/gpu/lj_coul.cu
@ -171,7 +171,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;
--- a/lib/gpu/lj_coul_long.cu
+++ b/lib/gpu/lj_coul_long.cu
@ -176,7 +176,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;
--- a/lib/gpu/lj_expand.cu
+++ b/lib/gpu/lj_expand.cu
@ -142,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {
--- a/lib/gpu/morse.cu
+++ b/lib/gpu/morse.cu
@ -140,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
  
    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {
--- a/lib/gpu/neighbor_cpu.cu
+++ b/lib/gpu/neighbor_cpu.cu
@ -21,7 +21,7 @@ __kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
                            const int inum, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int offset=tid & (t_per_atom-1);
-  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
+  int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;

  if (ii<inum) {
    __global int *nbor=dev_nbor+ii+inum;
@ -30,8 +30,8 @@ __kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
    __global int *list=dev_ij+*nbor;
    __global int *list_end=list+numj;
    list+=offset;
-    nbor+=mul24(ii,t_per_atom-1)+offset;
-    int stride=mul24(t_per_atom,inum);
+    nbor+=fast_mul(ii,t_per_atom-1)+offset;
+    int stride=fast_mul(t_per_atom,inum);
      
    for ( ; list<list_end; list++) {
      *nbor=*list;
--- a/lib/gpu/neighbor_gpu.cu
+++ b/lib/gpu/neighbor_gpu.cu
@ -217,7 +217,7 @@ __kernel void kernel_special(__global int *dev_nbor,
                             __global int *nspecial, __global int *special,
                             int inum, int nt, int max_nbors, int t_per_atom) {
  int tid=THREAD_ID_X;
-  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid & (t_per_atom-1);

@ -234,15 +234,15 @@ __kernel void kernel_special(__global int *dev_nbor,
      stride=inum;
      list=dev_nbor+stride+ii;
      numj=*list;
-      list+=stride+mul24(ii,t_per_atom-1);
-      stride=mul24(inum,t_per_atom);
-      list_end=list+mul24(int(numj/t_per_atom),stride)+(numj & (t_per_atom-1));
+      list+=stride+fast_mul(ii,t_per_atom-1);
+      stride=fast_mul(inum,t_per_atom);
+      list_end=list+fast_mul(int(numj/t_per_atom),stride)+(numj & (t_per_atom-1));
      list+=offset;
    } else {
      stride=1;
      list=host_nbor_list+(ii-inum)*max_nbors;
      numj=host_numj[ii-inum];
-      list_end=list+mul24(numj,stride);
+      list_end=list+fast_mul(numj,stride);
    }
  
    for ( ; list<list_end; list+=stride) {
--- a/lib/gpu/pppm.cu
+++ b/lib/gpu/pppm.cu
@ -53,7 +53,7 @@ __kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,

  // Resequence the atom indices to avoid collisions during atomic ops
  int nthreads=GLOBAL_SIZE_X;
-  ii=mul24(ii,PPPM_BLOCK_1D);
+  ii=fast_mul(ii,PPPM_BLOCK_1D);
  ii-=(ii/nthreads)*(nthreads-1);

  int nx,ny,nz;
@ -130,21 +130,21 @@ __kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
    y_stop-=ny-nlocal_y+1;
  if (nz>=nlocal_z)
    z_stop-=nz-nlocal_z+1;
-  int z_stride=mul24(nlocal_x,nlocal_y);
+  int z_stride=fast_mul(nlocal_x,nlocal_y);

  int loop_count=npts_x/PENCIL_SIZE+1;
  int nx=fid;
-  int pt=mul24(nz,mul24(npts_y,npts_x))+mul24(ny,npts_x)+nx;
+  int pt=fast_mul(nz,fast_mul(npts_y,npts_x))+fast_mul(ny,npts_x)+nx;
  for (int i=0 ; i<loop_count; i++) {
    for (int n=0; n<order; n++)
      ans[n][tid]=(grdtyp)0.0;
    if (nx<nlocal_x && nz<npts_z) {
-      int z_pos=mul24(nz+z_start-order_m_1,z_stride);
+      int z_pos=fast_mul(nz+z_start-order_m_1,z_stride);
      for (int m=z_start; m<z_stop; m++) {
-        int y_pos=mul24(ny+y_start-order_m_1,nlocal_x);
+        int y_pos=fast_mul(ny+y_start-order_m_1,nlocal_x);
        for (int l=y_start; l<y_stop; l++) {
          int pos=z_pos+y_pos+nx;
-          int natoms=mul24(counts[pos],atom_stride);
+          int natoms=fast_mul(counts[pos],atom_stride);
          for (int row=pos; row<natoms; row+=atom_stride) {
            grdtyp4 delta=atoms[row];
      
@ -240,13 +240,13 @@ __kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
        }
      }
        
-      int mz=mul24(nz,npts_yx)+nx;
+      int mz=fast_mul(nz,npts_yx)+nx;
      for (int n=0; n<order; n++) {
        grdtyp rho1d_2=(grdtyp)0.0;
        for (int k=order2+n; k>=n; k-=order)
          rho1d_2=rho_coeff[k]+rho1d_2*dz;
        grdtyp z0=qs*rho1d_2;
-        int my=mz+mul24(ny,npts_x);
+        int my=mz+fast_mul(ny,npts_x);
        for (int m=0; m<order; m++) {
          grdtyp y0=z0*rho1d_1[m][tid];
  	      for (int l=0; l<order; l++) {
--- a/lib/gpu/preprocessor.h
+++ b/lib/gpu/preprocessor.h
@ -105,10 +105,10 @@ ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; }
 #endif

 #if (__CUDA_ARCH__ < 200)
-#define mul24 __mul24
+#define fast_mul __mul24
 #define MEM_THREADS 16
 #else
-#define mul24(X,Y) (X)*(Y)
+#define fast_mul(X,Y) ((X)*(Y)) /* outer parens: expands as one operand, like __mul24() */
 #define MEM_THREADS 32
 #endif

@ -144,6 +144,7 @@ typedef struct _double4 double4;

 #ifdef GENERIC_OCL

+#define fast_mul mul24
 #define GLOBAL_ID_X get_global_id(0)
 #define THREAD_ID_X get_local_id(0)
 #define BLOCK_ID_X get_group_id(0)
--- a/lib/gpu/re_squared.cu
+++ b/lib/gpu/re_squared.cu
@ -197,7 +197,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
      gpu_times3(aTe1,a1,temp);

      numtyp sigma, epsilon;
-      int mtype=mul24(ntypes,itype)+jtype;
+      int mtype=fast_mul(ntypes,itype)+jtype;
      sigma = sig_eps[mtype].x;
      epsilon = sig_eps[mtype].y*factor_lj;

--- a/lib/gpu/re_squared_lj.cu
+++ b/lib/gpu/re_squared_lj.cu
@ -100,7 +100,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
      rhat[2] = r[2]*rnorm;

      numtyp sigma, epsilon;
-      int mtype=mul24(ntypes,itype)+jtype;
+      int mtype=fast_mul(ntypes,itype)+jtype;
      sigma = sig_eps[mtype].x;
      epsilon = sig_eps[mtype].y*factor_lj;

@ -335,7 +335,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
      rhat[2] = r[2]*rnorm;

      numtyp sigma, epsilon;
-      int mtype=mul24(ntypes,itype)+jtype;
+      int mtype=fast_mul(ntypes,itype)+jtype;
      sigma = sig_eps[mtype].x;
      epsilon = sig_eps[mtype].y*factor_lj;

@ -577,7 +577,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,

    numtyp4 ix=x_[i];
    int iw=ix.w;
-    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {
--- a/lib/gpu/replace_code.sh
+++ b/lib/gpu/replace_code.sh
@ -8,6 +8,6 @@ mkdir /tmp/cpp5678/lgpu
 foreach file ( $files )
 #	/bin/cp $file /tmp/cpp5678/$file:t:t
 	# ------ Sed Replace
-	sed -i 's/atom->dev_engv/ans->dev_engv/g' $file
+	sed -i.bak 's/atom->dev_engv/ans->dev_engv/g' $file
 end