From e7fc403e3cdc929779fc565c4a2f97d59b1acaf5 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown"
Date: Tue, 12 Apr 2011 20:04:38 -0400
Subject: [PATCH] Updating CHARMM test kernel.

---
 lib/gpu/charge_gpu_memory2.cpp |   8 ++-
 lib/gpu/charge_gpu_memory2.h   |   3 +-
 lib/gpu/crml_gpu_kernel2.cu    | 120 +++++++++++++++++++++++++--------
 lib/gpu/crml_gpu_memory2.cpp   |  14 ++--
 lib/gpu/pair_gpu_device.h      |   6 ++
 lib/gpu/pair_gpu_precision.h   |   2 -
 6 files changed, 113 insertions(+), 40 deletions(-)

diff --git a/lib/gpu/charge_gpu_memory2.cpp b/lib/gpu/charge_gpu_memory2.cpp
index 7ba035006a..7683d1b14f 100644
--- a/lib/gpu/charge_gpu_memory2.cpp
+++ b/lib/gpu/charge_gpu_memory2.cpp
@@ -25,7 +25,7 @@ ChargeGPUMemory2T::ChargeGPUMemory2() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
   ans=new PairGPUAns<numtyp,acctyp>();
   nbor=new PairGPUNbor();
-  nbor->packing(true);
+  _threads_per_atom=1;
 }
 
 template <class numtyp, class acctyp>
@@ -58,6 +58,12 @@ int ChargeGPUMemory2T::init_atomic(const int nlocal, const int nall,
   if (host_nlocal>0)
     _gpu_host=1;
 
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
   int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
                            maxspecial,_gpu_host,max_nbors,cell_size,false);
   if (success!=0)
diff --git a/lib/gpu/charge_gpu_memory2.h b/lib/gpu/charge_gpu_memory2.h
index e2accf4a2d..070008faaa 100644
--- a/lib/gpu/charge_gpu_memory2.h
+++ b/lib/gpu/charge_gpu_memory2.h
@@ -187,9 +187,10 @@ class ChargeGPUMemory2 {
  protected:
   bool _compiled;
-  int _block_size;
+  int _block_size, _threads_per_atom;
   double _max_bytes, _max_an_bytes;
   double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
 
   void compile_kernels(UCL_Device &dev, const char *pair_string);
 
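The two hunks above tie the new _threads_per_atom count to the neighbor storage the kernels will traverse: host-built lists (gpu_nbor==false) are repacked so the threads sharing one atom can stride through its neighbors, while device-built lists are used as-is. A minimal stand-alone sketch of that selection logic, assuming a hypothetical MockNbor in place of the library's PairGPUNbor and UCL_D_Vec<int> types:

    // Hypothetical mock of the decision made in init_atomic(); only the
    // branch logic mirrors the patch, the types are placeholders.
    struct MockNbor {
      int *dev_nbor;    // raw list: one row of neighbor indices per atom
      int *dev_packed;  // packed list: neighbors interleaved for strided access
      bool packing;     // request packed storage at list-build time
    };

    static int *select_nbor_data(MockNbor &nbor, int threads_per_atom,
                                 bool gpu_nbor) {
      if (threads_per_atom > 1 && gpu_nbor == false) {
        nbor.packing = true;
        return nbor.dev_packed;
      }
      return nbor.dev_nbor;
    }

Caching the chosen pointer in _nbor_data lets loop() pass a single argument regardless of which layout was selected; the kernels then test dev_nbor==dev_packed to recover the matching stride.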
diff --git a/lib/gpu/crml_gpu_kernel2.cu b/lib/gpu/crml_gpu_kernel2.cu
index a01eb64c7b..eda49f7411 100644
--- a/lib/gpu/crml_gpu_kernel2.cu
+++ b/lib/gpu/crml_gpu_kernel2.cu
@@ -19,6 +19,7 @@
 #define CRML_GPU_KERNEL
 
 #define MAX_BIO_SHARED_TYPES 128
+#define BLOCK_PAIR 64
 
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
@@ -98,18 +99,16 @@ __inline float fetch_q(const int& i, const float *q)
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
 
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          const int lj_types,
-                          __global numtyp *sp_lj_in, __global int *dev_nbor,
+                          const int lj_types, __global numtyp *sp_lj_in,
+                          __global int *dev_nbor, __global int *dev_packed,
                           __global acctyp4 *ans, __global acctyp *engv,
                           const int eflag, const int vflag, const int inum,
                           const int nall, const int nbor_pitch,
                           __global numtyp *q_, const numtyp cut_coulsq,
                           const numtyp qqrd2e, const numtyp g_ewald,
                           const numtyp denom_lj, const numtyp cut_bothsq,
-                          const numtyp cut_ljsq, const numtyp cut_lj_innersq) {
-
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          const numtyp cut_ljsq, const numtyp cut_lj_innersq,
+                          const int t_per_atom) {
   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -120,29 +119,50 @@
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  if (ii<inum) {
+  if (t_per_atom>1) {
+    __local acctyp s_energy[BLOCK_PAIR];
+    __local acctyp s_e_coul[BLOCK_PAIR];
+    __local acctyp4 s_f[BLOCK_PAIR];
+    __local acctyp s_virial[6][BLOCK_PAIR];
+
+    s_f[tid].x=f.x;
+    s_f[tid].y=f.y;
+    s_f[tid].z=f.z;
+    s_energy[tid]=energy;
+    s_e_coul[tid]=e_coul;
+    for (int v=0; v<6; v++)
+      s_virial[v][tid]=virial[v];
 
-    // Store answers
-    __global acctyp *ap1=engv+ii;
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        s_f[tid].x += s_f[tid+s].x;
+        s_f[tid].y += s_f[tid+s].y;
+        s_f[tid].z += s_f[tid+s].z;
+        s_energy[tid] += s_energy[tid+s];
+        s_e_coul[tid] += s_e_coul[tid+s];
+        s_virial[0][tid] += s_virial[0][tid+s];
+        s_virial[1][tid] += s_virial[1][tid+s];
+        s_virial[2][tid] += s_virial[2][tid+s];
+        s_virial[3][tid] += s_virial[3][tid+s];
+        s_virial[4][tid] += s_virial[4][tid+s];
+        s_virial[5][tid] += s_virial[5][tid+s];
+      }
+    }
+
+    f.x=s_f[tid].x;
+    f.y=s_f[tid].y;
+    f.z=s_f[tid].z;
+    energy=s_energy[tid];
+    e_coul=s_e_coul[tid];
+    for (int v=0; v<6; v++)
+      virial[v]=s_virial[v][tid];
+  }
+
+  // Store answers
+  __global acctyp *ap1=engv+ii;
+  if (ii<inum && offset==0) {
     if (eflag>0) {
       *ap1=energy;
       ap1+=inum;
@@ -235,12 +297,12 @@
       }
     }
     ans[ii]=f;
-  } // if ii
+  }
 }
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
-                              __global int *dev_list, __global acctyp4 *ans,
+                              __global int *dev_packed, __global acctyp4 *ans,
                               __global acctyp *engv, const int eflag,
                               const int vflag, const int inum, const int nall,
                               const int nbor_pitch, __global numtyp *q_,
@@ -261,7 +323,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
   if (tid<8)
     sp_lj[tid]=sp_lj_in[tid];
   ljd[tid]=ljd_in[tid];
-  ljd[tid+64]=ljd_in[tid+64];
+  ljd[tid+BLOCK_PAIR]=ljd_in[tid+BLOCK_PAIR];
 
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
@@ -285,12 +347,12 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
     int n_stride;
     __global int *list_end;
-    if (dev_nbor==dev_list) {
+    if (dev_nbor==dev_packed) {
       list_end=nbor+mul24(numj,nbor_pitch);
       nbor+=mul24(offset,nbor_pitch);
       n_stride=mul24(t_per_atom,nbor_pitch);
     } else {
-      nbor=dev_list+*nbor;
+      nbor=dev_packed+*nbor;
       list_end=nbor+numj;
       n_stride=t_per_atom;
       nbor+=offset;
     }
@@ -387,10 +449,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
 
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp s_energy[64];
-    __local acctyp s_e_coul[64];
-    __local acctyp4 s_f[64];
-    __local acctyp s_virial[6][64];
+    __local acctyp s_energy[BLOCK_PAIR];
+    __local acctyp s_e_coul[BLOCK_PAIR];
+    __local acctyp4 s_f[BLOCK_PAIR];
+    __local acctyp s_virial[6][BLOCK_PAIR];
 
     s_f[tid].x=f.x;
     s_f[tid].y=f.y;
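The reduction block added to kernel_pair (and generalized from the hard-coded 64-element arrays in kernel_pair_fast) combines the partial forces, energies, and virials accumulated by the t_per_atom threads that share one atom. Below is a self-contained CUDA sketch of the same tree reduction, trimmed to forces only; the kernel name and the partial/out buffers are hypothetical stand-ins, and it adds the explicit barriers that the 2011 code omitted because it relied on warp-synchronous execution (t_per_atom a power of two no larger than the warp):

    #include <cuda_runtime.h>

    #define BLOCK_PAIR 64   // same block size the patch hard-codes

    // Each atom is handled by t_per_atom consecutive threads ("lanes");
    // every lane holds one partial force, and lane 0 writes the per-atom sum.
    __global__ void reduce_forces(const float4 *partial, float4 *out,
                                  int inum, int t_per_atom) {
      __shared__ float4 s_f[BLOCK_PAIR];
      int tid = threadIdx.x;
      int ii = blockIdx.x * (blockDim.x / t_per_atom) + tid / t_per_atom;
      int offset = tid % t_per_atom;   // lane index within this atom's group

      float4 f = make_float4(0.f, 0.f, 0.f, 0.f);
      if (ii < inum)
        f = partial[ii * t_per_atom + offset];   // this lane's partial force
      s_f[tid] = f;
      __syncthreads();

      // Halve the number of active lanes each step, exactly as in the patch.
      for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1) {
        if (offset < s) {
          s_f[tid].x += s_f[tid + s].x;
          s_f[tid].y += s_f[tid + s].y;
          s_f[tid].z += s_f[tid + s].z;
        }
        __syncthreads();
      }

      if (ii < inum && offset == 0)
        out[ii] = s_f[tid];
    }

Summing in __local/__shared__ memory keeps the per-neighbor force loop free of atomics; the cost is a log2(t_per_atom)-step pass over BLOCK_PAIR shared slots per block, which is why only lane 0 (offset==0) goes on to store the answers.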
diff --git a/lib/gpu/crml_gpu_memory2.cpp b/lib/gpu/crml_gpu_memory2.cpp
index 764fa5f48c..dd817dfc46 100644
--- a/lib/gpu/crml_gpu_memory2.cpp
+++ b/lib/gpu/crml_gpu_memory2.cpp
@@ -127,8 +127,6 @@ double CRML_GPU_Memory2T::host_memory_usage() const {
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) {
-  const int threads_per_atom=16;
-
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
@@ -143,7 +141,7 @@ void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) {
     vflag=0;
 
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/threads_per_atom)));
+                               (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int anall=this->atom->nall();
@@ -153,22 +151,24 @@ void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
                           &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                          &this->nbor->dev_packed.begin(),
+                          &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e,
                           &_g_ewald, &_denom_lj, &_cut_bothsq,
-                          &_cut_ljsq, &_cut_lj_innersq,&threads_per_atom);
+                          &_cut_ljsq, &_cut_lj_innersq,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &_lj_types,
                      &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->ans->dev_ans.begin(),
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
-                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
+                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index b11bfbc356..7c37afeab9 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -42,6 +42,12 @@
 #define BLOCK_CELL_ID 128
 // Default block size for neighbor list builds
 #define BLOCK_NBOR_BUILD 64
+// Maximum number of atom types that can be stored in shared memory
+// - Must be sqrt of BLOCK_PAIR
+#define MAX_SHARED_TYPES 8
+// Maximum number of atom types that can be stored in shared memory for bio
+// - Must be BLOCK_PAIR*2
+#define MAX_BIO_SHARED_TYPES 128
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4> class PPPMGPUMemory;
diff --git a/lib/gpu/pair_gpu_precision.h b/lib/gpu/pair_gpu_precision.h
index a5f57c1f95..902975be0b 100644
--- a/lib/gpu/pair_gpu_precision.h
+++ b/lib/gpu/pair_gpu_precision.h
@@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define acctyp4 _lgpu_float4
 #endif
 
-#define MAX_SHARED_TYPES 8
-#define MAX_BIO_SHARED_TYPES 128
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 
 #endif
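With t_per_atom lanes assigned to every atom, a block of BX threads now covers only BX/t_per_atom atoms, which is why loop() divides the grid size by that factor. A hedged host-side sketch of the sizing rule (the function name is illustrative, not part of the library):

    #include <cmath>

    // A block of block_size threads covers block_size/t_per_atom atoms,
    // so the launch grid must grow by the same factor.
    static int pair_grid_size(int inum, int block_size, int t_per_atom) {
      int atoms_per_block = block_size / t_per_atom;   // e.g. 64/16 = 4
      return static_cast<int>(ceil(static_cast<double>(inum)/atoms_per_block));
    }

For 10000 local atoms with BLOCK_PAIR=64 and the previously hard-coded threads_per_atom=16, this launches 2500 blocks where a one-thread-per-atom scheme needed 157; the extra blocks are what keep all cores busy when per-atom neighbor lists are short.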