From e8da16ff23acaa7625643a26bbc90302eb4fe565 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown" <brownw@ornl.gov>
Date: Thu, 17 Feb 2011 13:37:02 -0500
Subject: [PATCH] Finishing laptop optimization of PPPM charge spread kernels.

---
 lib/gpu/geryon/ucl_nv_kernel.h |  2 +
 lib/gpu/pair_gpu_device.cpp    |  1 +
 lib/gpu/pppm_gpu_kernel.cu     | 69 ++++++++++++++++++++++------------
 lib/gpu/pppm_gpu_memory.cpp    | 51 +++++++++++++++++--------
 lib/gpu/pppm_gpu_memory.h      |  1 +
 src/GPU/pppm_gpu.cpp           | 11 +++---
 6 files changed, 90 insertions(+), 45 deletions(-)

diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h
index 47f6f235b6..0040447549 100644
--- a/lib/gpu/geryon/ucl_nv_kernel.h
+++ b/lib/gpu/geryon/ucl_nv_kernel.h
@@ -27,6 +27,8 @@
 
 #define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x)
 #define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y)
+#define GLOBAL_SIZE_X __mul24(gridDim.x,blockDim.x);
+#define GLOBAL_SIZE_Y __mul24(gridDim.y,blockDim.y);
 #define THREAD_ID_X threadIdx.x
 #define THREAD_ID_Y threadIdx.y
 #define BLOCK_ID_X blockIdx.x
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index b94068629c..550c1207b5 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -293,6 +293,7 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
         fprintf(screen,"Kernel (map):    %.4f s.\n",times[2]/_replica_size);
         fprintf(screen,"Kernel (rho):    %.4f s.\n",times[3]/_replica_size);
 //        fprintf(screen,"Force calc:      %.4f s.\n",times[3]/_replica_size);
+        fprintf(screen,"Total:           %.4f s.\n",(times[0]+times[1]+times[2]+times[3])/_replica_size);
       }
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
 
diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index 51c7fdf525..37ffb9f8d8 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -106,6 +106,7 @@ __device__ inline void atomicFloatAdd(float *address, float val)
 #define THREAD_ID_X get_local_id(0)
 #define BLOCK_ID_X get_group_id(0)
 #define BLOCK_SIZE_X get_local_size(0)
+#define GLOBAL_SIZE_X get_global_size(0)
 #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
 #define __inline inline
 
@@ -121,9 +122,16 @@ __kernel void particle_map(__global numtyp4 *x_, const int nlocal,
                            const numtyp delyinv, const numtyp delzinv,
                            const int nlocal_x, const int nlocal_y,
                            const int nlocal_z, const int atom_stride,
-                           const int max_atoms, __global int *error) {
+                           const int max_atoms, __global int *error,
+                           const int skip) {
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X;
+
+  // Resequence the atom indices to avoid collisions during atomic ops
+  int nthreads=GLOBAL_SIZE_X;
+  ii=__mul24(ii,skip);
+  ii-=int(ii/nthreads)*(nthreads-1);
+
   int nx,ny,nz;
   numtyp tx,ty,tz;
 
@@ -156,32 +164,40 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
                        __global numtyp *brick, __global numtyp *_rho_coeff,
                        const int atom_stride, const int npts_x,
                        const int npts_yx, const int nlocal_x,
-                       const int nlocal_y,
-                       const int nlocal_z, const numtyp b_lo_x,
+                       const int nlocal_y, const int nlocal_z,
+                       const int x_threads, const numtyp b_lo_x,
                        const numtyp b_lo_y, const numtyp b_lo_z,
                        const numtyp delxinv, const numtyp delyinv,
                        const numtyp delzinv, const int order, const int order2,
                        const numtyp delvolinv) {
   __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
+
   int nx=THREAD_ID_X;
   int ny=THREAD_ID_Y;
   if (nx<order && ny<order) {
-    int ri=nx*order+ny;
+    int ri=__mul24(nx,order)+ny;
     rho_coeff[ri]=_rho_coeff[ri];
   }
   __syncthreads();
   
-  nx+=BLOCK_ID_X*BLOCK_SIZE_X;
-  ny+=BLOCK_ID_Y*BLOCK_SIZE_Y;
-  int nz=0;
+  nx+=__mul24(BLOCK_ID_X,BLOCK_SIZE_X);
+  ny+=__mul24(BLOCK_ID_Y,BLOCK_SIZE_Y);
+
+  // Get the z-block we are working on
+  int z_block=nx/x_threads;
+  nx=nx%x_threads;
+  int nz=__mul24(z_block,8);
+  int z_stop=nz+8;
+  if (z_stop>nlocal_z)
+    z_stop=nlocal_z;
   
   if (nx<nlocal_x && ny<nlocal_y) {
-    int z_stride=nlocal_x*nlocal_y;
-    int z_pos=nz*z_stride+ny*nlocal_x+nx;
-    for ( ; nz<nlocal_z; nz++) {
+    int z_stride=__mul24(nlocal_x,nlocal_y);
+    int z_pos=__mul24(nz,z_stride)+__mul24(ny,nlocal_x)+nx;
+    for ( ; nz<z_stop; nz++) {
       int natoms=counts[z_pos];
       for (int row=0; row<natoms; row++) {
-        int atom=atoms[atom_stride*row+z_pos];
+        int atom=atoms[__mul24(atom_stride,row)+z_pos];
         numtyp4 p=fetch_pos(atom,x_);
         numtyp z0=delvolinv*fetch_q(atom,q_);
         
@@ -199,13 +215,13 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
           }
         }
         
-        int mz=nz*npts_yx+nx;
+        int mz=__mul24(nz,npts_yx)+nx;
         for (int n=0; n<order; n++) {
           numtyp rho1d_2=(numtyp)0.0;
           for (int k=order2+n; k>=n; k-=order)
             rho1d_2=rho_coeff[k]+rho1d_2*dz;
           numtyp y0=z0*rho1d_2;
-          int my=mz+ny*npts_x;
+          int my=mz+__mul24(ny,npts_x);
           for (int m=0; m<order; m++) {
 	          numtyp x0=y0*rho1d[1][m];
 	          for (int l=0; l<order; l++) {
@@ -259,8 +275,8 @@ __kernel void make_rho2(__global numtyp4 *x_, __global numtyp *q_,
       x_stop-=nx-nlocal_x+1;
     if (ny>=nlocal_y)
       y_stop-=ny-nlocal_y+1;
-    z_stride=npts_yx*BLOCK_1D;
-    z_local_stride=nlocal_x*nlocal_y*BLOCK_1D;
+    z_stride=__mul24(npts_yx,BLOCK_1D);
+    z_local_stride=__mul24(__mul24(nlocal_x,nlocal_y),BLOCK_1D);
   }
   
   if (tx<order) 
@@ -271,19 +287,19 @@ __kernel void make_rho2(__global numtyp4 *x_, __global numtyp *q_,
   numtyp ans[MAX_STENCIL];
   int loop_count=npts_z/BLOCK_1D+1;
   int nz=tx;
-  int pt=nz*npts_yx+ny*npts_x+nx;
-  int z_local=nz*nlocal_x*nlocal_y;
+  int pt=__mul24(nz,npts_yx)+__mul24(ny,npts_x)+nx;
+  int z_local=__mul24(__mul24(nz,nlocal_x),nlocal_y);
   for (int i=0 ; i<loop_count; i++) {
     for (int n=0; n<order; n++)
       ans[n]=(numtyp)0.0;
     if (nz<nlocal_z) {
       for (int m=y_start; m<y_stop; m++) {
         int y_pos=ny+m-order_m_1;
-        int y_local=y_pos*nlocal_x;
+        int y_local=__mul24(y_pos,nlocal_x);
         for (int l=x_start; l<x_stop; l++) {
           int x_pos=nx+l-order_m_1;
           int pos=z_local+y_local+x_pos;
-          int natoms=counts[pos]*atom_stride;
+          int natoms=__mul24(counts[pos],atom_stride);
           for (int row=pos; row<natoms; row+=atom_stride) {
             int atom=atoms[row];
             numtyp4 p=fetch_pos(atom,x_);
@@ -343,7 +359,8 @@ __kernel void make_rho3(__global numtyp4 *x_, __global numtyp *q_,
                         const numtyp b_lo_z, const numtyp delxinv,
                         const numtyp delyinv, const numtyp delzinv,
                         const int order, const int order2,
-                        const numtyp delvolinv, __global int *error) {
+                        const numtyp delvolinv, __global int *error,
+                        const int skip) {
   __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
   int ii=THREAD_ID_X;
   if (ii<order2+order)
@@ -351,9 +368,11 @@ __kernel void make_rho3(__global numtyp4 *x_, __global numtyp *q_,
   __syncthreads();
   
   ii+=BLOCK_ID_X*BLOCK_SIZE_X;
-
-//  ii=8*ii - ii/4000*31999;
-
+  
+  // Resequence the atom indices to avoid collisions during atomic ops
+  int nthreads=GLOBAL_SIZE_X;
+  ii=__mul24(ii,skip);
+  ii-=int(ii/nthreads)*(nthreads-1);
 
   int nx,ny,nz;
   numtyp tx,ty,tz;
@@ -387,13 +406,13 @@ __kernel void make_rho3(__global numtyp4 *x_, __global numtyp *q_,
         }
       }
         
-      int mz=nz*npts_yx+nx;
+      int mz=__mul24(nz,npts_yx)+nx;
       for (int n=0; n<order; n++) {
         numtyp rho1d_2=(numtyp)0.0;
         for (int k=order2+n; k>=n; k-=order)
           rho1d_2=rho_coeff[k]+rho1d_2*dz;
         numtyp y0=z0*rho1d_2;
-        int my=mz+ny*npts_x;
+        int my=mz+__mul24(ny,npts_x);
         for (int m=0; m<order; m++) {
           numtyp x0=y0*rho1d[1][m];
 	        for (int l=0; l<order; l++) {
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 8980b5ac29..785e3735cb 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -25,6 +25,7 @@
 #define BLOCK_1D 64
 #define BLOCK_X 8
 #define BLOCK_Y 8
+#define MAX_STENCIL 8
 #define PPPMGPUMemoryT PPPMGPUMemory<numtyp, acctyp>
 
 extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
@@ -146,11 +147,19 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   d_error_flag.zero();
   _max_bytes+=1;
   
+  // Used to resequence atom indices to reduce probability of collisions
+  //   during atomic ops
+  _resequence_skip=ucl_device->cores();
+  if (_resequence_skip%_block_size!=0)
+    _resequence_skip=int(_resequence_skip/_block_size+1)*_block_size;
+  assert(_block_size%_block_x_size==0);
+  
 std::cout << "LO: " << _nxlo_out << " " << _nylo_out << " " << _nzlo_out << " " << _nlower << std::endl;
 std::cout << "HI: " << _nxhi_out << " " << _nyhi_out << " " << _nzhi_out << " " << _nupper << std::endl;
 std::cout << "pts: " << _npts_x << " " << _npts_y << " " << _npts_z << std::endl;
 std::cout << "local: " << _nlocal_x << " " << _nlocal_y << " " << _nlocal_z << std::endl;
 
+
   return h_brick.begin();
 }
 
@@ -185,7 +194,7 @@ void PPPMGPUMemoryT::clear() {
 
   device->clear();
 }
-/*
+
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
@@ -221,8 +230,11 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
 
   // Compute the block size and grid size to keep all cores busy
   int BX=this->block_size();
-  
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
+  // Resequence atom indices to avoid collisions during atomic ops
+  int skip=_resequence_skip;
+  if (skip>GX*BX/8)
+    skip=_block_x_size;
 
   int _max_atoms=10;
   int ainum=this->ans->inum();
@@ -248,7 +260,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
                      &d_brick_atoms.begin(), &f_brick_x, &f_brick_y, 
                      &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x,
                      &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, 
-                     &d_error_flag.begin());
+                     &d_error_flag.begin(),&skip);
   time_map.stop();
 
   if (_order % 2)
@@ -264,14 +276,17 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   int BY=block_y_size();
   GX=static_cast<int>(ceil(static_cast<double>(_nlocal_x)/BX));
   int GY=static_cast<int>(ceil(static_cast<double>(_nlocal_y)/BY));
+  int x_threads=GX*BX;
+  int GZ=static_cast<int>(ceil(static_cast<double>(_nlocal_z)/8));
+  GX*=GZ;
   d_brick.zero();
   k_make_rho.set_size(GX,GY,BX,BY);
   k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(),
                  &d_brick_counts.begin(), &d_brick_atoms.begin(),
                  &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x,
-                 &_npts_yx, &_nlocal_x, &_nlocal_y, &_nlocal_z, &f_brick_x,
-                 &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv,
-                 &_order, &_order2, &f_delvolinv);
+                 &_npts_yx, &_nlocal_x, &_nlocal_y, &_nlocal_z, &x_threads,
+                 &f_brick_x, &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv,
+                 &f_delzinv, &_order, &_order2, &f_delvolinv);
   time_rho.stop();
 
   time_out.start();
@@ -291,7 +306,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   
   return h_error_flag[0];
 }
-*/
+
 /*
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, virials,..
@@ -328,8 +343,11 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
 
   // Compute the block size and grid size to keep all cores busy
   int BX=this->block_size();
-  
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
+  // Resequence atom indices to avoid collisions during atomic ops
+  int skip=_resequence_skip;
+  if (skip>GX*BX/8)
+    skip=_block_x_size;
 
   int _max_atoms=10;
   int ainum=this->ans->inum();
@@ -355,7 +373,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
                      &d_brick_atoms.begin(), &f_brick_x, &f_brick_y, 
                      &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x,
                      &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
-                     &d_error_flag.begin());
+                     &d_error_flag.begin(),&skip);
   time_map.stop();
 
   if (_order % 2)
@@ -398,7 +416,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   return h_error_flag[0];
 }
 */
-
+/*
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
@@ -434,8 +452,11 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
 
   // Compute the block size and grid size to keep all cores busy
   int BX=this->block_size();
-  
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
+  // Resequence atom indices to avoid collisions during atomic ops
+  int skip=_resequence_skip;
+  if (skip>GX*BX/8)
+    skip=_block_x_size;
 
   int ainum=this->ans->inum();
   
@@ -458,13 +479,13 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
 
   time_rho.start();
   d_brick.zero();
-
   k_make_rho.set_size(GX,BX);
   k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
                  &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x,
                  &_npts_yx, &_nlocal_x, &_nlocal_y, &_nlocal_z, &f_brick_x,
                  &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv,
-                 &_order, &_order2, &f_delvolinv, &d_error_flag.begin());
+                 &_order, &_order2, &f_delvolinv, &d_error_flag.begin(),
+                 &skip);
   time_rho.stop();
 
   time_out.start();
@@ -474,7 +495,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   
   return h_error_flag[0];
 }
-
+*/
 
 template <class numtyp, class acctyp>
 double PPPMGPUMemoryT::host_memory_usage() const {
@@ -492,7 +513,7 @@ void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) {
   pppm_program=new UCL_Program(dev);
   pppm_program->load_string(pppm_gpu_kernel,flags.c_str());
   k_particle_map.set_function(*pppm_program,"particle_map");
-  k_make_rho.set_function(*pppm_program,"make_rho3");
+  k_make_rho.set_function(*pppm_program,"make_rho");
   pos_tex.get_texture(*pppm_program,"pos_tex");
   q_tex.get_texture(*pppm_program,"q_tex");
 
diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h
index d9052472fc..8e0452ff6d 100644
--- a/lib/gpu/pppm_gpu_memory.h
+++ b/lib/gpu/pppm_gpu_memory.h
@@ -152,6 +152,7 @@ class PPPMGPUMemory {
  protected:
   bool _allocated, _compiled;
   int _block_size, _block_x_size, _block_y_size, _max_brick_atoms;
+  int _resequence_skip;
   double  _max_bytes, _max_an_bytes;
 
   void compile_kernels(UCL_Device &dev);
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index 638753bd5b..33c92e9ed3 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -495,6 +495,9 @@ void PPPMGPU::init()
   compute_gf_denom();
   compute_rho_coeff();
 
+  if (order>8)
+    error->all("Cannot use order greater than 8 with pppm/gpu.");
+
   bool success;
   host_brick = pppm_gpu_init(atom->nlocal, atom->nlocal+atom->nghost, screen,
                              order, nxlo_out, nylo_out, nzlo_out, nxhi_out,
@@ -689,19 +692,15 @@ double t1=MPI_Wtime();
     nmax = atom->nmax;
     part2grid = memory->create_2d_int_array(nmax,3,"pppm:part2grid");
   }
-time1+=MPI_Wtime()-t1;
   energy = 0.0;
   if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
 
   // find grid points for all my particles
   // map my particle charge onto my local 3d density grid
 
-double t2=MPI_Wtime();
   particle_map();
-time2+=MPI_Wtime()-t2;
-double t3=MPI_Wtime();
   make_rho();
-time3+=MPI_Wtime()-t3;
+time1+=MPI_Wtime()-t1;
 
 double max_error=0;
 int _npts_x=nxhi_out-nxlo_out+1;
@@ -753,7 +752,9 @@ std::cout << "Maximum relative error: " << max_error*100.0 << "%\n";
 
   // calculate the force on my particles
 
+double t2=MPI_Wtime();
   fieldforce();
+time2+=MPI_Wtime()-t2;
 
   // sum energy across procs and add in volume-dependent term