From f0263bf14ccbcbaecba8e8d37d97e92ab09723be Mon Sep 17 00:00:00 2001
From: "W. Michael Brown"
Date: Fri, 25 Mar 2011 11:49:09 -0400
Subject: [PATCH] pppm/gpu now using all gpu kernels.

---
 lib/gpu/Makefile.fermi        |   2 +-
 lib/gpu/Makefile.lens         |   2 +-
 lib/gpu/Makefile.lincoln      |   2 +-
 lib/gpu/Makefile.linux        |   2 +-
 lib/gpu/Makefile.linux_opencl |   2 +-
 lib/gpu/Makefile.longhorn     |   2 +-
 lib/gpu/Makefile.mac          |   2 +-
 lib/gpu/Makefile.mac_opencl   |   2 +-
 lib/gpu/pair_gpu_ans.cpp      |   4 +-
 lib/gpu/pair_gpu_device.cpp   |  15 +++-
 lib/gpu/pair_gpu_device.h     |  17 ++--
 lib/gpu/pppm_gpu_kernel.cu    |  18 ++--
 lib/gpu/pppm_gpu_memory.cpp   |  26 +++---
 lib/gpu/pppm_gpu_memory.h     |   5 +-
 lib/gpu/pppm_l_gpu.cpp        |  10 +--
 src/GPU/pppm_gpu.cpp          | 159 ++--------------------------------
 src/GPU/pppm_gpu.h            |  10 +--
 17 files changed, 71 insertions(+), 209 deletions(-)

diff --git a/lib/gpu/Makefile.fermi b/lib/gpu/Makefile.fermi
index d830c8924c..98c823cf40 100644
--- a/lib/gpu/Makefile.fermi
+++ b/lib/gpu/Makefile.fermi
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
 CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.lens b/lib/gpu/Makefile.lens
index 3b6301277f..33d2b88801 100644
--- a/lib/gpu/Makefile.lens
+++ b/lib/gpu/Makefile.lens
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -openmp
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp
 CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.lincoln b/lib/gpu/Makefile.lincoln
index 97a7901811..bbaca61ef1 100644
--- a/lib/gpu/Makefile.lincoln
+++ b/lib/gpu/Makefile.lincoln
@@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT
 CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index c0001a54ab..d69a00a817 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl
index 69522298c5..3eac71cc88 100644
--- a/lib/gpu/Makefile.linux_opencl
+++ b/lib/gpu/Makefile.linux_opencl
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 OCL_LINK = -lOpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
diff --git a/lib/gpu/Makefile.longhorn b/lib/gpu/Makefile.longhorn
index ba921f0f68..cc41174332 100644
--- a/lib/gpu/Makefile.longhorn
+++ b/lib/gpu/Makefile.longhorn
@@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.mac b/lib/gpu/Makefile.mac
index f061a1a68a..5276ac10b2 100644
--- a/lib/gpu/Makefile.mac
+++ b/lib/gpu/Makefile.mac
@@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11
 CUDA_PRECISION = -D_SINGLE_SINGLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib
-CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
+CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v --use_fast_math -m32
 
 CUDR_CPP = mpic++
 CUDR_OPTS = -O2 -m32 -g
diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl
index 53d6d466e2..50ed67e9c3 100644
--- a/lib/gpu/Makefile.mac_opencl
+++ b/lib/gpu/Makefile.mac_opencl
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
+OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT
 OCL_LINK = -framework OpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
diff --git a/lib/gpu/pair_gpu_ans.cpp b/lib/gpu/pair_gpu_ans.cpp
index 155b1f6620..69980dd8ab 100644
--- a/lib/gpu/pair_gpu_ans.cpp
+++ b/lib/gpu/pair_gpu_ans.cpp
@@ -269,10 +269,8 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
 template
 double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
                                   double *virial, double &ecoul) {
-  if (_eflag==false && _vflag==false) {
-    ecoul=0.0;
+  if (_eflag==false && _vflag==false)
     return 0.0;
-  }
   if (_charge==false)
     return energy_virial(eatom,vatom,virial);
 
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 8cc7de31cb..87775291f5 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -267,8 +267,9 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
                                          UCL_Timer &time_rho,
                                          UCL_Timer &time_interp,
                                          PairGPUAns &ans,
-                                         const double max_bytes, FILE *screen) {
-  double single[5], times[5];
+                                         const double max_bytes,
+                                         const double cpu_time, FILE *screen) {
+  double single[7], times[7];
 
 //  single[0]=atom.transfer_time()+ans.transfer_time()+time_in.total_seconds()+
 //            time_out.total_seconds();
@@ -279,8 +280,10 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
   single[2]=time_map.total_seconds();
   single[3]=time_rho.total_seconds();
   single[4]=time_interp.total_seconds();
+  single[5]=ans.transfer_time()+ans.cast_time();
+  single[6]=cpu_time;
 
-  MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
+  MPI_Reduce(single,times,7,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
   double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
@@ -303,7 +306,11 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
     fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
     fprintf(screen,"Total rho: %.4f s.\n",(times[0]+times[2]+times[3])/_replica_size);
     fprintf(screen,"Total interp: %.4f s.\n",(times[1]+times[4])/_replica_size);
-    fprintf(screen,"Total: %.4f s.\n",(times[0]+times[1]+times[2]+times[3]+times[4])/_replica_size);
+    fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
+    fprintf(screen,"Total: %.4f s.\n",
+            (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
+            _replica_size);
+    fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
   }
   fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 664ce6c3be..9dbe2f50f6 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -77,7 +77,8 @@ class PairGPUDevice {
                             UCL_Timer & time_map, UCL_Timer & time_rho,
                             UCL_Timer &time_interp,
                             PairGPUAns &ans,
-                            const double max_bytes, FILE *screen);
+                            const double max_bytes, const double cpu_time,
+                            FILE *screen);
 
   /// Clear all memory on host and device associated with atom and nbor data
   void clear();
@@ -91,7 +92,7 @@ class PairGPUDevice {
 
   /// Add "answers" (force,energies,etc.) into LAMMPS structures
   inline double fix_gpu(double **f, double **tor, double *eatom,
-                       double **vatom, double *virial, double &ecoul) {
+                        double **vatom, double *virial, double &ecoul) {
     atom.data_unavail();
     if (ans_queue.empty()==false) {
       stop_host_timer();
@@ -106,10 +107,16 @@ class PairGPUDevice {
   }
 
   /// Start timer on host
-  inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
+  inline void start_host_timer()
+    { _cpu_full=MPI_Wtime(); _host_timer_started=true; }
 
   /// Stop timer on host
-  inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }
+  inline void stop_host_timer() {
+    if (_host_timer_started) {
+      _cpu_full=MPI_Wtime()-_cpu_full;
+      _host_timer_started=false;
+    }
+  }
 
   /// Return host time
   inline double host_time() { return _cpu_full; }
@@ -170,7 +177,7 @@ class PairGPUDevice {
  private:
   std::queue *> ans_queue;
   int _init_count;
-  bool _device_init;
+  bool _device_init, _host_timer_started;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
       _replica_size;
diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index eeb596ca9c..2df1291936 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -88,9 +88,9 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
-// Maximum order for stencil
-#define MAX_STENCIL 8
-// Thread block size for all kernels (Must be >=MAX_STENCIL^2)
+// Maximum order for spline
+#define MAX_SPLINE 8
+// Thread block size for all kernels (Must be >=MAX_SPLINE^2)
 #define BLOCK_1D 64
 // Number of threads per pencil for charge spread
 //#define PENCIL_SIZE MEM_THREADS
@@ -158,9 +158,9 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
                        const int npts_y, const int npts_z, const int nlocal_x,
                        const int nlocal_y, const int nlocal_z,
                        const int order_m_1, const int order, const int order2) {
-  __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
-  __local numtyp front[BLOCK_PENCILS][PENCIL_SIZE+MAX_STENCIL];
-  __local numtyp ans[MAX_STENCIL][BLOCK_1D];
+  __local numtyp rho_coeff[MAX_SPLINE*MAX_SPLINE];
+  __local numtyp front[BLOCK_PENCILS][PENCIL_SIZE+MAX_SPLINE];
+  __local numtyp ans[MAX_SPLINE][BLOCK_1D];
 
   int tid=THREAD_ID_X;
   if (tid
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
-// Maximum order for stencil
-#define MAX_STENCIL 8
-// Thread block size for all kernels (Must be >=MAX_STENCIL^2)
+// Maximum order for spline
+#define MAX_SPLINE 8
+// Thread block size for all kernels (Must be >=MAX_SPLINE^2)
 #define BLOCK_1D 64
 // Number of threads per pencil for charge spread
 //#define PENCIL_SIZE MEM_THREADS
@@ -48,7 +48,7 @@ PPPMGPUMemoryT::PPPMGPUMemory() : _allocated(false), _compiled(false),
 
 template
 PPPMGPUMemoryT::~PPPMGPUMemory() {
-  clear();
+  clear(0.0);
   delete ans;
 }
 
@@ -64,8 +64,6 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
                               const int nxhi_out, const int nyhi_out,
                               const int nzhi_out, double **rho_coeff,
                               numtyp **vd_brick, bool &success) {
-  clear();
-
   _max_bytes=10;
   screen=_screen;
 
@@ -78,7 +76,7 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
 
   _block_size=BLOCK_1D;
   assert(BLOCK_PENCILS*PENCIL_SIZE==BLOCK_1D);
-  assert(MAX_STENCIL*MAX_STENCIL<=BLOCK_1D);
+  assert(MAX_SPLINE*MAX_SPLINE<=BLOCK_1D);
   if (static_cast(_block_size)>ucl_device->group_size())
     _block_size=ucl_device->group_size();
   compile_kernels(*ucl_device);
@@ -161,13 +159,11 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   d_error_flag.zero();
   _max_bytes+=1;
 
-//success=success && (force_temp.alloc(nall*4*2,*ucl_device)==UCL_SUCCESS);
-
   return h_brick.begin();
 }
 
 template
-void PPPMGPUMemoryT::clear() {
+void PPPMGPUMemoryT::clear(const double cpu_time) {
   if (!_allocated)
     return;
   _allocated=false;
@@ -182,7 +178,7 @@ void PPPMGPUMemoryT::clear() {
 
   acc_timers();
   device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
-                              *ans,_max_bytes+_max_an_bytes,screen);
+                              *ans,_max_bytes+_max_an_bytes,cpu_time,screen);
 
   if (_compiled) {
     k_particle_map.clear();
@@ -210,6 +206,8 @@ int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
                            double *host_q, double *boxlo,
                            const double delxinv, const double delyinv,
                            const double delzinv) {
+  device->stop_host_timer();
+
   acc_timers();
   if (nlocal==0) {
     zero_timers();
@@ -243,7 +241,7 @@ int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
   int _max_atoms=10;
   int ainum=this->ans->inum();
 
-  // Boxlo adjusted to be upper left brick and shift for even stencil order
+  // Boxlo adjusted to be upper left brick and shift for even spline order
   double shift=0.0;
   if (_order % 2)
     shift=0.5;
@@ -267,7 +265,7 @@ int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
   time_map.stop();
   time_rho.start();
 
-  BX=block_size();
   GX=static_cast(ceil(static_cast(_npts_y*_npts_z)/BLOCK_PENCILS));
   k_make_rho.set_size(GX,BX);
@@ -320,7 +317,8 @@ void PPPMGPUMemoryT::interp(const numtyp qqrd2e_scale) {
                 &_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
   time_interp.stop();
 
-//ucl_copy(force_temp,ans->dev_ans,ans->dev_ans.numel(),false);
+  ans->copy_answers(false,false,false,false);
+  device->add_ans_object(ans);
 }
 
diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h
index a68a896ae7..6a9b86eef1 100644
--- a/lib/gpu/pppm_gpu_memory.h
+++ b/lib/gpu/pppm_gpu_memory.h
@@ -56,7 +56,7 @@ class PPPMGPUMemory {
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
-  void clear();
+  void clear(const double cpu_time);
 
   /// Returns memory usage on device per atom
   int bytes_per_atom() const;
@@ -133,7 +133,7 @@ class PPPMGPUMemory {
   // Number of local grid points in brick
   int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride;
 
-  // -------------------------- STENCIL DATA -------------------------
+  // -------------------------- SPLINE DATA -------------------------
   UCL_D_Vec d_rho_coeff;
   int _order, _nlower, _nupper, _order_m_1, _order2;
   int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out;
@@ -141,7 +141,6 @@ class PPPMGPUMemory {
 
   // ------------------------ FORCE/ENERGY DATA -----------------------
   PairGPUAns *ans;
-//UCL_H_Vec force_temp;
 
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Program *pppm_program;
diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp
index 550348c815..fc3a7ab4ca 100644
--- a/lib/gpu/pppm_l_gpu.cpp
+++ b/lib/gpu/pppm_l_gpu.cpp
@@ -34,7 +34,7 @@ PRECISION * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
                           const int nxhi_out, const int nyhi_out,
                           const int nzhi_out, double **rho_coeff,
                           PRECISION **vd_brick, bool &success) {
-  PPPMF.clear();
+  PPPMF.clear(0.0);
   int first_gpu=PPPMF.device->first_device();
   int last_gpu=PPPMF.device->last_device();
   int world_me=PPPMF.device->world_me();
@@ -91,8 +91,8 @@ PRECISION * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
   return host_brick;
 }
 
-void pppm_gpu_clear() {
-  PPPMF.clear();
+void pppm_gpu_clear(const double cpu_time) {
+  PPPMF.clear(cpu_time);
 }
 
 int pppm_gpu_spread(const int ago, const int nlocal, const int nall,
@@ -111,7 +111,3 @@ double pppm_gpu_bytes() {
   return PPPMF.host_memory_usage();
 }
 
-/*
-PRECISION * pppm_gpu_force() { return PPPMF.force_temp.begin(); }
-*/
-
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index 0db889c64e..a5835777e0 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -43,7 +43,7 @@ numtyp* pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
                       const int nyhi_out, const int nzhi_out,
                       double **rho_coeff, numtyp **_vd_brick,
                       bool &success);
-void pppm_gpu_clear();
+void pppm_gpu_clear(const double poisson_time);
 int pppm_gpu_spread(const int ago, const int nlocal, const int nall,
                     double **host_x, int *host_type, bool &success,
                     double *host_q, double *boxlo, const double delxinv,
@@ -98,7 +98,6 @@ PPPMGPU::PPPMGPU(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg)
 
   remap = NULL;
 
   nmax = 0;
-  part2grid = NULL;
 }
 
 /* ----------------------------------------------------------------------
@@ -109,17 +108,7 @@ PPPMGPU::~PPPMGPU()
 {
   delete [] factors;
   deallocate();
-  memory->destroy_2d_int_array(part2grid);
-  pppm_gpu_clear();
-double total1, total2, total3;
-int rank,size;
-MPI_Comm_rank(MPI_COMM_WORLD,&rank);
-MPI_Comm_size(MPI_COMM_WORLD,&size);
-MPI_Allreduce(&time1,&total1,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
-MPI_Allreduce(&time2,&total2,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
-MPI_Allreduce(&time3,&total3,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
-if (rank==0 && screen)
-fprintf(screen,"DEBUG_TIMES: %f %f %f\n",total1/size,total2/size,total3/size);
+  pppm_gpu_clear(poisson_time);
 }
 
 /* ----------------------------------------------------------------------
@@ -510,6 +499,7 @@ void PPPMGPU::init()
 
   if (order>8)
     error->all("Cannot use order greater than 8 with pppm/gpu.");
+  pppm_gpu_clear(poisson_time);
 
   bool success;
   numtyp *data, *h_brick;
@@ -526,7 +516,8 @@ void PPPMGPU::init()
   if (!success)
     error->one("Insufficient memory on accelerator (or no fix gpu).\n");
 
-time1=0; time2=0; time3=0;
+
+  poisson_time=0;
 }
 
 /* ----------------------------------------------------------------------
@@ -707,21 +698,11 @@ void PPPMGPU::compute(int eflag, int vflag)
     domain->x2lamda(atom->nlocal);
   }
 
-  // extend size of per-atom arrays if necessary
-
-  if (atom->nlocal > nmax) {
-    memory->destroy_2d_int_array(part2grid);
-    nmax = atom->nmax;
-    part2grid = memory->create_2d_int_array(nmax,3,"pppm:part2grid");
-  }
   energy = 0.0;
   if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
 
-  // find grid points for all my particles
-  // map my particle charge onto my local 3d density grid
+  double t3=MPI_Wtime();
 
-  particle_map();
-
-double t3=MPI_Wtime();
   // all procs communicate density values from their ghost cells
   // to fully sum contribution in their 3d bricks
   // remap from 3d decomposition to FFT decomposition
@@ -738,16 +719,13 @@ double t3=MPI_Wtime();
   // surrounding their 3d bricks
 
   fillbrick();
 
-time3+=MPI_Wtime()-t3;
 
-  numtyp qqrd2e_scale=qqrd2e*scale;
-  pppm_gpu_interp(qqrd2e_scale);
+  poisson_time+=MPI_Wtime()-t3;
 
   // calculate the force on my particles
 
-double t2=MPI_Wtime();
-  fieldforce();
-time2+=MPI_Wtime()-t2;
+  numtyp qqrd2e_scale=qqrd2e*scale;
+  pppm_gpu_interp(qqrd2e_scale);
 
   // sum energy across procs and add in volume-dependent term
@@ -1504,46 +1482,6 @@ void PPPMGPU::fillbrick()
   }
 }
 
-/* ----------------------------------------------------------------------
-   find center grid pt for each of my particles
-   check that full stencil for the particle will fit in my 3d brick
-   store central grid pt indices in part2grid array
-------------------------------------------------------------------------- */
-
-void PPPMGPU::particle_map()
-{
-  int nx,ny,nz;
-
-  double **x = atom->x;
-  int nlocal = atom->nlocal;
-
-  int flag = 0;
-  for (int i = 0; i < nlocal; i++) {
-
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // current particle coord can be outside global and local box
-    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
-
-    nx = static_cast ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET;
-    ny = static_cast ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET;
-    nz = static_cast ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET;
-
-    part2grid[i][0] = nx;
-    part2grid[i][1] = ny;
-    part2grid[i][2] = nz;
-
-    // check that entire stencil around nx,ny,nz will fit in my 3d brick
-
-    if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
-        ny+nlower < nylo_out || ny+nupper > nyhi_out ||
-        nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag++;
-  }
-
-  int flag_all;
-  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
-  if (flag_all) error->all("Out of range atoms - cannot compute PPPMGPU");
-}
-
 /* ----------------------------------------------------------------------
    FFT-based Poisson solver
 ------------------------------------------------------------------------- */
@@ -1665,85 +1603,6 @@ void PPPMGPU::poisson(int eflag, int vflag)
   }
 }
 
-/* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
-------------------------------------------------------------------------- */
-
-void PPPMGPU::fieldforce()
-{
-  int i,l,m,n,nx,ny,nz,mx,my,mz;
-  double dx,dy,dz,x0,y0,z0;
-  double ek[3];
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of E-field on particle
-
-  double *q = atom->q;
-  double **x = atom->x;
-  double **f = atom->f;
-  int nlocal = atom->nlocal;
-
-
-
-//numtyp *gpu_force=pppm_gpu_force();
-//double max_error=0;
-
-
-  for (i = 0; i < nlocal; i++) {
-    nx = part2grid[i][0];
-    ny = part2grid[i][1];
-    nz = part2grid[i][2];
-    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-    compute_rho1d(dx,dy,dz);
-
-    ek[0] = ek[1] = ek[2] = 0.0;
-    for (n = nlower; n <= nupper; n++) {
-      mz = n+nz;
-      z0 = rho1d[2][n];
-      for (m = nlower; m <= nupper; m++) {
-        my = m+ny;
-        y0 = z0*rho1d[1][m];
-        for (l = nlower; l <= nupper; l++) {
-          mx = l+nx;
-          x0 = y0*rho1d[0][l];
-          ek[0] -= x0*vd_brick[mz][my][mx*4];;
-          ek[1] -= x0*vd_brick[mz][my][mx*4+1];;
-          ek[2] -= x0*vd_brick[mz][my][mx*4+2];;
-        }
-      }
-    }
-
-//for (int jj=0; jj<3; jj++) {
-// double f0= qqrd2e*scale * q[i]*ek[jj];
-// double error=0.0;
-// if (f0>1e-8)
-// error = fabs(((gpu_force[i*4+jj])-(f0))/(f0));
-// if (error>0.05)
-//// std::cout << "* ";
-// std::cout << i << " : CPU GPU: " << error << " " << f0 << " "
-//           << gpu_force[i*4+jj] << std::endl;
-// if (error>max_error)
-// max_error=error;
-//}
-
-    // convert E-field to force
-
-    f[i][0] += qqrd2e*scale * q[i]*ek[0];
-    f[i][1] += qqrd2e*scale * q[i]*ek[1];
-    f[i][2] += qqrd2e*scale * q[i]*ek[2];
-  }
-
-//std::cout << "Maximum relative error: " << max_error*100.0 << "%\n";
-
-
-}
-
 /* ----------------------------------------------------------------------
    map nprocs to NX by NY grid as PX by PY procs - return optimal px,py
 ------------------------------------------------------------------------- */
diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h
index 34bb735512..c413712e28 100644
--- a/src/GPU/pppm_gpu.h
+++ b/src/GPU/pppm_gpu.h
@@ -72,7 +72,6 @@ class PPPMGPU : public KSpace {
 
   class FFT3d *fft1,*fft2;
   class Remap *remap;
-  int **part2grid;             // storage for particle -> grid mapping
   int nmax;
 
   int triclinic;               // domain settings, orthog or triclinic
@@ -90,19 +89,18 @@ class PPPMGPU : public KSpace {
   double diffpr(double, double, double, double, double **);
   void compute_gf_denom();
   double gf_denom(double, double, double);
-  virtual void particle_map();
   void brick2fft();
   void fillbrick();
   void poisson(int, int);
-  virtual void fieldforce();
   void procs2grid2d(int,int,int,int *, int*);
   void compute_rho1d(double, double, double);
   void compute_rho_coeff();
   void slabcorr(int);
-double time1,time2,time3;
-  numtyp ***create_3d_offset(int, int, int, int, int, int, const char *,
-                             numtyp *, int);
+  double poisson_time;
+
+  numtyp ***create_3d_offset(int, int, int, int, int, int, const char *,
+                             numtyp *, int);
   void destroy_3d_offset(numtyp ***, int, int);
 };
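
Not part of the patch above: a minimal, self-contained sketch of the guarded host-timer pattern that pair_gpu_device.h adopts in this commit. The HostTimer class below is hypothetical (it is not LAMMPS code); it only assumes MPI_Wtime() and shows why the added _host_timer_started flag lets the k-space path call stop_host_timer() safely even when no start is pending.

// Standalone illustration (hypothetical class, not PairGPUDevice itself).
// A stop() without a pending start() is ignored, mirroring the
// _host_timer_started guard added in pair_gpu_device.h.
#include <mpi.h>

class HostTimer {
 public:
  HostTimer() : _t0(0.0), _elapsed(0.0), _started(false) {}
  void start() { _t0 = MPI_Wtime(); _started = true; }
  void stop() {
    if (_started) {                 // extra stop() calls do nothing
      _elapsed = MPI_Wtime() - _t0; // record elapsed wall time
      _started = false;
    }
  }
  double seconds() const { return _elapsed; }
 private:
  double _t0, _elapsed;
  bool _started;
};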