From 04f1dc492eecb53619dd7f46a42eca301352d618 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown"
Date: Sat, 26 Mar 2011 12:58:47 -0400
Subject: [PATCH] Changing host neighbor list from gpu build to be full.
 Changing ordering of host neighbor list from gpu builds. Updating lj_cut
 with changes. Breaking everything else.

---
 lib/gpu/atomic_gpu_memory.cpp    |  49 ++++++++++++
 lib/gpu/atomic_gpu_memory.h      |   8 ++
 lib/gpu/lj_cut_gpu.cpp           |   7 +-
 lib/gpu/pair_gpu_build_kernel.cu |  48 ++++++------
 lib/gpu/pair_gpu_nbor.cpp        |  52 ++++++++++---
 lib/gpu/pair_gpu_nbor.h          |   5 +-
 src/GPU/pair_lj_cut_gpu.cpp      | 126 ++++++-------------------------
 src/GPU/pair_lj_cut_gpu.h        |   3 +-
 8 files changed, 155 insertions(+), 143 deletions(-)

diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp
index a964c9ba01..900b8be9bc 100644
--- a/lib/gpu/atomic_gpu_memory.cpp
+++ b/lib/gpu/atomic_gpu_memory.cpp
@@ -243,6 +243,55 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
   return nbor->host_nbor.begin();
 }
 
+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary and then compute forces, virials, energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
+                                 const int nall, double **host_x, int *host_type,
+                                 double *boxlo, double *boxhi, int *tag,
+                                 int **nspecial, int **special, const bool eflag,
+                                 const bool vflag, const bool eatom,
+                                 const bool vatom, int &host_start,
+                                 int **ilist, int **jnum,
+                                 const double cpu_time, bool &success) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return NULL;
+  }
+
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
+  host_start=inum;
+
+  // Build neighbor list on GPU if necessary
+  if (ago==0) {
+    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                    boxlo, boxhi, tag, nspecial, special, success);
+    if (!success)
+      return NULL;
+    hd_balancer.start_timer();
+  } else {
+    atom->cast_x_data(host_x,host_type);
+    hd_balancer.start_timer();
+    atom->add_x_data(host_x,host_type);
+  }
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+
+  return nbor->host_jlist.begin()-host_start;
+}
+
 template <class numtyp, class acctyp>
 double AtomicGPUMemoryT::host_memory_usage_atomic() const {
   return device->atom.host_memory_usage()+nbor->host_memory_usage()+
diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h
index 11cda990af..88c4a7ed81 100644
--- a/lib/gpu/atomic_gpu_memory.h
+++ b/lib/gpu/atomic_gpu_memory.h
@@ -129,6 +129,14 @@ class AtomicGPUMemory {
                 const bool eatom, const bool vatom, int &host_start,
                 const double cpu_time, bool &success);
 
+  /// Pair loop with device neighboring
+  int ** compute(const int ago, const int inum_full,
+                 const int nall, double **host_x, int *host_type, double *boxlo,
+                 double *boxhi, int *tag, int **nspecial,
+                 int **special, const bool eflag, const bool vflag,
+                 const bool eatom, const bool vatom, int &host_start,
+                 int **ilist, int **numj, const double cpu_time, bool &success);
+
   // -------------------------- DEVICE DATA -------------------------
 
   /// Device Properties and Atom and Neighbor storage
diff --git a/lib/gpu/lj_cut_gpu.cpp b/lib/gpu/lj_cut_gpu.cpp
index 8772eee81a..4082a94f64 100644
--- a/lib/gpu/lj_cut_gpu.cpp
+++ b/lib/gpu/lj_cut_gpu.cpp
@@ -96,15 +96,16 @@ void ljl_gpu_clear() {
   LJLMF.clear();
 }
 
-int * ljl_gpu_compute_n(const int ago, const int inum_full,
+int ** ljl_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *boxlo, double *boxhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success) {
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success) {
   return LJLMF.compute(ago, inum_full, nall, host_x, host_type, boxlo,
                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                       vatom, host_start, cpu_time, success);
+                       vatom, host_start, ilist, jnum, cpu_time, success);
 }
 
 void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu
index a08a26800a..5f5d6717de 100644
--- a/lib/gpu/pair_gpu_build_kernel.cu
+++ b/lib/gpu/pair_gpu_build_kernel.cu
@@ -139,7 +139,8 @@
 __kernel void calc_neigh_list_cell(numtyp4 *pos,
                                    int *cell_particle_id,
                                    int *cell_counts,
                                    int *nbor_list,
-                                   int *host_nbor_list,
+                                   int *host_nbor_list,
+                                   int *host_numj,
                                    int neigh_bin_size, numtyp cell_size,
                                    int ncellx, int ncelly, int ncellz,
@@ -183,9 +184,9 @@
     neigh_list=neigh_counts+stride;
     nbor_list[pid_i]=pid_i;
   } else {
-    stride=nt-inum;
-    neigh_counts=host_nbor_list+pid_i-inum;
-    neigh_list=neigh_counts+stride;
+    stride=1;
+    neigh_counts=host_numj+pid_i-inum;
+    neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
   }
 
   // loop through neighbors
@@ -220,20 +221,18 @@
       for (int j = 0; j < end_idx; j++) {
         int pid_j = cell_list_sh[j]; // gather from shared memory
-        if (pid_i<nt && pid_j>pid_i) {
-          diff.x = atom_i.x - pos_sh[j].x;
-          diff.y = atom_i.y - pos_sh[j].y;
-          diff.z = atom_i.z - pos_sh[j].z;
+        diff.x = atom_i.x - pos_sh[j].x;
+        diff.y = atom_i.y - pos_sh[j].y;
+        diff.z = atom_i.z - pos_sh[j].z;
 
-          r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
-          if (r2 < cell_size*cell_size && r2 > 1e-5) {
-            if (cnt < neigh_bin_size) {
-              *neigh_list = pid_j;
-              neigh_list+=stride;
-            }
-            cnt++;
-          }
-        }
+        r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
+        if (r2 < cell_size*cell_size && r2 > 1e-5) {
+          if (cnt < neigh_bin_size) {
+            *neigh_list = pid_j;
+            neigh_list+=stride;
+          }
+          cnt++;
+        }
       }
     }
     __syncthreads();
@@ -247,9 +246,10 @@
 }
 
 __kernel void kernel_special(__global int *dev_nbor,
-                             __global int *host_nbor_list, __global int *tag,
+                             __global int *host_nbor_list,
+                             __global int *host_numj, __global int *tag,
                              __global int *nspecial, __global int *special,
-                             int inum, int nt, int nall) {
+                             int inum, int nt, int nall, int max_nbors) {
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X;
@@ -261,15 +261,17 @@
   int n2=nspecial[ii*3+1];
   int n3=nspecial[ii*3+2];
 
+  int numj;
   if (ii < inum) {
     stride=inum;
     list=dev_nbor+stride+ii;
+    numj=*list;
+    list+=stride;
   } else {
-    stride=nt-inum;
-    list=host_nbor_list+ii-inum;
+    stride=1;
+    list=host_nbor_list+(ii-inum)*max_nbors;
+    numj=host_numj[ii];
   }
-  int numj=*list;
-  list+=stride;
   list_end=list+numj*stride;
 
   for ( ; list<list_end; list+=stride) {
[...]
diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp
[...]
   if (_max_host>0) {
     host_nbor.clear();
     dev_host_nbor.clear();
-    success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev,
+    dev_host_numj.clear();
+    host_ilist.clear();
+    host_jlist.clear();
+
+    success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
                         UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-    success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host,
+    success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
                         *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
-    _c_bytes+=dev_host_nbor.row_bytes();
+    success=success && (dev_host_numj.alloc(_max_host,*dev,
+                        UCL_WRITE_ONLY)==UCL_SUCCESS);
+    success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
+    for (int i=0; i<nt; i++)
+      host_ilist[i]=i;
[...]
   if (_maxspecial>0) {
     dev_nspecial.clear();
@@ -140,6 +157,9 @@ void PairGPUNbor::clear() {
   dev_host_nbor.clear();
   dev_packed.clear();
   host_nbor.clear();
+  dev_host_numj.clear();
+  host_ilist.clear();
+  host_jlist.clear();
   dev_nspecial.clear();
   dev_special.clear();
   dev_special_t.clear();
@@ -152,7 +172,8 @@ double PairGPUNbor::host_memory_usage() const {
   if (_gpu_nbor) {
     if (_gpu_host)
-      return host_nbor.row_bytes()*host_nbor.rows();
+      return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
+             host_jlist.row_bytes();
     else
       return 0;
   } else
@@ -297,7 +318,8 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
   _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
                             &cell_counts.begin(), &dev_nbor.begin(),
-                            &dev_host_nbor.begin(),&_max_nbors,&cell_size_cast,
+                            &dev_host_nbor.begin(), &dev_host_numj.begin(),
+                            &_max_nbors,&cell_size_cast,
                             &ncellx, &ncelly, &ncellz, &inum, &nt, &nall);
 
   /* Get the maximum number of nbors and realloc if necessary */
@@ -307,7 +329,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   if (nt>inum) {
     UCL_H_Vec<int> host_offset;
     host_offset.view_offset(inum,host_acc,nt-inum);
-    ucl_copy(host_offset,dev_host_nbor,nt-inum,false);
+    ucl_copy(host_offset,dev_host_numj,nt-inum,false);
   }
   mn=host_acc[0];
   for (int i=1; i<nt; i++)
[...]
   if (_max_host>0) {
     host_nbor.clear();
     dev_host_nbor.clear();
-    success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor,
+    success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
                         UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-    success=success && (dev_host_nbor.alloc((mn+1)*_max_host,
+    success=success && (dev_host_nbor.alloc(mn*_max_host,
                         dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
+    int *ptr=host_nbor.begin();
+    for (int i=0; i<_max_host; i++) {
+      host_jlist[i]=ptr;
+      ptr+=mn;
+    }
     _gpu_bytes+=dev_host_nbor.row_bytes();
   }
   if (_alloc_packed) {
@@ -348,14 +375,15 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
   _shared->k_special.set_size(GX2,cell_block);
   _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
-                         &atom.dev_tag.begin(), &dev_nspecial.begin(),
-                         &dev_special.begin(), &inum, &nt, &nall);
+                         &dev_host_numj.begin(), &atom.dev_tag.begin(),
+                         &dev_nspecial.begin(), &dev_special.begin(),
+                         &inum, &nt, &nall, &_max_nbors);
   }
 
   time_kernel.stop();
   time_nbor.start();
   if (_gpu_host)
-    ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);
+    ucl_copy(host_nbor,dev_host_nbor,false);
   time_nbor.stop();
 }
diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h
index ff2dcc5ad6..090e81f76e 100644
--- a/lib/gpu/pair_gpu_nbor.h
+++ b/lib/gpu/pair_gpu_nbor.h
@@ -171,8 +171,11 @@ class PairGPUNbor {
   UCL_H_Vec<int> host_nbor;
   /// Device storage for neighbor list matrix that will be copied to host
   /** - 1st row is numj
-    * - Remaining rows are nbors **/
+    * - Remaining rows are by atom, columns are nbors **/
   UCL_D_Vec<int> dev_host_nbor;
+  UCL_D_Vec<int> dev_host_numj;
+  UCL_H_Vec<int> host_ilist;
+  UCL_H_Vec<int *> host_jlist;
   /// Device storage for special neighbor counts
   UCL_D_Vec<int> dev_nspecial;
   /// Device storage for special neighbors
diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp
index 289d6605e8..edafd0a132 100644
--- a/src/GPU/pair_lj_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_gpu.cpp
@@ -46,12 +46,13 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen);
 void ljl_gpu_clear();
-int * ljl_gpu_compute_n(const int ago, const int inum,
-                        const int nall, double **host_x, int *host_type,
-                        double *boxlo, double *boxhi, int *tag, int **nspecial,
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success);
+int ** ljl_gpu_compute_n(const int ago, const int inum,
+                         const int nall, double **host_x, int *host_type,
+                         double *boxlo, double *boxhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum,
+                         const double cpu_time, bool &success);
 void ljl_gpu_compute(const int ago, const int inum, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
@@ -89,30 +90,30 @@ void PairLJCutGPU::compute(int eflag, int vflag)
   int inum, host_start;
 
   bool success = true;
-
+  int *ilist, *numneigh, **firstneigh;
   if (gpu_mode == GPU_NEIGH) {
     inum = atom->nlocal;
-    gpulist = ljl_gpu_compute_n(neighbor->ago, inum, nall,
-                                atom->x, atom->type, domain->sublo,
-                                domain->subhi, atom->tag, atom->nspecial,
-                                atom->special, eflag, vflag, eflag_atom,
-                                vflag_atom, host_start, cpu_time, success);
+    firstneigh = ljl_gpu_compute_n(neighbor->ago, inum, nall,
+                                   atom->x, atom->type, domain->sublo,
+                                   domain->subhi, atom->tag, atom->nspecial,
+                                   atom->special, eflag, vflag, eflag_atom,
+                                   vflag_atom, host_start,
+                                   &ilist, &numneigh, cpu_time, success);
   } else {
     inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
     ljl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
-                    list->ilist, list->numneigh, list->firstneigh, eflag,
-                    vflag, eflag_atom, vflag_atom, host_start, cpu_time,
-                    success);
+                    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+                    vflag_atom, host_start, cpu_time, success);
   }
   if (!success)
     error->one("Out of memory on GPGPU");
 
   if (host_start<inum) {
[...]
   double **x = atom->x;
   double **f = atom->f;
@@ -185,11 +187,6 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) {
   int nall = nlocal + atom->nghost;
   double *special_lj = force->special_lj;
 
-  inum = list->inum;
-  ilist = list->ilist;
-  numneigh = list->numneigh;
-  firstneigh = list->firstneigh;
-
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
@@ -237,78 +234,3 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) {
       }
     }
   }
 }
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
-  int i,j,itype,jtype;
-  int nlocal = atom->nlocal;
-  int nall = nlocal + atom->nghost;
-  int stride = nlocal-start;
-  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
-  double rsq,r2inv,r6inv,forcelj,factor_lj;
-  double *special_lj = force->special_lj;
-
-  double **x = atom->x;
-  double **f = atom->f;
-  int *type = atom->type;
-
-  // loop over neighbors of my atoms
-
-  for (i = start; i < nlocal; i++) {
-    xtmp = x[i][0];
-    ytmp = x[i][1];
-    ztmp = x[i][2];
-    itype = type[i];
-    int *nbor = nbors + i - start;
-    int jnum = *nbor;
-    nbor += stride;
-    int *nbor_end = nbor + stride * jnum;
-
-    for (; nbor
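
The hunks above replace the old column-interleaved host neighbor buffer
(stride nt-inum, with the counts folded in as a leading row) with one
contiguous row of width _max_nbors per host atom, a separate dev_host_numj
count array, and a host_jlist array of per-atom row pointers. A minimal
standalone C++ sketch of that layout follows; the struct and method names
are hypothetical illustrations, not code from the tree.

#include <vector>

// Dense host neighbor storage after this patch: one fixed-width row per
// host atom, counts kept separately, plus per-atom row pointers.
struct HostNborLayout {
  int max_nbors;                  // row width (neigh_bin_size in the kernel)
  std::vector<int>  host_nbor;    // max_nbors * max_host entries
  std::vector<int>  host_numj;    // actual neighbor count per host atom
  std::vector<int*> host_jlist;   // host_jlist[i] points at row i

  void set_row_pointers(int max_host) {
    // Mirrors the pointer setup in PairGPUNbor::build_nbor_list above:
    // row i is contiguous (stride 1), unlike the old interleaved layout.
    int *ptr = host_nbor.data();
    for (int i = 0; i < max_host; i++) {
      host_jlist[i] = ptr;
      ptr += max_nbors;
    }
  }
};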
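On the caller side, a pair style consumes the values returned by the new
ljl_gpu_compute_n() roughly as follows: ilist is an identity map (alloc()
fills host_ilist[i]=i), numneigh comes from dev_host_numj via host_acc, and
the returned firstneigh pointer is pre-offset by -host_start so it can be
indexed by the atom index directly. The function below is a hypothetical
sketch under those assumptions, not code from the tree.

// Traverse the host-side portion [host_start, nlocal) of the split list.
void host_pair_loop(int host_start, int nlocal, int *ilist,
                    int *numneigh, int **firstneigh) {
  for (int ii = host_start; ii < nlocal; ii++) {
    int i = ilist[ii];            // identity mapping from host_ilist
    int jnum = numneigh[ii];      // count copied back from dev_host_numj
    int *jlist = firstneigh[ii];  // contiguous row owned by host_jlist
    for (int jj = 0; jj < jnum; jj++) {
      int j = jlist[jj];
      // ... accumulate the i-j interaction; with the half-to-full list
      // change, each pair appears in both atoms' rows, so energies and
      // virials need a 0.5 factor (or a j < i skip) ...
      (void)i; (void)j;           // placeholder for the pair kernel
    }
  }
}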