Enabled fphi_uind in hippo/gpu, really need to refactor hippo and amoeba in the GPU lib to remove kernel duplicates

2022-09-16 14:47:16 -05:00
parent 880f20c285
commit 62ecf98cda
11 changed files with 626 additions and 76 deletions
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@ -68,31 +68,7 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
 # host code compilation
-$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
+$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
 	$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
 	$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
 	$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
 	$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
 	$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
 	$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
 	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
 	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
 	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
 #ifdef CUDPP_OPT
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -64,7 +64,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
                            cell_size,gpu_split,_screen,amoeba,
                            "k_amoeba_multipole", "k_amoeba_udirect2b",
                            "k_amoeba_umutual2b", "k_amoeba_polar",
-                            "k_amoeba_short_nbor", "k_amoeba_special15");
+                            "k_amoeba_fphi_uind", "k_amoeba_short_nbor",
                            "k_amoeba_special15");
  if (success!=0)
    return success;
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -1626,7 +1626,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
   fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
 ------------------------------------------------------------------------- */
-__kernel void k_fphi_uind(const __global numtyp4 *restrict thetai1,
+__kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
                          const __global numtyp4 *restrict thetai2,
                          const __global numtyp4 *restrict thetai3,
                          const __global int *restrict igrid,
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -65,6 +65,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
                             const char *k_name_udirect2b,
                             const char *k_name_umutual2b,
                             const char *k_name_polar,
                             const char *k_name_fphi_uind,
                             const char *k_name_short_nbor,
                             const char* k_name_special15) {
  screen=_screen;
@ -100,7 +101,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
  _block_bio_size=device->block_bio_pair();
  compile_kernels(*ucl_device,pair_program,k_name_multipole,
                  k_name_udirect2b, k_name_umutual2b,k_name_polar,
-                  k_name_short_nbor, k_name_special15);
+                  k_name_fphi_uind, k_name_short_nbor, k_name_special15);
  if (_threads_per_atom>1 && gpu_nbor==0) {
    nbor->packing(true);
@ -934,6 +935,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                  const char *kname_udirect2b,
                                  const char *kname_umutual2b,
                                  const char *kname_polar,
                                  const char *kname_fphi_uind,
                                  const char *kname_short_nbor,
                                  const char* kname_special15) {
  if (_compiled)
@ -942,17 +944,17 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
  if (pair_program) delete pair_program;
  pair_program=new UCL_Program(dev);
  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
-  pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
+  pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen);
-  k_multipole.set_function(*pair_program,kname_multipole);
+  k_multipole.set_function(*pair_program, kname_multipole);
-  k_udirect2b.set_function(*pair_program,kname_udirect2b);
+  k_udirect2b.set_function(*pair_program, kname_udirect2b);
-  k_umutual2b.set_function(*pair_program,kname_umutual2b);
+  k_umutual2b.set_function(*pair_program, kname_umutual2b);
-  k_polar.set_function(*pair_program,kname_polar);
+  k_polar.set_function(*pair_program, kname_polar);
-  k_fphi_uind.set_function(*pair_program,"k_fphi_uind");
+  k_fphi_uind.set_function(*pair_program, kname_fphi_uind);
-  k_short_nbor.set_function(*pair_program,kname_short_nbor);
+  k_short_nbor.set_function(*pair_program, kname_short_nbor);
-  k_special15.set_function(*pair_program,kname_special15);
+  k_special15.set_function(*pair_program, kname_special15);
-  pos_tex.get_texture(*pair_program,"pos_tex");
+  pos_tex.get_texture(*pair_program, "pos_tex");
-  q_tex.get_texture(*pair_program,"q_tex");
+  q_tex.get_texture(*pair_program, "q_tex");
  _compiled=true;
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -62,9 +62,10 @@ class BaseAmoeba {
  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                  const int maxspecial, const int maxspecial15, const double cell_size,
                  const double gpu_split, FILE *screen, const void *pair_program,
-                  const char *kname_multipole,
+                  const char *kname_multipole, const char *kname_udirect2b,
-                  const char *kname_udirect2b, const char *kname_umutual2b,
+                  const char *kname_umutual2b, const char *kname_polar,
-                  const char *kname_polar, const char *kname_short_nbor, const char* kname_special15);
+                  const char *kname_fphi_uind, const char *kname_short_nbor,
                  const char* kname_special15);
  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead(const int add_kernels=0);
@ -309,7 +310,8 @@ class BaseAmoeba {
  void compile_kernels(UCL_Device &dev, const void *pair_string,
     const char *kname_multipole, const char *kname_udirect2b,
     const char *kname_umutual2b, const char *kname_polar,
-     const char *kname_short_nbor, const char* kname_special15);
+     const char *kname_fphi_uind, const char *kname_short_nbor,
     const char* kname_special15);
  virtual int multipole_real(const int eflag, const int vflag) = 0;
  virtual int udirect2b(const int eflag, const int vflag) = 0;
--- a/lib/gpu/lal_hippo.cpp
+++ b/lib/gpu/lal_hippo.cpp
@ -67,7 +67,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
                            cell_size,gpu_split,_screen,hippo,
                            "k_hippo_multipole", "k_hippo_udirect2b",
                            "k_hippo_umutual2b", "k_hippo_polar",
-                            "k_hippo_short_nbor", "k_hippo_special15");
+                            "k_hippo_fphi_uind", "k_hippo_short_nbor",
                            "k_hippo_special15");
  if (success!=0)
    return success;
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@ -2045,6 +2045,307 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
     offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
 }
 /* ----------------------------------------------------------------------
   fphi_uind = induced potential from grid
   fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
 ------------------------------------------------------------------------- */
 __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
                          const __global numtyp4 *restrict thetai2,
                          const __global numtyp4 *restrict thetai3,
                          const __global int *restrict igrid,
                          const __global numtyp *restrict grid,
                          __global numtyp *restrict fdip_phi1,
                          __global numtyp *restrict fdip_phi2,
                          __global numtyp *restrict fdip_sum_phi,
                          const int bsorder, const int inum,
                          const int nzlo_out, const int nylo_out,
                          const int nxlo_out, const int ngridxy,
                          const int ngridx)
 {
  //int tid, ii, offset, i, n_stride;
  //atom_info(t_per_atom,ii,tid,offset);
  int tid=THREAD_ID_X;
  int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;
  if (ii<inum) {
    int nlpts = (bsorder-1) / 2;
    int istart = fast_mul(ii,4);
    int igridx = igrid[istart];
    int igridy = igrid[istart+1];
    int igridz = igrid[istart+2];
    // now istart is used to index thetai1, thetai2 and thetai3
    istart = fast_mul(ii,bsorder);
    // extract the permanent multipole field at each site
    numtyp tuv100_1 = (numtyp)0.0;
    numtyp tuv010_1 = (numtyp)0.0;
    numtyp tuv001_1 = (numtyp)0.0;
    numtyp tuv200_1 = (numtyp)0.0;
    numtyp tuv020_1 = (numtyp)0.0;
    numtyp tuv002_1 = (numtyp)0.0;
    numtyp tuv110_1 = (numtyp)0.0;
    numtyp tuv101_1 = (numtyp)0.0;
    numtyp tuv011_1 = (numtyp)0.0;
    numtyp tuv100_2 = (numtyp)0.0;
    numtyp tuv010_2 = (numtyp)0.0;
    numtyp tuv001_2 = (numtyp)0.0;
    numtyp tuv200_2 = (numtyp)0.0;
    numtyp tuv020_2 = (numtyp)0.0;
    numtyp tuv002_2 = (numtyp)0.0;
    numtyp tuv110_2 = (numtyp)0.0;
    numtyp tuv101_2 = (numtyp)0.0;
    numtyp tuv011_2 = (numtyp)0.0;
    numtyp tuv000 = (numtyp)0.0;
    numtyp tuv001 = (numtyp)0.0;
    numtyp tuv010 = (numtyp)0.0;
    numtyp tuv100 = (numtyp)0.0;
    numtyp tuv200 = (numtyp)0.0;
    numtyp tuv020 = (numtyp)0.0;
    numtyp tuv002 = (numtyp)0.0;
    numtyp tuv110 = (numtyp)0.0;
    numtyp tuv101 = (numtyp)0.0;
    numtyp tuv011 = (numtyp)0.0;
    numtyp tuv300 = (numtyp)0.0;
    numtyp tuv030 = (numtyp)0.0;
    numtyp tuv003 = (numtyp)0.0;
    numtyp tuv210 = (numtyp)0.0;
    numtyp tuv201 = (numtyp)0.0;
    numtyp tuv120 = (numtyp)0.0;
    numtyp tuv021 = (numtyp)0.0;
    numtyp tuv102 = (numtyp)0.0;
    numtyp tuv012 = (numtyp)0.0;
    numtyp tuv111 = (numtyp)0.0;
    int k = (igridz - nzlo_out) - nlpts;
    for (int kb = 0; kb < bsorder; kb++) {
      /*
      v0 = thetai3[m][kb][0];
      v1 = thetai3[m][kb][1];
      v2 = thetai3[m][kb][2];
      v3 = thetai3[m][kb][3];
      */
      int i3 = istart + kb;
      numtyp4 tha3 = thetai3[i3];
      numtyp v0 = tha3.x;
      numtyp v1 = tha3.y;
      numtyp v2 = tha3.z;
      numtyp v3 = tha3.w;
      numtyp tu00_1 = (numtyp)0.0;
      numtyp tu01_1 = (numtyp)0.0;
      numtyp tu10_1 = (numtyp)0.0;
      numtyp tu20_1 = (numtyp)0.0;
      numtyp tu11_1 = (numtyp)0.0;
      numtyp tu02_1 = (numtyp)0.0;
      numtyp tu00_2 = (numtyp)0.0;
      numtyp tu01_2 = (numtyp)0.0;
      numtyp tu10_2 = (numtyp)0.0;
      numtyp tu20_2 = (numtyp)0.0;
      numtyp tu11_2 = (numtyp)0.0;
      numtyp tu02_2 = (numtyp)0.0;
      numtyp tu00 = (numtyp)0.0;
      numtyp tu10 = (numtyp)0.0;
      numtyp tu01 = (numtyp)0.0;
      numtyp tu20 = (numtyp)0.0;
      numtyp tu11 = (numtyp)0.0;
      numtyp tu02 = (numtyp)0.0;
      numtyp tu30 = (numtyp)0.0;
      numtyp tu21 = (numtyp)0.0;
      numtyp tu12 = (numtyp)0.0;
      numtyp tu03 = (numtyp)0.0;
      int j = (igridy - nylo_out) - nlpts;
      for (int jb = 0; jb < bsorder; jb++) {
        /*
        u0 = thetai2[m][jb][0];
        u1 = thetai2[m][jb][1];
        u2 = thetai2[m][jb][2];
        u3 = thetai2[m][jb][3];
        */
        int i2 = istart + jb;
        numtyp4 tha2 = thetai2[i2];
        numtyp u0 = tha2.x;
        numtyp u1 = tha2.y;
        numtyp u2 = tha2.z;
        numtyp u3 = tha2.w;
        numtyp t0_1 = (numtyp)0.0;
        numtyp t1_1 = (numtyp)0.0;
        numtyp t2_1 = (numtyp)0.0;
        numtyp t0_2 = (numtyp)0.0;
        numtyp t1_2 = (numtyp)0.0;
        numtyp t2_2 = (numtyp)0.0;
        numtyp t3 = (numtyp)0.0;
        int i = (igridx - nxlo_out) - nlpts;
        for (int ib = 0; ib < bsorder; ib++) {
          /*
          tq_1 = grid[k][j][i][0];
          tq_2 = grid[k][j][i][1];
          t0_1 += tq_1*thetai1[m][ib][0];
          t1_1 += tq_1*thetai1[m][ib][1];
          t2_1 += tq_1*thetai1[m][ib][2];
          t0_2 += tq_2*thetai1[m][ib][0];
          t1_2 += tq_2*thetai1[m][ib][1];
          t2_2 += tq_2*thetai1[m][ib][2];
          t3 += (tq_1+tq_2)*thetai1[m][ib][3];
          */
          int i1 = istart + ib;
          numtyp4 tha1 = thetai1[i1];
          numtyp w0 = tha1.x;
          numtyp w1 = tha1.y;
          numtyp w2 = tha1.z;
          numtyp w3 = tha1.w;
          int gidx = 2*(k*ngridxy + j*ngridx + i);
          numtyp tq_1 = grid[gidx];
          numtyp tq_2 = grid[gidx+1];
          t0_1 += tq_1*w0;
          t1_1 += tq_1*w1;
          t2_1 += tq_1*w2;
          t0_2 += tq_2*w0;
          t1_2 += tq_2*w1;
          t2_2 += tq_2*w2;
          t3 += (tq_1+tq_2)*w3;
          i++;
        }
        tu00_1 += t0_1*u0;
        tu10_1 += t1_1*u0;
        tu01_1 += t0_1*u1;
        tu20_1 += t2_1*u0;
        tu11_1 += t1_1*u1;
        tu02_1 += t0_1*u2;
        tu00_2 += t0_2*u0;
        tu10_2 += t1_2*u0;
        tu01_2 += t0_2*u1;
        tu20_2 += t2_2*u0;
        tu11_2 += t1_2*u1;
        tu02_2 += t0_2*u2;
        numtyp t0 = t0_1 + t0_2;
        numtyp t1 = t1_1 + t1_2;
        numtyp t2 = t2_1 + t2_2;
        tu00 += t0*u0;
        tu10 += t1*u0;
        tu01 += t0*u1;
        tu20 += t2*u0;
        tu11 += t1*u1;
        tu02 += t0*u2;
        tu30 += t3*u0;
        tu21 += t2*u1;
        tu12 += t1*u2;
        tu03 += t0*u3;
        j++;
      }
      tuv100_1 += tu10_1*v0;
      tuv010_1 += tu01_1*v0;
      tuv001_1 += tu00_1*v1;
      tuv200_1 += tu20_1*v0;
      tuv020_1 += tu02_1*v0;
      tuv002_1 += tu00_1*v2;
      tuv110_1 += tu11_1*v0;
      tuv101_1 += tu10_1*v1;
      tuv011_1 += tu01_1*v1;
      tuv100_2 += tu10_2*v0;
      tuv010_2 += tu01_2*v0;
      tuv001_2 += tu00_2*v1;
      tuv200_2 += tu20_2*v0;
      tuv020_2 += tu02_2*v0;
      tuv002_2 += tu00_2*v2;
      tuv110_2 += tu11_2*v0;
      tuv101_2 += tu10_2*v1;
      tuv011_2 += tu01_2*v1;
      tuv000 += tu00*v0;
      tuv100 += tu10*v0;
      tuv010 += tu01*v0;
      tuv001 += tu00*v1;
      tuv200 += tu20*v0;
      tuv020 += tu02*v0;
      tuv002 += tu00*v2;
      tuv110 += tu11*v0;
      tuv101 += tu10*v1;
      tuv011 += tu01*v1;
      tuv300 += tu30*v0;
      tuv030 += tu03*v0;
      tuv003 += tu00*v3;
      tuv210 += tu21*v0;
      tuv201 += tu20*v1;
      tuv120 += tu12*v0;
      tuv021 += tu02*v1;
      tuv102 += tu10*v2;
      tuv012 += tu01*v2;
      tuv111 += tu11*v1;
      k++;
    }
    int idx;
    numtyp fdip_buf[20];
    fdip_buf[0] = (numtyp)0.0;
    fdip_buf[1] = tuv100_1;
    fdip_buf[2] = tuv010_1;
    fdip_buf[3] = tuv001_1;
    fdip_buf[4] = tuv200_1;
    fdip_buf[5] = tuv020_1;
    fdip_buf[6] = tuv002_1;
    fdip_buf[7] = tuv110_1;
    fdip_buf[8] = tuv101_1;
    fdip_buf[9] = tuv011_1;
    idx = ii;    
    for (int m = 0; m < 10; m++) {
      fdip_phi1[idx] = fdip_buf[m];
      idx += inum;
    }
    fdip_buf[0] = (numtyp)0.0;
    fdip_buf[1] = tuv100_2;
    fdip_buf[2] = tuv010_2;
    fdip_buf[3] = tuv001_2;
    fdip_buf[4] = tuv200_2;
    fdip_buf[5] = tuv020_2;
    fdip_buf[6] = tuv002_2;
    fdip_buf[7] = tuv110_2;
    fdip_buf[8] = tuv101_2;
    fdip_buf[9] = tuv011_2;
    idx = ii;    
    for (int m = 0; m < 10; m++) {
      fdip_phi2[idx] = fdip_buf[m];
      idx += inum;
    }
    fdip_buf[0] = tuv000;
    fdip_buf[1] = tuv100;
    fdip_buf[2] = tuv010;
    fdip_buf[3] = tuv001;
    fdip_buf[4] = tuv200;
    fdip_buf[5] = tuv020;
    fdip_buf[6] = tuv002;
    fdip_buf[7] = tuv110;
    fdip_buf[8] = tuv101;
    fdip_buf[9] = tuv011;
    fdip_buf[10] = tuv300;
    fdip_buf[11] = tuv030;
    fdip_buf[12] = tuv003;
    fdip_buf[13] = tuv210;
    fdip_buf[14] = tuv201;
    fdip_buf[15] = tuv120;
    fdip_buf[16] = tuv021;
    fdip_buf[17] = tuv102;
    fdip_buf[18] = tuv012;
    fdip_buf[19] = tuv111;
    idx = ii;    
    for (int m = 0; m < 20; m++) {
      fdip_sum_phi[idx] = fdip_buf[m];
      idx += inum;
    }
  }
 }
 /* ----------------------------------------------------------------------
   scan standard neighbor list and make it compatible with 1-5 neighbors
   if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
--- a/lib/gpu/lal_hippo_ext.cpp
+++ b/lib/gpu/lal_hippo_ext.cpp
@ -193,6 +193,20 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **
                             eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
 }
 void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
                          double ***host_thetai1, double ***host_thetai2,
                          double ***host_thetai3, int** igrid, double ****host_grid_brick,
                          void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
                          const int nzlo_out, const int nzhi_out,
                          const int nylo_out, const int nyhi_out,
                          const int nxlo_out, const int nxhi_out,
                          bool& first_iteration) {
   HIPPOMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2,
                             host_thetai3, igrid, host_grid_brick, host_fdip_phi1,
                             host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out,
                             nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration);
 }
 double hippo_gpu_bytes() {
  return HIPPOMF.host_memory_usage();
 }
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -938,7 +938,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp)
    }
  }
 */
-  // accumulate the field and fieldp values from the real space portion from umutual2b() on the GPU
+  // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU
  //   field and fieldp may already have some nonzero values from kspace (umutual1 and self)
  amoeba_gpu_update_fieldp(&fieldp_pinned);
--- a/src/GPU/pair_hippo_gpu.cpp
+++ b/src/GPU/pair_hippo_gpu.cpp
@ -18,7 +18,7 @@
 #include "pair_hippo_gpu.h"
-#include "amoeba_convolution.h"
+#include "amoeba_convolution_gpu.h"
 #include "atom.h"
 #include "comm.h"
 #include "domain.h"
@ -46,6 +46,9 @@ enum{GEAR,ASPC,LSQR};
 enum{BUILD,APPLY};
 enum{GORDON1,GORDON2};
 // same as in pair_amoeba.cpp
 enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC};
 #define DEBYE 4.80321    // conversion factor from q-Angs (real units) to Debye
 // External functions from cuda library for atom decomposition
@ -102,6 +105,16 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
 void hippo_gpu_update_fieldp(void **fieldp_ptr);
 void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
                          double ***host_thetai1, double ***host_thetai2,
                          double ***host_thetai3, int** igrid,
                          double ****host_grid_brick, void **host_fdip_phi1,
                          void **host_fdip_phi2, void **host_fdip_sum_phi,
                          const int nzlo_out, const int nzhi_out,
                          const int nylo_out, const int nyhi_out,
                          const int nxlo_out, const int nxhi_out,
                          bool& first_iteration);
 void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
              double **host_rpole, double **host_uind, double **host_uinp, double *host_pval,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
@ -129,8 +142,10 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
  gpu_dispersion_real_ready = true;
  gpu_multipole_real_ready = true;
  gpu_udirect2b_ready = true;
  gpu_umutual1_ready = true;
  gpu_fphi_uind_ready = true;
  gpu_umutual2b_ready = true;
-  gpu_polar_real_ready = true;
+  gpu_polar_real_ready = true;         // need to be true for copying data from device back to host
  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }
@ -198,6 +213,16 @@ void PairHippoGPU::init_style()
    tq_single = false;
  else
    tq_single = true;
  // replace with the gpu counterpart
  if (gpu_umutual1_ready) {
    if (use_ewald && ic_kspace) {
      delete ic_kspace;
      ic_kspace =
        new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC);
    }
  }
 }
 /* ----------------------------------------------------------------------
@ -392,6 +417,8 @@ void PairHippoGPU::induce()
  int debug = 1;
  first_induce_iteration  = true;
  // set cutoffs, taper coeffs, and PME params
  // create qfac here, free at end of polar()
@ -403,8 +430,6 @@ void PairHippoGPU::induce()
  // owned atoms
  double **x = atom->x;
  double **f = atom->f;
  int nlocal = atom->nlocal;
  // zero out the induced dipoles at each site
@ -996,37 +1021,60 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp)
  int i,j;
  double term;
  double time0,time1,time2;
  // zero field,fieldp for owned and ghost atoms
  int nlocal = atom->nlocal;
  int nall = nlocal + atom->nghost;
-  for (i = 0; i < nall; i++) {
+  memset(&field[0][0], 0, 3*nall *sizeof(double));
-    for (j = 0; j < 3; j++) {
+  memset(&fieldp[0][0], 0, 3*nall *sizeof(double));
 /*  
  for (int i = 0; i < nall; i++) {
    for (int j = 0; j < 3; j++) {
      field[i][j] = 0.0;
      fieldp[i][j] = 0.0;
    }
  }
-
+*/
  // get the real space portion of the mutual field first
  MPI_Barrier(world);
  time0 = MPI_Wtime();
  if (polar_rspace_flag) umutual2b(field,fieldp);
  time1 = MPI_Wtime();
  // get the reciprocal space part of the mutual field
  if (polar_kspace_flag) umutual1(field,fieldp);
  time2 = MPI_Wtime();
  // add the self-energy portion of the mutual field
  term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS;
  for (int i = 0; i < nlocal; i++) {
    field[i][0] += term*uind[i][0];
    field[i][1] += term*uind[i][1];
    field[i][2] += term*uind[i][2];
  }
  for (int i = 0; i < nlocal; i++) {
    fieldp[i][0] += term*uinp[i][0];
    fieldp[i][1] += term*uinp[i][1];
    fieldp[i][2] += term*uinp[i][2];
  }
 /*  
  for (i = 0; i < nlocal; i++) {
    for (j = 0; j < 3; j++) {
      field[i][j] += term*uind[i][j];
      fieldp[i][j] += term*uinp[i][j];
    }
  }
-
+*/
-  // accumulate the field and fieldp values from real-space portion from umutual2b() on the GPU
+  // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU
  //   field and fieldp may already have some nonzero values from kspace (umutual1 and self)
  hippo_gpu_update_fieldp(&fieldp_pinned);
@ -1049,6 +1097,228 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp)
    fieldp[i][1] += fieldp_ptr[idx+1];
    fieldp[i][2] += fieldp_ptr[idx+2];
  }
  // accumulate timing information
  time_mutual_rspace += time1 - time0;
  time_mutual_kspace += time2 - time1;
 }
 /* ----------------------------------------------------------------------
   umutual1 = Ewald recip mutual induced field
   umutual1 computes the reciprocal space contribution of the
   induced atomic dipole moments to the field
 ------------------------------------------------------------------------- */
 void PairHippoGPU::umutual1(double **field, double **fieldp)
 {
  int m,n;
  int nxlo,nxhi,nylo,nyhi,nzlo,nzhi;
  double term;
  double a[3][3];  // indices not flipped vs Fortran
  // return if the Ewald coefficient is zero
  if (aewald < 1.0e-6) return;
  // convert Cartesian dipoles to fractional coordinates
  for (int j = 0; j < 3; j++) {
    a[0][j] = nfft1 * recip[0][j];
    a[1][j] = nfft2 * recip[1][j];
    a[2][j] = nfft3 * recip[2][j];
  }
  int nlocal = atom->nlocal;
  for (int i = 0; i < nlocal; i++) {
    fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2];
    fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2];
    fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2];
  }
  for (int i = 0; i < nlocal; i++) {      
    fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2];
    fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2];
    fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2];
  }
 /*
  for (i = 0; i < nlocal; i++) {
    for (j = 0; j < 3; j++) {
      fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2];
      fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2];
    }
  }
 */
  // gridpre = my portion of 4d grid in brick decomp w/ ghost values
  double ****gridpre = (double ****) ic_kspace->zero();
  // map 2 values to grid
  grid_uind(fuind,fuinp,gridpre);
  // pre-convolution operations including forward FFT
  // gridfft = my portion of complex 3d grid in FFT decomposition
  double *gridfft = ic_kspace->pre_convolution();
  // ---------------------
  // convolution operation
  // ---------------------
  nxlo = ic_kspace->nxlo_fft;
  nxhi = ic_kspace->nxhi_fft;
  nylo = ic_kspace->nylo_fft;
  nyhi = ic_kspace->nyhi_fft;
  nzlo = ic_kspace->nzlo_fft;
  nzhi = ic_kspace->nzhi_fft;
  // use qfac values stored in udirect1()
  m = n = 0;
  for (int k = nzlo; k <= nzhi; k++) {
    for (int j = nylo; j <= nyhi; j++) {
      for (int i = nxlo; i <= nxhi; i++) {
        term = qfac[m++];
        gridfft[n] *= term;
        gridfft[n+1] *= term;
        n += 2;
      }
    }
  }
  // post-convolution operations including backward FFT
  // gridppost = my portion of 4d grid in brick decomp w/ ghost values
  double ****gridpost = (double ****) ic_kspace->post_convolution();
  // get potential
  double time0, time1;
  MPI_Barrier(world);
  time0 = MPI_Wtime();
  fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
  time1 = MPI_Wtime();
  time_fphi_uind += (time1 - time0);
  // store fractional reciprocal potentials for OPT method
  if (poltyp == OPT) {
    for (int i = 0; i < nlocal; i++) {
      for (int j = 0; j < 10; j++) {
        fopt[i][optlevel][j] = fdip_phi1[i][j];
        foptp[i][optlevel][j] = fdip_phi2[i][j];
      }
    }
  }
  // convert the dipole fields from fractional to Cartesian
  for (int i = 0; i < 3; i++) {
    a[0][i] = nfft1 * recip[0][i];
    a[1][i] = nfft2 * recip[1][i];
    a[2][i] = nfft3 * recip[2][i];
  }
  for (int i = 0; i < nlocal; i++) {
    double dfx = a[0][0]*fdip_phi1[i][1] +
      a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3];
    double dfy = a[1][0]*fdip_phi1[i][1] +
      a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3];
    double dfz = a[2][0]*fdip_phi1[i][1] +
      a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3];
    field[i][0] -= dfx;
    field[i][1] -= dfy;
    field[i][2] -= dfz;
  }
  for (int i = 0; i < nlocal; i++) {
    double dfx = a[0][0]*fdip_phi2[i][1] +
      a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3];
    double dfy = a[1][0]*fdip_phi2[i][1] +
      a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3];
    double dfz = a[2][0]*fdip_phi2[i][1] +
      a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3];
    fieldp[i][0] -= dfx;
    fieldp[i][1] -= dfy;
    fieldp[i][2] -= dfz;
  }
 /*
  for (int i = 0; i < nlocal; i++) {
    for (j = 0; j < 3; j++) {
      dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] +
        a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3];
      dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] +
        a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3];
    }
  }
  // increment the field at each multipole site
  for (i = 0; i < nlocal; i++) {
    for (j = 0; j < 3; j++) {
      field[i][j] -= dipfield1[i][j];
      fieldp[i][j] -= dipfield2[i][j];
    }
  }
 */
 }
 /* ----------------------------------------------------------------------
   fphi_uind = induced potential from grid
   fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
 ------------------------------------------------------------------------- */
 void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1,
                              double **fdip_phi2, double **fdip_sum_phi)
 {
  if (!gpu_fphi_uind_ready) {
    PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi);
    return;
  }
  void* fdip_phi1_pinned = nullptr;
  void* fdip_phi2_pinned = nullptr;
  void* fdip_sum_phi_pinned = nullptr;
  hippo_gpu_fphi_uind(atom->nlocal, bsorder, thetai1,
                       thetai2, thetai3, igrid, grid,
                       &fdip_phi1_pinned, &fdip_phi2_pinned,
                       &fdip_sum_phi_pinned,
                       ic_kspace->nzlo_out, ic_kspace->nzhi_out,
                       ic_kspace->nylo_out, ic_kspace->nyhi_out,
                       ic_kspace->nxlo_out, ic_kspace->nxhi_out,
                       first_induce_iteration);
  int nlocal = atom->nlocal;
  double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned;
  for (int i = 0; i < nlocal; i++) {
    int n = i;
    for (int m = 0; m < 10; m++) {
      fdip_phi1[i][m] = _fdip_phi1_ptr[n];
      n += nlocal;
    }
  }
  double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned;
  for (int i = 0; i < nlocal; i++) {
    int n = i;
    for (int m = 0; m < 10; m++) {
      fdip_phi2[i][m] = _fdip_phi2_ptr[n];
      n += nlocal;
    }
  }
  double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
  for (int i = 0; i < nlocal; i++) {
    int n = i;
    for (int m = 0; m < 20; m++) {
      fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n];
      n += nlocal;
    }
  }
 }
 /* ----------------------------------------------------------------------
@ -1089,29 +1359,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp)
  double *pval = atom->dvector[index_pval];
  hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval,
                              aewald, off2, &fieldp_pinned);
 /*
  // accumulate the field and fieldp values from the GPU lib
  //   field and fieldp may already have some nonzero values from kspace (umutual1)
  int nlocal = atom->nlocal;
  double *field_ptr = (double *)fieldp_pinned;
  for (int i = 0; i < nlocal; i++) {
    int idx = 4*i;
    field[i][0] += field_ptr[idx];
    field[i][1] += field_ptr[idx+1];
    field[i][2] += field_ptr[idx+2];
  }
  double* fieldp_ptr = (double *)fieldp_pinned;
  fieldp_ptr += 4*inum;
  for (int i = 0; i < nlocal; i++) {
    int idx = 4*i;
    fieldp[i][0] += fieldp_ptr[idx];
    fieldp[i][1] += fieldp_ptr[idx+1];
    fieldp[i][2] += fieldp_ptr[idx+2];
  }
 */
 }
 /* ----------------------------------------------------------------------
--- a/src/GPU/pair_hippo_gpu.h
+++ b/src/GPU/pair_hippo_gpu.h
@ -39,6 +39,8 @@ class PairHippoGPU : public PairAmoeba {
  virtual void dispersion_real();
  virtual void multipole_real();
  virtual void udirect2b(double **, double **);
  virtual void umutual1(double **, double **);
  virtual void fphi_uind(double ****, double **, double **, double **);
  virtual void umutual2b(double **, double **);
  virtual void ufield0c(double **, double **);
  virtual void polar_real();
@ -55,9 +57,13 @@ class PairHippoGPU : public PairAmoeba {
  bool gpu_dispersion_real_ready;
  bool gpu_multipole_real_ready;
  bool gpu_udirect2b_ready;
  bool gpu_umutual1_ready;
  bool gpu_fphi_uind_ready;
  bool gpu_umutual2b_ready;
  bool gpu_polar_real_ready;
  bool first_induce_iteration;
  void udirect2b_cpu();
  template<class numtyp>