Updated fphi_mpole, renamed precompute_induce to precompute_kspace

2022-09-28 15:08:18 -05:00
parent 166701f13a
commit e6d2582642
8 changed files with 129 additions and 101 deletions
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@ -68,7 +68,34 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)

 # host code compilation

-$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
+$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
+	$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
+	$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
+	$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
+	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
+	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
 	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)

 #ifdef CUDPP_OPT
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -1630,7 +1630,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
                          const __global numtyp4 *restrict thetai2,
                          const __global numtyp4 *restrict thetai3,
                          const __global int *restrict igrid,
-                          const __global numtyp *restrict grid,
+                          const __global numtyp2 *restrict grid,
                          __global numtyp *restrict fdip_phi1,
                          __global numtyp *restrict fdip_phi2,
                          __global numtyp *restrict fdip_sum_phi,
@ -1648,12 +1648,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,

  if (ii<inum) {

-    int nlpts = (bsorder-1) / 2;
+    const int nlpts = (bsorder-1) / 2;
    
    int istart = fast_mul(ii,4);
-    int igridx = igrid[istart];
-    int igridy = igrid[istart+1];
-    int igridz = igrid[istart+2];
+    const int igridx = igrid[istart];
+    const int igridy = igrid[istart+1];
+    const int igridz = igrid[istart+2];
    
    // now istart is used to index thetai1, thetai2 and thetai3
    istart = fast_mul(ii,bsorder);
@ -1701,18 +1701,13 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,

    int k = (igridz - nzlo_out) - nlpts;
    for (int kb = 0; kb < bsorder; kb++) {
-      /*
-      v0 = thetai3[m][kb][0];
-      v1 = thetai3[m][kb][1];
-      v2 = thetai3[m][kb][2];
-      v3 = thetai3[m][kb][3];
-      */
-      int i3 = istart + kb;
-      numtyp4 tha3 = thetai3[i3];
-      numtyp v0 = tha3.x;
-      numtyp v1 = tha3.y;
-      numtyp v2 = tha3.z;
-      numtyp v3 = tha3.w;
+      const int mz = fast_mul(k, ngridxy);
+      const int i3 = istart + kb;
+      const numtyp4 tha3 = thetai3[i3];
+      const numtyp v0 = tha3.x; // thetai3[m][kb][0];
+      const numtyp v1 = tha3.y; // thetai3[m][kb][1];
+      const numtyp v2 = tha3.z; // thetai3[m][kb][2];
+      const numtyp v3 = tha3.w; // thetai3[m][kb][3];
      numtyp tu00_1 = (numtyp)0.0;
      numtyp tu01_1 = (numtyp)0.0;
      numtyp tu10_1 = (numtyp)0.0;
@ -1738,18 +1733,13 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,

      int j = (igridy - nylo_out) - nlpts;
      for (int jb = 0; jb < bsorder; jb++) {
-        /*
-        u0 = thetai2[m][jb][0];
-        u1 = thetai2[m][jb][1];
-        u2 = thetai2[m][jb][2];
-        u3 = thetai2[m][jb][3];
-        */
-        int i2 = istart + jb;
-        numtyp4 tha2 = thetai2[i2];
-        numtyp u0 = tha2.x;
-        numtyp u1 = tha2.y;
-        numtyp u2 = tha2.z;
-        numtyp u3 = tha2.w;
+        const int my = mz + fast_mul(j, ngridx);
+        const int i2 = istart + jb;
+        const numtyp4 tha2 = thetai2[i2];
+        const numtyp u0 = tha2.x; // thetai2[m][jb][0];
+        const numtyp u1 = tha2.y; // thetai2[m][jb][1];
+        const numtyp u2 = tha2.z; // thetai2[m][jb][2];
+        const numtyp u3 = tha2.w; // thetai2[m][jb][3];
        numtyp t0_1 = (numtyp)0.0;
        numtyp t1_1 = (numtyp)0.0;
        numtyp t2_1 = (numtyp)0.0;
@ -1771,22 +1761,25 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
          t2_2 += tq_2*thetai1[m][ib][2];
          t3 += (tq_1+tq_2)*thetai1[m][ib][3];
          */
-          int i1 = istart + ib;
-          numtyp4 tha1 = thetai1[i1];
-          numtyp w0 = tha1.x;
-          numtyp w1 = tha1.y;
-          numtyp w2 = tha1.z;
-          numtyp w3 = tha1.w;
-          int gidx = 2*(k*ngridxy + j*ngridx + i);
-          numtyp tq_1 = grid[gidx];
-          numtyp tq_2 = grid[gidx+1];
-          t0_1 += tq_1*w0;
-          t1_1 += tq_1*w1;
-          t2_1 += tq_1*w2;
-          t0_2 += tq_2*w0;
-          t1_2 += tq_2*w1;
-          t2_2 += tq_2*w2;
-          t3 += (tq_1+tq_2)*w3;
+          const int i1 = istart + ib;
+          const numtyp4 tha1 = thetai1[i1];
+          /*
+          const numtyp w0 = tha1.x;
+          const numtyp w1 = tha1.y;
+          const numtyp w2 = tha1.z;
+          const numtyp w3 = tha1.w;
+          */
+          const int gidx = my + i; // k*ngridxy + j*ngridx + i;
+          const numtyp2 tq = grid[gidx];
+          const numtyp tq_1 = tq.x; //grid[gidx];
+          const numtyp tq_2 = tq.y; //grid[gidx+1];
+          t0_1 += tq_1*tha1.x; // w0
+          t1_1 += tq_1*tha1.y; // w1
+          t2_1 += tq_1*tha1.z; // w2
+          t0_2 += tq_2*tha1.x; // w0
+          t1_2 += tq_2*tha1.y; // w1
+          t2_2 += tq_2*tha1.z; // w2
+          t3 += (tq_1+tq_2)*tha1.w; // w3
          i++;
        }

@ -1933,9 +1926,9 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
                          const __global numtyp4 *restrict thetai2,
                          const __global numtyp4 *restrict thetai3,
                          const __global int *restrict igrid,
-                          const __global numtyp *restrict grid,
+                          const __global numtyp2 *restrict grid,
                          __global numtyp *restrict fphi,
-                          const int bsorder, const int inum,
+                          const int bsorder, const int inum, const numtyp felec,
                          const int nzlo_out, const int nylo_out,
                          const int nxlo_out, const int ngridxy,
                          const int ngridx)
@ -2027,7 +2020,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
          int i1 = istart + ib;
          numtyp4 tha1 = thetai1[i1];
          int gidx = k*ngridxy + j*ngridx + i;
-          numtyp tq = grid[gidx];
+          numtyp tq = grid[gidx].x;
          t0 += tq*tha1.x;
          t1 += tq*tha1.y;
          t2 += tq*tha1.z;
@ -2095,7 +2088,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,

    int idx = ii;    
    for (int m = 0; m < 20; m++) {
-      fphi[idx] = buf[m];
+      fphi[idx] = felec * buf[m];
      idx += inum;
    }
  }
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -162,15 +162,14 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
                              eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
 }

-void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
+void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
                          double ***host_thetai1, double ***host_thetai2,
                          double ***host_thetai3, int** igrid,
                          const int nzlo_out, const int nzhi_out,
                          const int nylo_out, const int nyhi_out,
                          const int nxlo_out, const int nxhi_out) {
-   AMOEBAMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2,
-                              host_thetai3, igrid, nzlo_out, nzhi_out,
-                              nylo_out, nyhi_out, nxlo_out, nxhi_out);
+   AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid,
+                              nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out);
 }

 void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
@ -179,8 +178,8 @@ void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
                              host_fdip_phi2, host_fdip_sum_phi);
 }

-void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi) {
-   AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi);
+void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) {
+   AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec);
 }

 void amoeba_setup_fft(const int numel, const int element_type) {
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -566,7 +566,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
 // ---------------------------------------------------------------------------

 template <class numtyp, class acctyp>
-void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
+void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
                                    double ***host_thetai1, double ***host_thetai2,
                                    double ***host_thetai3, int** host_igrid,
                                    const int nzlo_out, const int nzhi_out,
@ -660,7 +660,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
  _ngridx = nxhi_out - nxlo_out + 1;
  _num_grid_points = _ngridx * _ngridy * _ngridz;

-  int numel = _num_grid_points*2;
+  int numel = _num_grid_points;
  if (_cgrid_brick.cols() == 0) {
    _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
  } else if (numel > _cgrid_brick.cols()) {
@ -688,11 +688,13 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
  for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
    for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
      for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
-        _cgrid_brick[n] = host_grid_brick[iz][iy][ix][0];
-        _cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1];
-        n += 2;
+        numtyp2 v;
+        v.x = host_grid_brick[iz][iy][ix][0];
+        v.y = host_grid_brick[iz][iy][ix][1];
+        _cgrid_brick[n] = v;
+        n++;
      }
-  _cgrid_brick.update_device(_num_grid_points*2, false);
+  _cgrid_brick.update_device(_num_grid_points, false);

  const int red_blocks = fphi_uind();

@ -740,7 +742,7 @@ int BaseAmoebaT::fphi_uind() {
 // ---------------------------------------------------------------------------

 template <class numtyp, class acctyp>
-void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi)
+void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec)
 {
  // TODO: grid brick[k][j][i] is a scalar
  UCL_H_Vec<numtyp> hdummy;
@ -750,11 +752,15 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi
  for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
    for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
      for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
-        _cgrid_brick[n] = host_grid_brick[iz][iy][ix];
+        numtyp2 v;
+        v.x = host_grid_brick[iz][iy][ix];
+        v.y = (numtyp)0;
+        _cgrid_brick[n] = v;
        n++;
      }
  _cgrid_brick.update_device(_num_grid_points, false);

+  _felec = felec;
  const int red_blocks = fphi_mpole();

  _fdip_sum_phi.update_host(_max_thetai_size*20);
@ -776,13 +782,14 @@ int BaseAmoebaT::fphi_mpole() {

  // Compute the block size and grid size to keep all cores busy
  const int BX=block_size();
+  //printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D);
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));

  time_pair.start();
  int ngridxy = _ngridx * _ngridy;
  k_fphi_mpole.set_size(GX,BX);
  k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
-                  &_fdip_sum_phi, &_bsorder, &ainum,
+                  &_fdip_sum_phi, &_bsorder, &ainum, &_felec,
                  &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
  time_pair.stop();

--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -173,8 +173,8 @@ class BaseAmoeba {
                double **host_uind, double **host_uinp, double *host_pval,
                const double aewald, const double off2_polar, void **fieldp_ptr);

-  /// Allocate/resize per-atom arrays before induce()
-  virtual void precompute_induce(const int inum_full, const int bsorder,
+  /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
+  virtual void precompute_kspace(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
                                 const int nzlo_out, const int nzhi_out,
@ -185,7 +185,8 @@ class BaseAmoeba {
                                 void **host_fdip_phi1, void **host_fdip_phi2,
                                 void **host_fdip_sum_phi);

-  virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi);
+  virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
+                                  const double felec);

  /// Compute polar real-space with device neighboring
  virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
@ -256,7 +257,7 @@ class BaseAmoeba {
  int _bsorder;
  UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
  UCL_Vector<int,int> _igrid;
-  UCL_Vector<numtyp,numtyp> _cgrid_brick;
+  UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
  UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
  int _max_thetai_size;
  int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
--- a/lib/gpu/lal_hippo_ext.cpp
+++ b/lib/gpu/lal_hippo_ext.cpp
@ -193,13 +193,13 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **
                             eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
 }

-void hippo_gpu_precompute_induce(const int inum_full, const int bsorder,
+void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
                          double ***host_thetai1, double ***host_thetai2,
                          double ***host_thetai3, int** igrid,
                          const int nzlo_out, const int nzhi_out,
                          const int nylo_out, const int nyhi_out,
                          const int nxlo_out, const int nxhi_out) {
-   HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2,
+   HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2,
                             host_thetai3, igrid, nzlo_out, nzhi_out,
                             nylo_out, nyhi_out, nxlo_out, nxhi_out);
 }
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -88,7 +88,7 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,

 void amoeba_gpu_update_fieldp(void **fieldp_ptr);

-void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
+void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
              double ***host_thetai1, double ***host_thetai2,
              double ***host_thetai3, int** igrid,
              const int nzlo_out, const int nzhi_out,
@ -98,7 +98,8 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
 void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
                          void **host_fdip_phi2, void **host_fdip_sum_phi);

-void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi);
+void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi,
+                           const double felec);

 void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
              double **host_rpole, double **host_uind, double **host_uinp,
@ -343,7 +344,7 @@ void PairAmoebaGPU::induce()
  // must be done before the first ufield0c
  // NOTE: this is for ic_kspace, and thetai[1-3]

-  amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2,
+  amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2,
                               thetai3, igrid,
                               ic_kspace->nzlo_out, ic_kspace->nzhi_out,
                               ic_kspace->nylo_out, ic_kspace->nyhi_out,
@ -1382,11 +1383,11 @@ void PairAmoebaGPU::polar_kspace()
  
    // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill
    if (gpu_fphi_mpole_ready) {
-       amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2,
-                                    thetai3, igrid, p_kspace->nzlo_out,
-                                    p_kspace->nzhi_out, p_kspace->nylo_out,
-                                    p_kspace->nyhi_out, p_kspace->nxlo_out,
-                                    p_kspace->nxhi_out);
+       amoeba_gpu_precompute_kspace(atom->nlocal, bsorder,
+                                    thetai1, thetai2, thetai3, igrid,
+                                    p_kspace->nzlo_out, p_kspace->nzhi_out,
+                                    p_kspace->nylo_out, p_kspace->nyhi_out,
+                                    p_kspace->nxlo_out, p_kspace->nxhi_out);
    }
      

@ -1461,10 +1462,15 @@ void PairAmoebaGPU::polar_kspace()
    
    if (!gpu_fphi_mpole_ready) {
      fphi_mpole(gridpost,fphi);
-      //printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]);
+
+      for (i = 0; i < nlocal; i++) {
+        for (k = 0; k < 20; k++)
+          fphi[i][k] *= felec;
+      }
+
    } else {
      void* fphi_pinned = nullptr;
-      amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned);
+      amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec);
    
      double *_fphi_ptr = (double *)fphi_pinned;
      for (int i = 0; i < nlocal; i++) {
@ -1474,12 +1480,7 @@ void PairAmoebaGPU::polar_kspace()
          idx += nlocal;
        }
      }
-      //printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]);
-    }

-    for (i = 0; i < nlocal; i++) {
-      for (k = 0; k < 20; k++)
-        fphi[i][k] *= felec;
    }   

    // convert field from fractional to Cartesian
--- a/src/GPU/pair_hippo_gpu.cpp
+++ b/src/GPU/pair_hippo_gpu.cpp
@ -105,7 +105,7 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,

 void hippo_gpu_update_fieldp(void **fieldp_ptr);

-void hippo_gpu_precompute_induce(const int inum_full, const int bsorder,
+void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
              double ***host_thetai1, double ***host_thetai2,
              double ***host_thetai3, int** igrid,
              const int nzlo_out, const int nzhi_out,
@ -475,7 +475,7 @@ void PairHippoGPU::induce()
  // allocate memory and make early host-device transfers
  // must be done before the first ufield0c

-  hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2,
+  hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2,
                              thetai3, igrid,
                              ic_kspace->nzlo_out, ic_kspace->nzhi_out,
                              ic_kspace->nylo_out, ic_kspace->nyhi_out,