Made some progress with fphi_uind in the gpu pair style

2022-09-09 12:14:36 -05:00
parent a0af9627e5
commit 4b8caac727
5 changed files with 114 additions and 69 deletions
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -14,7 +14,7 @@
 // ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
-//#include <stdio.h>
+#include <stdio.h>
 #include "lal_aux_fun1.h"
 #ifdef LAMMPS_SMALLBIG
 #define tagint int
@ -1630,14 +1630,19 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
                          __global numtyp *restrict fdip_phi2,
                          __global numtyp *restrict fdip_sum_phi,
                          const int bsorder, const int inum,
-                          const int nyzgrid, const int nygrid,
-                          const int t_per_atom)
+                          const int nzlo_out, const int nzhi_out,
+                          const int nylo_out, const int nyhi_out,
+                          const int nxlo_out, const int nxhi_out,
+                          const int ngridxy, const int ngridx)
 {
-  int tid, ii, offset, i, n_stride;
-  atom_info(t_per_atom,ii,tid,offset);
+  //int tid, ii, offset, i, n_stride;
+  //atom_info(t_per_atom,ii,tid,offset);
+
+  int tid=THREAD_ID_X;
+  int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;

  if (ii<inum) {
-    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp4 ix; fetch4(ix,ii,pos_tex); //x_[i];

    int j,k,m;
    numtyp v0,v1,v2,v3;
@ -1706,7 +1711,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
    tuv012 = (numtyp)0.0;
    tuv111 = (numtyp)0.0;

-    k = igrid[4*i+2] - nlpts;
+    k = igrid[4*ii+2] - nzlo_out - nlpts;
    for (int kb = 0; kb < bsorder; kb++) {
      /*
      v0 = thetai3[m][kb][0];
@ -1714,9 +1719,9 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
      v2 = thetai3[m][kb][2];
      v3 = thetai3[m][kb][3];
      */
-      int i3 = i*4*bsorder + 4*kb;
+      int i3 = ii*4*bsorder + 4*kb;
      v0 = thetai3[i3];
-      v1 = thetai3[i3]+1;
+      v1 = thetai3[i3+1];
      v2 = thetai3[i3+2];
      v3 = thetai3[i3+3];
      tu00_1 = (numtyp)0.0;
@ -1742,7 +1747,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
      tu12 = (numtyp)0.0;
      tu03 = (numtyp)0.0;

-      j = igrid[4*i+1] - nlpts;
+      j = igrid[4*ii+1] - nylo_out - nlpts;
      for (int jb = 0; jb < bsorder; jb++) {
        /*
        u0 = thetai2[m][jb][0];
@ -1750,7 +1755,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
        u2 = thetai2[m][jb][2];
        u3 = thetai2[m][jb][3];
        */
-        int i2 = i*4*bsorder+4*jb;
+        int i2 = ii*4*bsorder+4*jb;
        u0 = thetai2[i2];
        u1 = thetai2[i2+1];
        u2 = thetai2[i2+2];
@ -1763,11 +1768,11 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
        t2_2 = (numtyp)0.0;
        t3 = (numtyp)0.0;

-        int ii = igrid[4*i] - nlpts;
+        int i = igrid[4*ii] - nxlo_out - nlpts;
        for (int ib = 0; ib < bsorder; ib++) {
          /*
-          tq_1 = grid[k][j][ii][0];
-          tq_2 = grid[k][j][ii][1];
+          tq_1 = grid[k][j][i][0];
+          tq_2 = grid[k][j][i][1];
          t0_1 += tq_1*thetai1[m][ib][0];
          t1_1 += tq_1*thetai1[m][ib][1];
          t2_1 += tq_1*thetai1[m][ib][2];
@ -1776,14 +1781,19 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
          t2_2 += tq_2*thetai1[m][ib][2];
          t3 += (tq_1+tq_2)*thetai1[m][ib][3];
          */
-          int i1 = i*4*bsorder+4*ib;
+          int i1 = ii*4*bsorder+4*ib;
          numtyp w0 = thetai1[i1];
          numtyp w1 = thetai1[i1+1];
          numtyp w2 = thetai1[i1+2];
          numtyp w3 = thetai1[i1+3];
-          int gidx = 2*(k*nyzgrid + j*nygrid + ii);
+          int gidx = 2*(k*ngridxy + j*ngridx + i);
          tq_1 = grid[gidx];
          tq_2 = grid[gidx+1];
+/*
+          if (ii == 0 && jb == 0 && kb == 0)
+            printf("ii = 0: igrid %d %d %d; grid %f %f; k = %d j = %d; i = %d; origin = %f %f; gidx = %d\n",
+              igrid[4*ii+0], igrid[4*ii+1], igrid[4*ii+2], tq_1, tq_2, k, j, i, grid[0], grid[1], gidx);
+*/          
          t0_1 += tq_1*w0;
          t1_1 += tq_1*w1;
          t2_1 += tq_1*w2;
@ -1791,7 +1801,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
          t1_2 += tq_2*w1;
          t2_2 += tq_2*w2;
          t3 += (tq_1+tq_2)*w3;
-          ii++;
+          i++;
        }

        tu00_1 += t0_1*u0;
@ -1875,7 +1885,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
    fdip_phi1[m][8] = tuv101_1;
    fdip_phi1[m][9] = tuv011_1;
 */
-    int idx = 10*m;
+    int idx = 10*ii;
    fdip_phi1[idx+0] = (numtyp)0.0;
    fdip_phi1[idx+1] = tuv100_1;
    fdip_phi1[idx+2] = tuv010_1;
@ -1886,7 +1896,18 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
    fdip_phi1[idx+7] = tuv110_1;
    fdip_phi1[idx+8] = tuv101_1;
    fdip_phi1[idx+9] = tuv011_1;
-
+/*
+    fdip_phi2[m][0] = 0.0;
+    fdip_phi2[m][1] = tuv100_2;
+    fdip_phi2[m][2] = tuv010_2;
+    fdip_phi2[m][3] = tuv001_2;
+    fdip_phi2[m][4] = tuv200_2;
+    fdip_phi2[m][5] = tuv020_2;
+    fdip_phi2[m][6] = tuv002_2;
+    fdip_phi2[m][7] = tuv110_2;
+    fdip_phi2[m][8] = tuv101_2;
+    fdip_phi2[m][9] = tuv011_2;
+*/    
    fdip_phi2[idx+0] = (numtyp)0.0;
    fdip_phi2[idx+1] = tuv100_2;
    fdip_phi2[idx+2] = tuv010_2;
@ -1898,7 +1919,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
    fdip_phi2[idx+8] = tuv101_2;
    fdip_phi2[idx+9] = tuv011_2;

-    idx = 20*m;
+    idx = 20*ii;
    fdip_sum_phi[idx+0] = tuv000;
    fdip_sum_phi[idx+1] = tuv100;
    fdip_sum_phi[idx+2] = tuv010;
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -165,12 +165,12 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
 void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
                          double ***host_thetai1, double ***host_thetai2,
                          double ***host_thetai3, int** igrid,
-                          double *host_grid_brick_start, void **host_fdip_phi1,
-                          void **host_fdip_phi2, void **host_fdip_sum_phi,
+                          double *host_grid_brick_start, double ****host_grid_brick,
+                          void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
                          int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
                          int nxlo_out, int nxhi_out, bool& first_iteration) {
   AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2,
-                          host_thetai3, igrid, host_grid_brick_start, host_fdip_phi1,
+                          host_thetai3, igrid, host_grid_brick_start, host_grid_brick, host_fdip_phi1,
                          host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out,
                          nylo_out, nyhi_out,  nxlo_out, nxhi_out, first_iteration);
 }
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -555,7 +555,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
 // Prepare for umutual1() after bspline_fill() is done on host
 //   - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed
 //     host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
-//     host_igrid is allocated with nmax by by 4
+//     host_igrid is allocated with nmax by 4
 //   - transfer extra data from host to device
 // ---------------------------------------------------------------------------

@ -563,8 +563,9 @@ template <class numtyp, class acctyp>
 void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
                                    double ***host_thetai1, double ***host_thetai2,
                                    double ***host_thetai3, int** host_igrid,
-                                    double* host_grid_brick_start, int nzlo_out,
-                                    int nzhi_out, int nylo_out, int nyhi_out,
+                                    double* host_grid_brick_start, double**** host_grid_brick,
+                                    int nzlo_out, int nzhi_out,
+                                    int nylo_out, int nyhi_out,
                                    int nxlo_out, int nxhi_out) {
  
  _bsorder = bsorder;
@ -599,7 +600,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
  }

  UCL_H_Vec<double> dview;
-  dview.alloc(inum_full*bsorder*4,*(this->ucl_device));
+  dview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device));

  // pack host data to device

@ -634,7 +635,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
  ucl_copy(_thetai3,dview,false);

  UCL_H_Vec<int> dview_int;
-  dview_int.alloc(inum_full*4, *(this->ucl_device));
+  dview_int.alloc(_max_thetai_size*4, *(this->ucl_device));
  for (int i = 0; i < inum_full; i++) {
    int idx = i*4;
    dview_int[idx+0] = host_igrid[i][0];
@ -643,6 +644,33 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
  }
  ucl_copy(_igrid, dview_int, false);

+  
+  
+}
+
+// ---------------------------------------------------------------------------
+// fphi_uind = induced potential from grid
+// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
+// ---------------------------------------------------------------------------
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
+                                    double ***host_thetai1, double ***host_thetai2,
+                                    double ***host_thetai3, int** igrid,
+                                    double *host_grid_brick_start, double ****host_grid_brick,
+                                    void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
+                                    int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
+                                    int nxlo_out, int nxhi_out, bool& first_iteration)
+{
+  // allocation/resize and transfers before the first iteration
+  
+  if (first_iteration) {
+    precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
+                      igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out,
+                      nylo_out, nyhi_out, nxlo_out, nxhi_out);
+    if (first_iteration) first_iteration = false;
+  }
+
  // update the cgrid_brick with data from the host
  
  _nzlo_out = nzlo_out;
@ -656,36 +684,27 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
  _ngridx = nxhi_out - nxlo_out + 1;
  _num_grid_points = _ngridx * _ngridy * _ngridz;

-  UCL_H_Vec<double> dview_cgrid;
-  dview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device));
+  UCL_H_Vec<double> hview_cgrid;
+  hview_cgrid.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_WRITE);
+  int n = 0;
+  for (int iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (int iy = nylo_out; iy <= nyhi_out; iy++)
+      for (int ix = nxlo_out; ix <= nxhi_out; ix++) {
+/*        
+        if (iz == nzlo_out && iy == nylo_out && ix == nxlo_out) {
+          printf("origin = %d %d %d: grid = %f %f %f\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1]);
+        }
+        if (iz == -2 && iy == 4 && ix == 8) printf("ixyz = %d %d %d: grid = %f %f %f; n = %d\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1], n);
+*/        
+        hview_cgrid[n] = host_grid_brick[iz][iy][ix][0];
+        hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1];
+        n += 2;
+      }
+  //hview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device));
  _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY);
-  ucl_copy(_cgrid_brick,dview_cgrid,false);
+  ucl_copy(_cgrid_brick,hview_cgrid,false);

-}

-// ---------------------------------------------------------------------------
-// fphi_uind = induced potential from grid
-// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
-// ---------------------------------------------------------------------------
-
-template <class numtyp, class acctyp>
-void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
-                                    double ***host_thetai1, double ***host_thetai2,
-                                    double ***host_thetai3, int** igrid,
-                                    double *host_grid_brick_start, void** host_fdip_phi1,
-                                    void **host_fdip_phi2, void **host_fdip_sum_phi,
-                                    int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
-                                    int nxlo_out, int nxhi_out, bool& first_iteration)
-{
-  // allocation/resize and transfers before the first iteration
-  
-  if (first_iteration) {
-    precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
-                      igrid, host_grid_brick_start, nzlo_out, nzhi_out,
-                      nylo_out, nyhi_out, nxlo_out, nxhi_out);
-    if (first_iteration) first_iteration = false;
-  }
-    
  const int red_blocks = fphi_uind();

  _fdip_phi1.update_host(_max_thetai_size*10);
@ -711,16 +730,16 @@ int BaseAmoebaT::fphi_uind() {

  // Compute the block size and grid size to keep all cores busy
  const int BX=block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/
-                               (BX/_threads_per_atom)));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));

  time_pair.start();
-  int ngridyz = _ngridy * _ngridz;
+  int ngridxy = _ngridx * _ngridy;
  k_fphi_uind.set_size(GX,BX);
  k_fphi_uind.run(&atom->x, &_thetai1, &_thetai2, &_thetai3,
                  &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2,
-                  &_fdip_sum_phi, &_bsorder, &ainum, &ngridyz, &_ngridy,
-                  &_threads_per_atom);
+                  &_fdip_sum_phi, &_bsorder, &ainum, 
+                  &_nzlo_out, &_nzhi_out, &_nylo_out, &_nyhi_out, 
+                  &_nxlo_out, &_nxhi_out, &ngridxy, &_ngridx);
  time_pair.stop();

  return GX;
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -153,8 +153,9 @@ class BaseAmoeba {
  virtual void precompute_induce(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
-                                 double* grid_brick_start, int nzlo_out,
-                                 int nzhi_out, int nylo_out, int nyhi_out,
+                                 double *host_grid_brick_start, double ****host_grid_brick,
+                                 int nzlo_out, int nzhi_out,
+                                 int nylo_out, int nyhi_out,
                                 int nxlo_out, int nxhi_out);

  /// Compute multipole real-space with device neighboring
@ -182,8 +183,8 @@ class BaseAmoeba {
  virtual void compute_fphi_uind(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
-                                 double *host_grid_brick_start, void **host_fdip_phi1,
-                                 void **host_fdip_phi2, void **host_fdip_sum_phi,
+                                 double *host_grid_brick_start, double ****host_grid_brick,
+                                 void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
                                 int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
                                 int nxlo_out, int nxhi_out, bool& first_iteration);

--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -91,7 +91,7 @@ void amoeba_gpu_update_fieldp(void **fieldp_ptr);
 void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
                          double ***host_thetai1, double ***host_thetai2,
                          double ***host_thetai3, int** igrid,
-                          double *host_grid_brick_start, void **host_fdip_phi1,
+                          double *host_grid_brick_start, double ****host_grid_brick, void **host_fdip_phi1,
                          void **host_fdip_phi2, void **host_fdip_sum_phi,
                          int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
                          int nxlo_out, int nxhi_out, bool& first_iteration);
@ -121,7 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
  gpu_multipole_real_ready = true;     // need to be true for precompute()
  gpu_udirect2b_ready = true;
  gpu_umutual1_ready = true;
-  gpu_fphi_uind_ready = false;
+  gpu_fphi_uind_ready = true;
  gpu_umutual2b_ready = true;
  gpu_polar_real_ready = true;         // need to be true for copying data from device back to host

@ -1139,7 +1139,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
  void* fdip_phi2_pinned = nullptr;
  void* fdip_sum_phi_pinned = nullptr;
  amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3,
-                        igrid, ic_kspace->grid_brick_start,
+                        igrid, ic_kspace->grid_brick_start, grid,
                        &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned,
                        ic_kspace->nzlo_out, ic_kspace->nzhi_out,
                        ic_kspace->nylo_out, ic_kspace->nyhi_out,
@ -1150,8 +1150,10 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
  double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned;
  for (int i = 0; i < nlocal; i++) {
    int idx = 10 * i;
-    for (int m = 0; m < 10; m++)
-      fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m];
+    for (int m = 0; m < 10; m++) {
+       fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m];
+    }
+    if (i == 0) printf("gpu fdip phi1 = %f %f %f\n", fdip_phi1[i][0], fdip_phi1[i][1], fdip_phi1[i][2]);      
  }

  double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned;
@ -1159,6 +1161,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
    int idx = 10 * i;
    for (int m = 0; m < 10; m++)
      fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m];
+    if (i == 0) printf("gpu fdip phi2 = %f %f %f\n", fdip_phi2[i][0], fdip_phi2[i][1], fdip_phi2[i][2]);      
  }

  double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
@ -1166,6 +1169,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
    int idx = 20 * i;
    for (int m = 0; m < 20; m++)
      fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m];
+    if (i == 0) printf("gpu fdip sum phi = %f %f %f\n", fdip_sum_phi[i][0], fdip_sum_phi[i][1], fdip_sum_phi[i][2]);            
  }
 }