Both udirect2b and polar_real are working correctly on the GPU

2021-09-09 00:57:21 -05:00
parent 8c5a116d30
commit 6f6fd0999c
5 changed files with 121 additions and 89 deletions
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@ -284,7 +284,11 @@ class Atom {

  /// Signal that we need to transfer atom data for next timestep
  inline void data_unavail()
-    { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
+    { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; }
+  
+  /// Signal that we need to transfer atom extra data for next kernel call
+  inline void extra_data_unavail()
+    { _extra_avail=false; }

  typedef struct { double x,y,z; } vec3d;
  typedef struct { numtyp x,y,z,w; } vec4d_t;
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -317,8 +317,10 @@ void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const
 // ---------------------------------------------------------------------------
 // Prepare for multiple kernel calls in a time step:
 //   - reallocate per-atom arrays, if needed
+//   - transfer extra data from host to device
 //   - build the full neighbor lists for use by different kernels
 // ---------------------------------------------------------------------------
+
 template <class numtyp, class acctyp>
 int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
@ -402,75 +404,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
  return nbor->host_jlist.begin()-host_start;
 }

-// ---------------------------------------------------------------------------
-// Reneighbor on GPU if necessary, and then compute polar real-space
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall,
-                           double **host_x, int *host_type, int *host_amtype,
-                           int *host_amgroup, double **host_rpole,
-                           double **host_uind, double **host_uinp,
-                           double *sublo, double *subhi, tagint *tag,
-                           int **nspecial, tagint **special,
-                           int *nspecial15, tagint **special15,
-                           const bool eflag_in, const bool vflag_in,
-                           const bool eatom, const bool vatom, int &host_start,
-                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, double *host_q, double *boxlo,
-                           double *prd, void **tep_ptr) {
-  acc_timers();
-  int eflag, vflag;
-  if (eatom) eflag=2;
-  else if (eflag_in) eflag=1;
-  else eflag=0;
-  if (vatom) vflag=2;
-  else if (vflag_in) vflag=1;
-  else vflag=0;
-
-  #ifdef LAL_NO_BLOCK_REDUCE
-  if (eflag) eflag=2;
-  if (vflag) vflag=2;
-  #endif
-
-  set_kernel(eflag,vflag);
-
-  // reallocate per-atom arrays and build the neighbor lists if needed
-
-  int** firstneigh = nullptr;
-  firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
-                          host_amtype, host_amgroup, host_rpole,
-                          host_uind, host_uinp, sublo, subhi, tag,
-                          nspecial, special, nspecial15, special15,
-                          eflag_in, vflag_in, eatom, vatom,
-                          host_start, ilist, jnum, cpu_time,
-                          success, host_q, boxlo, prd);
-
-  // ------------------- Resize _tep array ------------------------
-
-  if (nall>_max_tep_size) {
-    _max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
-    _tep.resize(_max_tep_size*4);
-  }
-  *tep_ptr=_tep.host.begin();
-
-  const int red_blocks=polar_real(eflag,vflag);
-  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
-  device->add_ans_object(ans);
-  hd_balancer.stop_timer();
-
-  // copy tep from device to host
-
-  _tep.update_host(_max_tep_size*4,false);
-/*  
-  printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
-  for (int i = 0; i < 10; i++) {
-    numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
-    printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
-  }
-*/  
-  return firstneigh; // nbor->host_jlist.begin()-host_start;
-}
-
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU if necessary, and then compute the direct real space part
 //    of the permanent field
@ -504,7 +437,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i

  set_kernel(eflag,vflag);

-  // reallocate per-atom arrays and build the neighbor lists if needed
+  // reallocate per-atom arrays, transfer data from the host
+  //   and build the neighbor lists if needed

  int** firstneigh = nullptr;
  firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
@ -539,6 +473,85 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
  return firstneigh; //nbor->host_jlist.begin()-host_start;
 }

+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary, and then compute polar real-space
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall,
+                           double **host_x, int *host_type, int *host_amtype,
+                           int *host_amgroup, double **host_rpole,
+                           double **host_uind, double **host_uinp,
+                           double *sublo, double *subhi, tagint *tag,
+                           int **nspecial, tagint **special,
+                           int *nspecial15, tagint **special15,
+                           const bool eflag_in, const bool vflag_in,
+                           const bool eatom, const bool vatom, int &host_start,
+                           int **ilist, int **jnum, const double cpu_time,
+                           bool &success, double *host_q, double *boxlo,
+                           double *prd, void **tep_ptr) {
+  acc_timers();
+  int eflag, vflag;
+  if (eatom) eflag=2;
+  else if (eflag_in) eflag=1;
+  else eflag=0;
+  if (vatom) vflag=2;
+  else if (vflag_in) vflag=1;
+  else vflag=0;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflag) eflag=2;
+  if (vflag) vflag=2;
+  #endif
+
+  set_kernel(eflag,vflag);
+
+  // reallocate per-atom arrays, transfer data from the host
+  //   and build the neighbor lists if needed
+  // NOTE: 
+  //   For now we invoke precompute() again here,
+  //     to be able to turn on/off the udirect2b kernel (which comes before this)
+  //   Once all the kernels are ready, precompute() is needed only once
+  //     in the first kernel in a time step.
+  //   We only need to cast uind and uinp from host to device here
+  //     if the neighbor lists are rebuilt and other per-atom arrays
+  //     (x, type, amtype, amgroup, rpole) are ready on the device.
+
+  int** firstneigh = nullptr;
+  firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
+                          host_amtype, host_amgroup, host_rpole,
+                          host_uind, host_uinp, sublo, subhi, tag,
+                          nspecial, special, nspecial15, special15,
+                          eflag_in, vflag_in, eatom, vatom,
+                          host_start, ilist, jnum, cpu_time,
+                          success, host_q, boxlo, prd);
+
+  // ------------------- Resize _tep array ------------------------
+
+  if (nall>_max_tep_size) {
+    _max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _tep.resize(_max_tep_size*4);
+  }
+  *tep_ptr=_tep.host.begin();
+
+  const int red_blocks=polar_real(eflag,vflag);
+  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+
+  // copy tep from device to host
+
+  _tep.update_host(_max_tep_size*4,false);
+/*
+  printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
+  for (int i = 0; i < 10; i++) {
+    numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
+    printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
+  }
+*/  
+  return firstneigh; // nbor->host_jlist.begin()-host_start;
+}
+
+
 template <class numtyp, class acctyp>
 double BaseAmoebaT::host_memory_usage_atomic() const {
  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
@ -548,6 +561,11 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
 template <class numtyp, class acctyp>
 void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
    double** uind, double** uinp) {
+
+  // signal that we need to transfer extra data from the host
+
+  atom->extra_data_unavail();
+
  int _nall=atom->nall();
  numtyp *pextra=reinterpret_cast<numtyp*>(&(atom->extra[0]));

--- a/src/AMOEBA/amoeba_induce.cpp
+++ b/src/AMOEBA/amoeba_induce.cpp
@ -597,15 +597,6 @@ void PairAmoeba::induce()
  memory->destroy(usum);
  memory->destroy(usump);

-  if (comm->me == 0) {
-    printf("CPU: iter = %d\n", iter);
-    for (i = 0; i < 20; i++) {
-      printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n",
-        i, uind[i][0], uind[i][1], uind[i][2],
-        uinp[i][0], uinp[i][1], uinp[i][2]); 
-    }    
-  }
-
  // update the lists of previous induced dipole values
  // shift previous m values up to m+1, add new values at m = 0
  // only when preconditioner is used
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -98,6 +98,10 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
  suffix_flag |= Suffix::GPU;
  fieldp_pinned = nullptr;
  tep_pinned = nullptr;
+
+  gpu_udirect2b_ready = true;
+  gpu_polar_real_ready = true; 
+
  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }

@ -114,7 +118,6 @@ PairAmoebaGPU::~PairAmoebaGPU()

 void PairAmoebaGPU::polar_real()
 {
-  bool gpu_polar_real_ready = true;
  if (!gpu_polar_real_ready) {
    PairAmoeba::polar_real();
    return;
@ -139,7 +142,16 @@ void PairAmoebaGPU::polar_real()
    domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
  }
  inum = atom->nlocal;
-
+/*
+  if (comm->me == 0) {
+    printf("GPU: polar real\n");
+    for (int i = 0; i < 20; i++) {
+      printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n",
+        i, uind[i][0], uind[i][1], uind[i][2],
+        uinp[i][0], uinp[i][1], uinp[i][2]); 
+    }    
+  }
+*/
  firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
                                        atom->type, amtype, amgroup,
                                        rpole, uind, uinp, sublo, subhi,
@ -200,6 +212,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr)
    tep[0] = tep_ptr[4*i];
    tep[1] = tep_ptr[4*i+1];
    tep[2] = tep_ptr[4*i+2];
+
    torque2force(i,tep,fix,fiy,fiz,fpolar);

    iz = zaxis2local[i];
@ -365,10 +378,14 @@ void PairAmoebaGPU::induce()
  // get the electrostatic field due to permanent multipoles
  
  dfield0c(field,fieldp);
-/*
-  crstyle = FIELD;
-  comm->reverse_comm_pair(this);
-*/
+
+  // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only
+
+  if (!gpu_udirect2b_ready) {
+    crstyle = FIELD;
+    comm->reverse_comm_pair(this);
+  }
+
  // set induced dipoles to polarizability times direct field

  for (i = 0; i < nlocal; i++) {
@ -726,7 +743,7 @@ void PairAmoebaGPU::induce()
  memory->destroy(udir);
  memory->destroy(usum);
  memory->destroy(usump);
-
+/*
  if (comm->me == 0) {
    printf("GPU: iter = %d\n", iter);
    for (i = 0; i < 20; i++) {
@ -735,7 +752,7 @@ void PairAmoebaGPU::induce()
        uinp[i][0], uinp[i][1], uinp[i][2]); 
    }    
  }
-
+*/
  // update the lists of previous induced dipole values
  // shift previous m values up to m+1, add new values at m = 0
  // only when preconditioner is used
@ -825,7 +842,6 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp)

 void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
 {
-  bool gpu_udirect2b_ready = true;
  if (!gpu_udirect2b_ready) {
    PairAmoeba::udirect2b(field, fieldp);
    return;
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@ -45,6 +45,9 @@ class PairAmoebaGPU : public PairAmoeba {
  void *fieldp_pinned;
  bool tep_single;

+  bool gpu_polar_real_ready;
+  bool gpu_udirect2b_ready;
+
  void udirect2b_cpu();

  template<class numtyp>