Reverted the binsize function call from the GPU package in Atom, instead added atom_modify sort with a binsize to ensure matching virial values, enabled the udirect2b kernel, need more work to override dfield0c, and induce() to bypass reverse_comm() for field and fieldp (line amoeba_induce.cpp:111-112)

2021-09-03 13:43:22 -05:00
parent 745c7089f0
commit 7d69a870a4
7 changed files with 115 additions and 68 deletions
--- a/examples/amoeba/in.ubiquitin
+++ b/examples/amoeba/in.ubiquitin
@ -4,7 +4,7 @@ units           real
 boundary        p p p

 atom_style amoeba
-
+#atom_modify sort 1000 7.0
 bond_style      class2
 angle_style     amoeba
 dihedral_style  none
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -57,7 +57,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
                  const double polar_uscale) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
-                            cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar");
+                            cell_size,gpu_split,_screen,amoeba,
+                            "k_amoeba_polar", "k_amoeba_udirect2b");
  if (success!=0)
    return success;

@ -164,15 +165,14 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
-/*
-  this->k_polar.set_size(GX,BX);
-  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
-                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                    &this->ans->force, &this->ans->engv, &this->_tep,
-                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
-                    &this->_threads_per_atom,
-                    &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
-*/
+
+  this->k_udirect2b.set_size(GX,BX);
+  this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
+                        &this->_threads_per_atom, &_aewald, &_off2,
+                        &_polar_dscale, &_polar_uscale);
+
  this->time_pair.stop();
  return GX;
 }
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -91,8 +91,8 @@ _texture( q_tex,int2);
    tep[i]=t;                                                               \
  }

-#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset,     \
-                          i, field, fieldp)                                 \
+#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i,  \
+                              fieldp)                                       \
  if (t_per_atom>1) {                                                       \
    red_acc[0][tid]=_fieldp[0];                                             \
    red_acc[1][tid]=_fieldp[1];                                             \
@ -118,8 +118,8 @@ _texture( q_tex,int2);
    numtyp4 f, fp;                                                          \
    f.x  = _fieldp[0]; f.y  = _fieldp[0]; f.z  = _fieldp[2];                \
    fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5];                \
-    field[i] = f;                                                           \
-    fieldp[i] = fp;                                                         \
+    fieldp[ii] = f;                                                         \
+    fieldp[ii+inum] = fp;                                                   \
  }

 #else
@ -152,8 +152,8 @@ _texture( q_tex,int2);
    tep[i]=t;                                                               \
  }

-#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset,     \
-                          i, field, fieldp)                                 \
+#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i,  \
+                             fieldp)                                        \
  if (t_per_atom>1) {                                                       \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom);                   \
@ -168,8 +168,8 @@ _texture( q_tex,int2);
    numtyp4 f, fp;                                                          \
    f.x  = _fieldp[0]; f.y  = _fieldp[0]; f.z  = _fieldp[2];                \
    fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5];                \
-    field[i] = f;                                                           \
-    fieldp[i] = fp;                                                         \
+    fieldp[ii] = f;                                                         \
+    fieldp[ii+inum] = fp;                                                   \
  }

 #endif
@ -177,6 +177,11 @@ _texture( q_tex,int2);
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 #define MY_PIS (acctyp)1.77245385090551602729

+/* ----------------------------------------------------------------------
+   polar_real = real-space portion of induced dipole polarization
+   adapted from Tinker epreal1d() routine
+------------------------------------------------------------------------- */
+
 __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict extra,
                            const __global numtyp4 *restrict damping,
@ -468,7 +473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0];
      term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr;
      numtyp tixx = ci*term3 + dix*term4 + dir*term5 +
-        (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6;
+        (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
      numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 +
        (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;

@ -684,19 +689,23 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
     offset,eflag,vflag,ans,engv);
 }

+/* ----------------------------------------------------------------------
+  udirect2b = Ewald real direct field via list
+  udirect2b computes the real space contribution of the permanent
+   atomic multipole moments to the field via a neighbor list
+------------------------------------------------------------------------- */
+
 __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
-                            const __global numtyp *restrict extra,
-                            const __global numtyp4 *restrict damping,
-                            const __global numtyp4 *restrict sp_polar,
-                            const __global int *dev_nbor,
-                            const __global int *dev_packed,
-                            __global numtyp4 *restrict field,
-                            __global numtyp4 *restrict fieldp,
-                            const int eflag, const int vflag, const int inum,
-                            const int nall, const int nbor_pitch, const int t_per_atom,
-                            const numtyp aewald, const numtyp felec,
-                            const numtyp off2, const numtyp polar_dscale,
-                            const numtyp polar_uscale)
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 __global numtyp4 *restrict fieldp,
+                                 const int inum,  const int nall,
+                                 const int nbor_pitch, const int t_per_atom,
+                                 const numtyp aewald, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
 {
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);
@ -771,7 +780,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
      numtyp r = ucl_sqrt(r2);
      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
-      numtyp rr1 = felec * rinv;
+      numtyp rr1 = rinv;
      numtyp rr3 = rr1 * r2inv;
      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
@ -888,7 +897,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,

  // accumulate field and fieldp
  
-  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,field,fieldp);
+  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
 }

 /* ----------------------------------------------------------------------
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() {
  delete ans;
  delete nbor;
  k_polar.clear();
+  k_udirect2b.clear();
  k_special15.clear();
  if (pair_program) delete pair_program;
 }
@ -53,7 +54,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
                             const int maxspecial15,
                             const double cell_size, const double gpu_split,
                             FILE *_screen, const void *pair_program,
-                             const char *k_name) {
+                             const char *k_name_polar,
+                             const char *k_name_udirect2b) {
  screen=_screen;

  int gpu_nbor=0;
@ -85,7 +87,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,

  _block_size=device->pair_block_size();
  _block_bio_size=device->block_bio_pair();
-  compile_kernels(*ucl_device,pair_program,k_name);
+  compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b);

  if (_threads_per_atom>1 && gpu_nbor==0) {
    nbor->packing(true);
@ -118,9 +120,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
  if (ef_nall==0)
    ef_nall=2000;

-  _max_alloc_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  _fieldp.alloc(_max_alloc_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
-  _tep.alloc(_max_alloc_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
+  _max_fieldp_size = _max_tep_size;
+  _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
  dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
  dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
  dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
@ -224,7 +227,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
+void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall,
                          double **host_x, int *host_type, int *host_amtype,
                          int *host_amgroup, double **host_rpole,
                          double **host_uind, double **host_uinp,
@ -252,9 +255,9 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,

  // ------------------- Resize _tep array ------------------------

-  if (nall>_max_alloc_size) {
-    _max_alloc_size=static_cast<int>(static_cast<double>(nall)*1.10);
-    _tep.resize(_max_alloc_size*4);
+  if (nall>_max_tep_size) {
+    _max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _tep.resize(_max_tep_size*4);

    dev_nspecial15.clear();
    dev_special15.clear();
@ -302,6 +305,10 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
  device->add_ans_object(ans);
  hd_balancer.stop_timer();
+
+  // copy tep from device to host
+
+  _tep.update_host(_max_tep_size*4,false);
 }

 // ---------------------------------------------------------------------------
@ -338,9 +345,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const

  // ------------------- Resize _tep array ------------------------

-  if (nall>_max_alloc_size) {
-    _max_alloc_size=static_cast<int>(static_cast<double>(nall)*1.10);
-    _tep.resize(_max_alloc_size*4);
+  if (nall>_max_tep_size) {
+    _max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _tep.resize(_max_tep_size*4);

    dev_nspecial15.clear();
    dev_special15.clear();
@ -397,9 +404,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const

  // copy tep from device to host

-  _tep.update_host(_max_alloc_size*4,false);
+  _tep.update_host(_max_tep_size*4,false);
 /*  
-  printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_alloc_size);
+  printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
  for (int i = 0; i < 10; i++) {
    numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
    printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
@ -442,9 +449,9 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i

  // ------------------- Resize _fieldp array ------------------------

-  if (nall>_max_alloc_size) {
-    _max_alloc_size=static_cast<int>(static_cast<double>(nall)*1.10);
-    _fieldp.resize(_max_alloc_size*8);
+  if (nall>_max_fieldp_size) {
+    _max_fieldp_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _fieldp.resize(_max_fieldp_size*8);

    dev_nspecial15.clear();
    dev_special15.clear();
@ -492,13 +499,18 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
  *jnum=nbor->host_acc.begin();

  const int red_blocks=udirect2b(eflag,vflag);
-  //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
-  //device->add_ans_object(ans);
  hd_balancer.stop_timer();

-  // copy field and fieldp from device to host
+  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)

-  //_fieldp.update_host(_max_field_size*8,false);
+  _fieldp.update_host(_max_fieldp_size*8,false);
+/*  
+  printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", this->_field.cols(), _max_fieldp_size);
+  for (int i = 0; i < 10; i++) {
+    numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]);
+    printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z);
+  }
+*/  

  return nbor->host_jlist.begin()-host_start;
 }
@ -566,7 +578,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,

 template <class numtyp, class acctyp>
 void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
-                                  const char *kname) {
+                                  const char *kname_polar,
+                                  const char *kname_udirect2b) {
  if (_compiled)
    return;

@ -575,7 +588,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
  pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
  
-  k_polar.set_function(*pair_program,kname);
+  k_polar.set_function(*pair_program,kname_polar);
+  k_udirect2b.set_function(*pair_program,kname_udirect2b);
  k_special15.set_function(*pair_program,"k_special15");
  pos_tex.get_texture(*pair_program,"pos_tex");
  q_tex.get_texture(*pair_program,"q_tex");
@ -593,6 +607,10 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,

 }

+// ---------------------------------------------------------------------------
+//  Specify 1-5 neighbors from the current neighbor list
+// ---------------------------------------------------------------------------
+
 template <class numtyp, class acctyp>
 int BaseAmoebaT::add_onefive_neighbors() {
  // Compute the block size and grid size to keep all cores busy
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -53,8 +53,8 @@ class BaseAmoeba {
    * - -5 Double precision is not supported on card **/
  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                  const int maxspecial, const int maxspecial15, const double cell_size,
-                  const double gpu_split, FILE *screen,
-                  const void *pair_program, const char *k_name);
+                  const double gpu_split, FILE *screen, const void *pair_program,
+                  const char *kname_polar, const char *kname_udirect2b);

  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead(const int add_kernels=0);
@ -129,7 +129,7 @@ class BaseAmoeba {
                       bool &success);

  /// Compute polar real-space with host neighboring (not active for now)
-  void compute(const int f_ago, const int inum_full, const int nall,
+  void compute_polar_real(const int f_ago, const int inum_full, const int nall,
               double **host_x, int *host_type, int *host_amtype,
               int *host_amgroup, double **host_rpole, double **host_uind,
               double **host_uinp, int *ilist, int *numj,
@ -190,8 +190,8 @@ class BaseAmoeba {
    double** uind, double** uinp);

  /// Per-atom arrays
-  UCL_Vector<numtyp,numtyp> _tep,_fieldp;
-  int _max_alloc_size;
+  UCL_Vector<numtyp,numtyp> _tep, _fieldp;
+  int _max_tep_size, _max_fieldp_size;

  // ------------------------ FORCE/ENERGY DATA -----------------------

@ -210,7 +210,7 @@ class BaseAmoeba {

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
-  UCL_Kernel k_polar,k_special15;
+  UCL_Kernel k_polar, k_udirect2b, k_special15;
  inline int block_size() { return _block_size; }
  inline void set_kernel(const int eflag, const int vflag) {}

@ -226,7 +226,8 @@ class BaseAmoeba {
  double _gpu_overhead, _driver_overhead;
  UCL_D_Vec<int> *_nbor_data;

-  void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
+  void compile_kernels(UCL_Device &dev, const void *pair_string,
+     const char *kname_polar, const char *kname_udirect2b);

  virtual int polar_real(const int eflag, const int vflag) = 0;
  virtual int udirect2b(const int eflag, const int vflag) = 0;
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -298,7 +298,7 @@ void PairAmoebaGPU::init_style()

 void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
 {
-  bool gpu_udirect2b_ready = false;
+  bool gpu_udirect2b_ready = true;
  if (!gpu_udirect2b_ready) {
    PairAmoeba::udirect2b(field, fieldp);
    return;
@ -335,6 +335,27 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

+  // get field and fieldp values from the GPU lib
+
+  int nlocal = atom->nlocal;
+  double *field_ptr = (double *)fieldp_pinned;
+
+  for (int i = 0; i < nlocal; i++) {
+    int idx = 4*i;
+    field[i][0] = field_ptr[idx];
+    field[i][1] = field_ptr[idx+1];
+    field[i][2] = field_ptr[idx+2]; 
+  }
+
+  double* fieldp_ptr = (double *)fieldp_pinned;
+  fieldp_ptr += 4*inum;
+  for (int i = 0; i < nlocal; i++) {
+    int idx = 4*i;
+    fieldp[i][0] = fieldp_ptr[idx];
+    fieldp[i][1] = fieldp_ptr[idx+1];
+    fieldp[i][2] = fieldp_ptr[idx+2];
+  }
+
  // rebuild dipole-dipole pair list and store pairwise dipole matrices
  // done one atom at a time in real-space double loop over atoms & neighs

--- a/src/atom.cpp
+++ b/src/atom.cpp
@ -2274,7 +2274,6 @@ void Atom::setup_sort_bins()
 #ifdef LMP_GPU
  if (userbinsize == 0.0) {
    int ifix = modify->find_fix("package_gpu");
-/*
    if (ifix >= 0) {
      const double subx = domain->subhi[0] - domain->sublo[0];
      const double suby = domain->subhi[1] - domain->sublo[1];
@ -2298,7 +2297,6 @@ void Atom::setup_sort_bins()
      bininvy = bininv;
      bininvz = bininv;
    }
-*/    
  }
 #endif