Moved precompute() out of the terms in amoeba and hippo, to be invoked in the first term in a time step: multipole for amoeba and repulsion for hippo

2022-09-30 16:31:13 -05:00
parent e6d2582642
commit 1d75ca3b20
9 changed files with 140 additions and 57 deletions
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -162,7 +162,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) {
  this->time_pair.start();
  // Build the short neighbor list for the cutoff off2_mpole,
-  //   at this point mpole is the first kernel in a time step
+  //   at this point mpole is the first kernel in a time step for AMOEBA
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -117,7 +117,28 @@ void amoeba_gpu_clear() {
  AMOEBAMF.clear();
 }
-int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
+int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall,
                            double **host_x, int *host_type, int *host_amtype,
                            int *host_amgroup, double **host_rpole,
                            double **host_uind, double **host_uinp, double *host_pval,
                            double *sublo, double *subhi, tagint *tag,
                            int **nspecial, tagint **special,
                            int *nspecial15, tagint **special15,
                            const bool eflag_in, const bool vflag_in,
                            const bool eatom, const bool vatom, int &host_start,
                            int **ilist, int **jnum, const double cpu_time,
                            bool &success, double *host_q, double *boxlo, double *prd) {
  // Free-function wrapper: forwards its arguments to AMOEBAMF.precompute()
  // and returns that call's int** result unchanged. host_start and success
  // are reference parameters handed straight through to precompute().
  //
  // NOTE(review): host_uind, host_uinp and host_pval are accepted by this
  // wrapper but are NOT forwarded; nullptr is passed in their place below.
  // Confirm this is intentional before relying on those arguments.
  return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type,
                             host_amtype, host_amgroup, host_rpole,
                             nullptr, nullptr, nullptr, sublo, subhi, tag,
                             nspecial, special, nspecial15, special15,
                             eflag_in, vflag_in, eatom, vatom,
                             host_start, ilist, jnum, cpu_time,
                             success, host_q, boxlo, prd);
 }
 void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
                           const int nall, double **host_x, int *host_type,
                           int *host_amtype, int *host_amgroup, double **host_rpole,
                           double *sublo, double *subhi, tagint *tag, int **nspecial,
@ -127,7 +148,7 @@ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, const double aewald, const double felec, const double off2,
                           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
-  return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
+  AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
                          host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi,
                          tag, nspecial, special, nspecial15, special15,
                          eflag, vflag, eatom, vatom, host_start, ilist, jnum,
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -226,12 +226,12 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
-                                         const int nall, double **host_x,
+                                        const int nall, double **host_x,
-                                         int *host_type, double *sublo,
+                                        int *host_type, double *sublo,
-                                         double *subhi, tagint *tag,
+                                        double *subhi, tagint *tag,
-                                         int **nspecial, tagint **special,
+                                        int **nspecial, tagint **special,
-                                         int *nspecial15, tagint **special15,
+                                        int *nspecial15, tagint **special15,
-                                         bool &success) {
+                                        bool &success) {
  success=true;
  resize_atom(inum,nall,success);
  resize_local(inum,host_inum,nbor->max_nbors(),success);
@ -450,7 +450,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
 // Reneighbor on GPU if necessary, and then compute multipole real-space
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full,
+void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full,
                                          const int nall, double **host_x,
                                          int *host_type, int *host_amtype,
                                          int *host_amgroup, double **host_rpole, double *host_pval,
@ -469,7 +469,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full,
  // NOTE:
  //   Once all the kernels are ready, precompute() is needed only once
  //     in the first kernel in a time step.
-
+/*
  int** firstneigh = nullptr;
  firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
                          host_amtype, host_amgroup, host_rpole,
@ -478,7 +478,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full,
                          eflag_in, vflag_in, eatom, vatom,
                          host_start, ilist, jnum, cpu_time,
                          success, host_q, boxlo, prd);
-
+*/
  // ------------------- Resize _tep array ------------------------
  if (inum_full>_max_tep_size) {
@ -503,7 +503,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full,
  _tep.update_host(_max_tep_size*4,false);
-  return firstneigh; // nbor->host_jlist.begin()-host_start;
+//  return firstneigh; // nbor->host_jlist.begin()-host_start;
 }
 // ---------------------------------------------------------------------------
@ -782,7 +782,6 @@ int BaseAmoebaT::fphi_mpole() {
  // Compute the block size and grid size to keep all cores busy
  const int BX=block_size();
  //printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D);
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
  time_pair.start();
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -152,7 +152,7 @@ class BaseAmoeba {
                double *charge, double *boxlo, double *prd);
  /// Compute multipole real-space with device neighboring
-  virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
+  virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole, double *host_pval,
                double *sublo, double *subhi, tagint *tag,
--- a/lib/gpu/lal_hippo.cpp
+++ b/lib/gpu/lal_hippo.cpp
@ -172,7 +172,7 @@ double HippoT::host_memory_usage() const {
 // Reneighbor on GPU if necessary, and then compute repulsion
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int** HippoT::compute_repulsion(const int ago, const int inum_full,
+void HippoT::compute_repulsion(const int ago, const int inum_full,
                                const int nall, double **host_x,
                                int *host_type, int *host_amtype,
                                int *host_amgroup, double **host_rpole,
@ -213,7 +213,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full,
  //   We only need to cast the necessary from host to device here
  //     if the neighbor lists are rebuilt and other per-atom arrays
  //     (x, type, amtype, amgroup, rpole) are ready on the device.
-
+/*
  int** firstneigh = nullptr;
  firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type,
                                host_amtype, host_amgroup, host_rpole,
@ -222,7 +222,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full,
                                eflag_in, vflag_in, eatom, vatom,
                                host_start, ilist, jnum, cpu_time,
                                success, host_q, boxlo, prd);
-
+*/
  // ------------------- Resize _tep array ------------------------
  if (inum_full>this->_max_tep_size) {
@ -253,7 +253,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full,
  this->_tep.update_host(this->_max_tep_size*4,false);
-  return firstneigh; // nbor->host_jlist.begin()-host_start;
+//  return firstneigh; // nbor->host_jlist.begin()-host_start;
 }
 // ---------------------------------------------------------------------------
@ -275,7 +275,7 @@ int HippoT::repulsion(const int eflag, const int vflag) {
  this->time_pair.start();
  // Build the short neighbor list for the cutoff off2_disp,
-  //   at this point mpole is the first kernel in a time step
+  //   at this point repulsion is the first kernel in a time step for HIPPO
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
@ -302,7 +302,7 @@ int HippoT::repulsion(const int eflag, const int vflag) {
 // Reneighbor on GPU if necessary, and then compute dispersion real-space
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int** HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup,
+void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                      double **host_rpole, const double aewald,
                                      const double off2_disp) {
@ -324,7 +324,7 @@ int** HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup,
  this->hd_balancer.stop_timer();
-  return nullptr; // nbor->host_jlist.begin()-host_start;
+ // return nullptr; // nbor->host_jlist.begin()-host_start;
 }
 // ---------------------------------------------------------------------------
@ -372,7 +372,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) {
 // Reneighbor on GPU if necessary, and then compute multipole real-space
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int** HippoT::compute_multipole_real(const int ago, const int inum_full,
+void HippoT::compute_multipole_real(const int ago, const int inum_full,
                                     const int nall, double **host_x,
                                     int *host_type, int *host_amtype,
                                     int *host_amgroup, double **host_rpole,
@ -417,7 +417,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full,
  this->_tep.update_host(this->_max_tep_size*4,false);
-  return nullptr; // nbor->host_jlist.begin()-host_start;
+  //return nullptr; // nbor->host_jlist.begin()-host_start;
 }
 // ---------------------------------------------------------------------------
--- a/lib/gpu/lal_hippo.h
+++ b/lib/gpu/lal_hippo.h
@ -55,7 +55,7 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
           const double polar_dscale, const double polar_uscale);
  /// Compute repulsion with device neighboring
-  int** compute_repulsion(const int ago, const int inum_full,
+  virtual void compute_repulsion(const int ago, const int inum_full,
                          const int nall, double **host_x,
                          int *host_type, int *host_amtype,
                          int *host_amgroup, double **host_rpole,
@ -72,12 +72,12 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
                          double c3, double c4, double c5,void** tep_ptr);
  /// Compute dispersion real-space with device neighboring
-  int** compute_dispersion_real(int *host_amtype,  int *host_amgroup,
+  virtual void compute_dispersion_real(int *host_amtype,  int *host_amgroup,
                                double **host_rpole, const double aewald,
                                const double off2_disp);
  /// Compute multipole real-space with device neighboring
-  virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
+  virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole, double *host_pval,
                double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special,
--- a/lib/gpu/lal_hippo_ext.cpp
+++ b/lib/gpu/lal_hippo_ext.cpp
@ -120,7 +120,27 @@ void hippo_gpu_clear() {
  HIPPOMF.clear();
 }
-int** hippo_gpu_compute_repulsion(const int ago, const int inum_full,
+int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall,
                            double **host_x, int *host_type, int *host_amtype,
                            int *host_amgroup, double **host_rpole,
                            double **host_uind, double **host_uinp, double *host_pval,
                            double *sublo, double *subhi, tagint *tag,
                            int **nspecial, tagint **special,
                            int *nspecial15, tagint **special15,
                            const bool eflag_in, const bool vflag_in,
                            const bool eatom, const bool vatom, int &host_start,
                            int **ilist, int **jnum, const double cpu_time,
                            bool &success, double *host_q, double *boxlo, double *prd) {
  // Free-function wrapper: forwards its arguments to HIPPOMF.precompute()
  // and returns that call's int** result unchanged. host_start and success
  // are reference parameters handed straight through to precompute().
  //
  // NOTE(review): host_uind, host_uinp and host_pval are accepted by this
  // wrapper but are NOT forwarded; nullptr is passed in their place below.
  // Confirm this is intentional before relying on those arguments.
  return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type,
                            host_amtype, host_amgroup, host_rpole,
                            nullptr, nullptr, nullptr, sublo, subhi, tag,
                            nspecial, special, nspecial15, special15,
                            eflag_in, vflag_in, eatom, vatom,
                            host_start, ilist, jnum, cpu_time,
                            success, host_q, boxlo, prd);
 }
 void hippo_gpu_compute_repulsion(const int ago, const int inum_full,
                           const int nall, double **host_x, int *host_type,
                           int *host_amtype, int *host_amgroup, double **host_rpole,
                           double *sublo, double *subhi, tagint *tag, int **nspecial,
@ -132,7 +152,7 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full,
                           double *host_q, double *boxlo, double *prd,
                           double cut2, double c0, double c1, double c2,
                           double c3, double c4, double c5, void **tep_ptr) {
-  return HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type,
+  HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type,
                          host_amtype, host_amgroup, host_rpole, sublo, subhi,
                          tag, nspecial, special, nspecial15, special15,
                          eflag, vflag, eatom, vatom, host_start, ilist, jnum,
@ -147,7 +167,7 @@ void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                         aewald, off2);
 }
-int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full,
+void hippo_gpu_compute_multipole_real(const int ago, const int inum_full,
                           const int nall, double **host_x, int *host_type,
                           int *host_amtype, int *host_amgroup, double **host_rpole,
                           double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial,
@ -157,7 +177,7 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, const double aewald, const double felec, const double off2,
                           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
-  return HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
+  HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
                          host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi,
                          tag, nspecial, special, nspecial15, special15,
                          eflag, vflag, eatom, vatom, host_start, ilist, jnum,
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -69,7 +69,19 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas
                    const double polar_dscale, const double polar_uscale, int& tq_size);
 void amoeba_gpu_clear();
-int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
+int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall,
                            double **host_x, int *host_type, int *host_amtype,
                            int *host_amgroup, double **host_rpole,
                            double **host_uind, double **host_uinp, double *host_pval,
                            double *sublo, double *subhi, tagint *tag,
                            int **nspecial, tagint **special,
                            int *nspecial15, tagint **special15,
                            const bool eflag_in, const bool vflag_in,
                            const bool eatom, const bool vatom, int &host_start,
                            int **ilist, int **jnum, const double cpu_time,
                            bool &success, double *host_q, double *boxlo, double *prd);
 void amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
              double **host_rpole, double *sublo, double *subhi, tagint *tag,
              int **nspecial, tagint **special, int* nspecial15, tagint** special15,
@ -240,6 +252,18 @@ void PairAmoebaGPU::multipole_real()
  }
  inum = atom->nlocal;
  firstneigh = amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x,
                                     atom->type, amtype, amgroup, rpole,
                                     nullptr, nullptr, nullptr,
                                     sublo, subhi, atom->tag,
                                     atom->nspecial, atom->special,
                                     atom->nspecial15, atom->special15,
                                     eflag, vflag, eflag_atom, vflag_atom,
                                     host_start, &ilist, &numneigh, cpu_time,
                                     success, atom->q, domain->boxlo, domain->prd);
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");
  // select the correct cutoff for the term
  if (use_ewald) choose(MPOLE_LONG);
@ -249,18 +273,17 @@ void PairAmoebaGPU::multipole_real()
  double felec = electric / am_dielectric;
-  firstneigh = amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x,
+  amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x,
-                                                 atom->type, amtype, amgroup, rpole,
+                                    atom->type, amtype, amgroup, rpole,
-                                                 sublo, subhi, atom->tag,
+                                    sublo, subhi, atom->tag,
-                                                 atom->nspecial, atom->special,
+                                    atom->nspecial, atom->special,
-                                                 atom->nspecial15, atom->special15,
+                                    atom->nspecial15, atom->special15,
-                                                 eflag, vflag, eflag_atom, vflag_atom,
+                                    eflag, vflag, eflag_atom, vflag_atom,
-                                                 host_start, &ilist, &numneigh, cpu_time,
+                                    host_start, &ilist, &numneigh, cpu_time,
-                                                 success, aewald, felec, off2, atom->q,
+                                    success, aewald, felec, off2, atom->q,
-                                                 domain->boxlo, domain->prd, &tq_pinned);
+                                    domain->boxlo, domain->prd, &tq_pinned);
-  if (!success)
+  
    error->one(FLERR,"Insufficient memory on accelerator");
  // reference to the tep array from GPU lib
--- a/src/GPU/pair_hippo_gpu.cpp
+++ b/src/GPU/pair_hippo_gpu.cpp
@ -70,7 +70,19 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass
                    const double polar_dscale, const double polar_uscale, int& tq_size);
 void hippo_gpu_clear();
-int** hippo_gpu_compute_repulsion(const int ago, const int inum_full,
+int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall,
                            double **host_x, int *host_type, int *host_amtype,
                            int *host_amgroup, double **host_rpole,
                            double **host_uind, double **host_uinp, double *host_pval,
                            double *sublo, double *subhi, tagint *tag,
                            int **nspecial, tagint **special,
                            int *nspecial15, tagint **special15,
                            const bool eflag_in, const bool vflag_in,
                            const bool eatom, const bool vatom, int &host_start,
                            int **ilist, int **jnum, const double cpu_time,
                            bool &success, double *host_q, double *boxlo, double *prd);
 void hippo_gpu_compute_repulsion(const int ago, const int inum_full,
                           const int nall, double **host_x, int *host_type,
                           int *host_amtype, int *host_amgroup, double **host_rpole,
                           double *sublo, double *subhi, tagint *tag, int **nspecial,
@ -86,7 +98,7 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full,
 void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                        const double aewald, const double off2);
-int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
+void hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
              double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag,
              int **nspecial, tagint **special, int* nspecial15, tagint** special15,
@ -258,22 +270,30 @@ void PairHippoGPU::repulsion()
  }
  inum = atom->nlocal;
  firstneigh = hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x,
                                     atom->type, amtype, amgroup, rpole,
                                     nullptr, nullptr, nullptr,
                                     sublo, subhi, atom->tag,
                                     atom->nspecial, atom->special,
                                     atom->nspecial15, atom->special15,
                                     eflag, vflag, eflag_atom, vflag_atom,
                                     host_start, &ilist, &numneigh, cpu_time,
                                     success, atom->q, domain->boxlo, domain->prd);
  // select the correct cutoff for the term
  choose(REPULSE);
-  // set the energy unit conversion factor for multipolar real-space calculation
+  hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x,
-
+                              atom->type, amtype, amgroup, rpole,
-  firstneigh = hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x,
+                              sublo, subhi, atom->tag,
-                                           atom->type, amtype, amgroup, rpole,
+                              atom->nspecial, atom->special,
-                                           sublo, subhi, atom->tag,
+                              atom->nspecial15, atom->special15,
-                                           atom->nspecial, atom->special,
+                              eflag, vflag, eflag_atom, vflag_atom,
-                                           atom->nspecial15, atom->special15,
+                              host_start, &ilist, &numneigh, cpu_time,
-                                           eflag, vflag, eflag_atom, vflag_atom,
+                              success, aewald, off2, atom->q,
-                                           host_start, &ilist, &numneigh, cpu_time,
+                              domain->boxlo, domain->prd, cut2,
-                                           success, aewald, off2, atom->q,
+                              c0, c1, c2, c3, c4, c5, &tq_pinned);
                                           domain->boxlo, domain->prd, cut2,
                                           c0, c1, c2, c3, c4, c5, &tq_pinned);
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");