Added timing for the induced dipole spreading part; computed the block size to ensure that all the CUs are occupied by the fphi_uind and fphi_mpole kernels

2022-10-06 15:03:58 -05:00
parent 009ed36301
commit 6b9e83fe20
10 changed files with 106 additions and 87 deletions
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -278,9 +278,14 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
+  const int max_cus = this->device->max_cus();
+  int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  while (GX < max_cus) {
+    BX /= 2;
+    GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  }
+
  this->time_pair.start();

  // Build the short neighbor list if not done yet
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -155,7 +155,14 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
  dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
  dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);

+  #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
  fft_plan_created = false;
+  #endif
+
+  #ifdef ASYNC_DEVICE_COPY
+  _end_command_queue=ucl_device->num_queues();
+  ucl_device->push_command_queue();
+  #endif

  return success;
 }
@ -507,6 +514,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double
 
  *fieldp_ptr=_fieldp.host.begin();

+  // specify the correct cutoff and alpha values
  _off2_polar = off2_polar;
  _aewald = aewald;
  const int red_blocks=udirect2b(_eflag,_vflag);
@ -525,18 +533,20 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
                                     double **host_uind, double **host_uinp, double *host_pval,
                                     const double aewald, const double off2_polar,
                                     void** fieldp_ptr) {
-  // all the necessary data arrays are already copied from host to device
-
-  //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
+  // only copy the necessary data arrays that are updated over the iterations
+  // use nullptr for the other arrays that are already copied from host to device
  cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr);
  atom->add_extra_data();                          

+  // set the correct cutoff and alpha
  _off2_polar = off2_polar;
  _aewald = aewald;
+  // launch the kernel
  const int red_blocks=umutual2b(_eflag,_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
  // NOTE: move this step to update_fieldp() to delay device-host transfer
+  //       after umutual1 and self are done on the GPU
  // *fieldp_ptr=_fieldp.host.begin();
  // _fieldp.update_host(_max_fieldp_size*8,false);
 }
@ -547,7 +557,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
 //     host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
 //     host_igrid is allocated with nmax by 4
 //   - transfer extra data from host to device
-// NOTE: can be re-used for fphi_mpole() (already allocate 2x grid points)
+// NOTE: can be re-used for fphi_mpole() but with a different bsorder value
 // ---------------------------------------------------------------------------

 template <class numtyp, class acctyp>
@ -588,6 +598,12 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
    }
  }

+  #ifdef ASYNC_DEVICE_COPY
+  _thetai1.cq(ucl_device->cq(_end_command_queue));
+  _thetai2.cq(ucl_device->cq(_end_command_queue));
+  _thetai3.cq(ucl_device->cq(_end_command_queue));
+  #endif
+
  // pack host data to device

  for (int i = 0; i < inum_full; i++)
@ -634,6 +650,8 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
  }
  _igrid.update_device(true);

+  // _cgrid_brick holds the grid-based potential
+
  _nzlo_out = nzlo_out;
  _nzhi_out = nzhi_out;
  _nylo_out = nylo_out;
@ -679,14 +697,21 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
        _cgrid_brick[n] = v;
        n++;
      }
-  _cgrid_brick.update_device(_num_grid_points, false);
+  _cgrid_brick.update_device(_num_grid_points, true);

+  #ifdef ASYNC_DEVICE_COPY
+  ucl_device->sync();
+  #endif
+
+  // launch the kernel with its execution configuration (see below)
  const int red_blocks = fphi_uind();

-  _fdip_phi1.update_host(_max_thetai_size*10);
-  _fdip_phi2.update_host(_max_thetai_size*10);
-  _fdip_sum_phi.update_host(_max_thetai_size*20);
+  // copy data from device to host asynchronously
+  _fdip_phi1.update_host(_max_thetai_size*10, true);
+  _fdip_phi2.update_host(_max_thetai_size*10, true);
+  _fdip_sum_phi.update_host(_max_thetai_size*20, true);

+  // return the pointers to the host-side arrays
  *host_fdip_phi1 = _fdip_phi1.host.begin();
  *host_fdip_phi2 = _fdip_phi2.host.begin();
  *host_fdip_sum_phi = _fdip_sum_phi.host.begin();
@ -701,13 +726,15 @@ int BaseAmoebaT::fphi_uind() {
  if (ainum == 0)
    return 0;

-  int _nall=atom->nall();
-  int nbor_pitch=nbor->nbor_pitch();
-
  // Compute the block size and grid size to keep all cores busy
-  const int BX=block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
-
+  const int max_cus = device->max_cus();
+  int BX=block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  while (GX < max_cus) {
+    BX /= 2;
+    GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  }
+  
  time_pair.start();
  int ngridxy = _ngridx * _ngridy;
  k_fphi_uind.set_size(GX,BX);
@ -766,8 +793,13 @@ int BaseAmoebaT::fphi_mpole() {
  int nbor_pitch=nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
-  const int BX=block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
+  const int max_cus = device->max_cus();
+  int BX=block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  while (GX < max_cus) {
+    BX /= 2;
+    GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  }

  time_pair.start();
  int ngridxy = _ngridx * _ngridy;
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -31,6 +31,8 @@
 #include "geryon/nvd_texture.h"
 #endif

+//#define ASYNC_DEVICE_COPY
+
 #if !defined(USE_OPENCL) && !defined(USE_HIP)
 // temporary workaround for int2 also defined in cufft
 #ifdef int2
@ -263,6 +265,8 @@ class BaseAmoeba {
  int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
  int _ngridx, _ngridy, _ngridz, _num_grid_points;

+  int _end_command_queue;
+  
  // ------------------------ FORCE/ENERGY DATA -----------------------

  Answer<numtyp,acctyp> *ans;
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -214,6 +214,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
      }
    }
    _first_device = _last_device = best_device;
+    _max_cus = best_cus;
    type = gpu->device_type(_first_device);

    if (ndevices > 0) {
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@ -241,6 +241,8 @@ class Device {
  inline int shuffle_avail() const { return _shuffle_avail; }
  /// For OpenCL, 0 if fast-math options disabled, 1 enabled
  inline int fast_math() const { return _fast_math; }
+  /// return the max number of CUs among the devices
+  inline int max_cus() const { return _max_cus; }

  /// Return the number of threads per atom for pair styles
  inline int threads_per_atom() const { return _threads_per_atom; }
@ -324,7 +326,7 @@ class Device {

 private:
  std::queue<Answer<numtyp,acctyp> *> ans_queue;
-  int _init_count;
+  int _init_count, _max_cus;
  bool _device_init, _host_timer_started, _time_device;
  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
--- a/lib/gpu/lal_hippo.cpp
+++ b/lib/gpu/lal_hippo.cpp
@ -619,9 +619,14 @@ int HippoT::polar_real(const int eflag, const int vflag) {
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
+  const int max_cus = this->device->max_cus();
+  int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  while (GX < max_cus) {
+    BX /= 2;
+    GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  }
+
  this->time_pair.start();

  // Build the short neighbor list if not done yet