diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index e3bb4c5ef5..dfe092c52b 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -185,7 +185,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the real-space permanent field, returning field and fieldp +// Launch the real-space permanent field kernel // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { @@ -202,7 +202,9 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list if not done yet + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -225,7 +227,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the real-space induced field, returning field and fieldp +// Launch the real-space induced field kernel, returning field and fieldp // --------------------------------------------------------------------------- template int AmoebaT::umutual2b(const int eflag, const int vflag) { @@ -264,7 +266,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Launch the polar real-space kernel, returning tep // --------------------------------------------------------------------------- template int AmoebaT::polar_real(const int eflag, const int vflag) { diff --git a/lib/gpu/lal_base_amoeba.cpp 
b/lib/gpu/lal_base_amoeba.cpp index 16335fa17e..17e05b4a16 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -447,7 +447,9 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute multipole real-space +// Compute multipole real-space part +// precompute() should already have been invoked before mem (re)allocation +// this is the first part in a time step done on the GPU for AMOEBA for now // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, @@ -464,21 +466,6 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. 
-/* - int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ // ------------------- Resize _tep array ------------------------ if (inum_full>_max_tep_size) { @@ -502,8 +489,6 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host _tep.update_host(_max_tep_size*4,false); - -// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -842,22 +827,23 @@ double BaseAmoebaT::host_memory_usage_atomic() const { } // --------------------------------------------------------------------------- -// Setup the FFT plan +// Setup the FFT plan: only placeholder for now // --------------------------------------------------------------------------- template void BaseAmoebaT::setup_fft(const int numel, const int element_type) { - + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) } // --------------------------------------------------------------------------- -// Compute FFT on the device +// Compute FFT on the device: only placeholder for now // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) #if !defined(USE_OPENCL) && !defined(USE_HIP) if (fft_plan_created == false) { int m = numel/2; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index dc2b6f2c7a..221fe16f3c 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -143,8 +143,12 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, _polar_uscale = 
polar_uscale; _allocated=true; - this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + coeff_amclass.row_bytes() + - + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); + this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + + coeff_amclass.row_bytes() + sp_polar.row_bytes() + + sp_nonpolar.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); return 0; } @@ -169,7 +173,7 @@ double HippoT::host_memory_usage() const { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute repulsion +// Compute the repulsion term, returning tep // --------------------------------------------------------------------------- template void HippoT::compute_repulsion(const int ago, const int inum_full, @@ -203,26 +207,6 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast the necessary from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. 
-/* - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ // ------------------- Resize _tep array ------------------------ if (inum_full>this->_max_tep_size) { @@ -252,12 +236,10 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); - -// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- -// Calculate the repulsion term, returning tep +// Launch the repulsion kernel // --------------------------------------------------------------------------- template int HippoT::repulsion(const int eflag, const int vflag) { @@ -299,7 +281,7 @@ int HippoT::repulsion(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute dispersion real-space +// Compute dispersion real-space // --------------------------------------------------------------------------- template void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, @@ -323,12 +305,10 @@ void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, //this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); - - // return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- -// Calculate the dispersion real-space term, returning tep +// Launch the dispersion real-space kernel // --------------------------------------------------------------------------- template int HippoT::dispersion_real(const int eflag, const int vflag) { 
@@ -346,7 +326,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list for the cutoff off2_disp, - // at this point mpole is the first kernel in a time step + // at this point dispersion is the first kernel in a time step this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -356,20 +336,20 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { k_dispersion.set_size(GX,BX); k_dispersion.run(&this->atom->x, &this->atom->extra, - &coeff_amtype, &coeff_amclass, &sp_nonpolar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, - &this->_off2_disp); + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_disp); this->time_pair.stop(); return GX; } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute multipole real-space +// Compute the multipole real-space term, returning tep // --------------------------------------------------------------------------- template void HippoT::compute_multipole_real(const int ago, const int inum_full, @@ -416,12 +396,10 @@ void HippoT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); - - //return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- -// Calculate the multipole real-space term, returning tep +// Launch the multipole real-space kernel // 
--------------------------------------------------------------------------- template int HippoT::multipole_real(const int eflag, const int vflag) { @@ -438,8 +416,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list for the cutoff off2_mpole, - // at this point mpole is the first kernel in a time step + // Build the short neighbor list for the cutoff off2_mpole this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -462,8 +439,8 @@ int HippoT::multipole_real(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute the direct real space part -// of the permanent field +// Compute the direct real space part of the permanent field +// returning field and fieldp // --------------------------------------------------------------------------- template void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -488,7 +465,7 @@ void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **hos } // --------------------------------------------------------------------------- -// Calculate the real-space permanent field, returning field and fieldp +// Launch the real-space permanent field kernel // --------------------------------------------------------------------------- template int HippoT::udirect2b(const int eflag, const int vflag) { @@ -505,7 +482,9 @@ int HippoT::udirect2b(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list if not done yet + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); 
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -529,8 +508,8 @@ int HippoT::udirect2b(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute the direct real space part -// of the induced field +// Compute the direct real space term of the induced field +// returning field and fieldp // --------------------------------------------------------------------------- template void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -554,7 +533,7 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos } // --------------------------------------------------------------------------- -// Calculate the real-space induced field, returning field and fieldp +// Launch the real-space induced field kernel // --------------------------------------------------------------------------- template int HippoT::umutual2b(const int eflag, const int vflag) { @@ -628,7 +607,7 @@ void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **ho } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Launch the polar real-space kernel // --------------------------------------------------------------------------- template int HippoT::polar_real(const int eflag, const int vflag) {