Added timing for the induced dipole spreading part; computed the block size so that all compute units (CUs) are occupied by the fphi_uind and fphi_mpole kernels

Trung Nguyen
2022-10-06 15:03:58 -05:00
parent 009ed36301
commit 6b9e83fe20
10 changed files with 106 additions and 87 deletions
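
The launch-configuration change repeated in polar_real(), fphi_uind() and fphi_mpole() below targets small systems: with a fixed block size BX, the grid size GX = ceil(ainum/(BX/threads_per_atom)) (or ceil(ainum/BX) in the fphi kernels) can fall below the number of CUs, leaving part of the GPU idle, so BX is halved until GX is at least max_cus. A minimal standalone sketch of the heuristic; the concrete values are hypothetical stand-ins for what the diff reads from block_size(), device->max_cus() and _threads_per_atom:

    #include <cmath>
    #include <cstdio>

    int main() {
      const int ainum = 4096;          // hypothetical: atoms handled by this process
      const int threads_per_atom = 4;  // hypothetical: _threads_per_atom
      const int max_cus = 80;          // hypothetical: device->max_cus()
      int BX = 256;                    // hypothetical: default block_size()
      int GX = static_cast<int>(ceil(static_cast<double>(ainum)/(BX/threads_per_atom)));
      while (GX < max_cus) {           // too few blocks to occupy every CU
        BX /= 2;                       // smaller blocks -> more blocks
        GX = static_cast<int>(ceil(static_cast<double>(ainum)/(BX/threads_per_atom)));
      }
      printf("BX=%d GX=%d\n", BX, GX); // with these values: BX=128, GX=128
      return 0;
    }

Note the loop presumes GX reaches max_cus before BX is halved down to zero; for very small ainum a floor on BX would be needed.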

View File

@@ -278,9 +278,14 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
   int nbor_pitch=this->nbor->nbor_pitch();

   // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
+  const int max_cus = this->device->max_cus();
+  int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  while (GX < max_cus) {
+    BX /= 2;
+    GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  }

   this->time_pair.start();

   // Build the short neighbor list if not done yet

View File

@@ -155,7 +155,14 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
   dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
   dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);

+#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
   fft_plan_created = false;
+#endif
+
+#ifdef ASYNC_DEVICE_COPY
+  _end_command_queue=ucl_device->num_queues();
+  ucl_device->push_command_queue();
+#endif

   return success;
 }
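
The ASYNC_DEVICE_COPY block above pushes one extra command queue and records its index in _end_command_queue; precompute_kspace() below binds the _thetai1/_thetai2/_thetai3 buffers to that queue so their host-to-device uploads can proceed while kernels run on the default queue, and compute_fphi_uind() joins the two with ucl_device->sync() before launching fphi_uind(). A rough CUDA-stream analogue of this overlap pattern (plain CUDA, not the Geryon API the diff uses; all names hypothetical):

    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void scale(float *x, int n) {       // stand-in for an unrelated kernel
      int i = blockIdx.x*blockDim.x + threadIdx.x;
      if (i < n) x[i] *= 2.0f;
    }

    int main() {
      const int n = 1 << 20;
      float *h_theta, *d_theta, *d_work;
      cudaMallocHost(&h_theta, n*sizeof(float));   // pinned host memory: required for async copies
      cudaMalloc(&d_theta, n*sizeof(float));
      cudaMalloc(&d_work, n*sizeof(float));
      cudaMemset(d_work, 0, n*sizeof(float));
      cudaStream_t copy_stream;                    // plays the role of the pushed queue
      cudaStreamCreateWithFlags(&copy_stream, cudaStreamNonBlocking); // avoid implicit sync with the default stream
      cudaMemcpyAsync(d_theta, h_theta, n*sizeof(float),
                      cudaMemcpyHostToDevice, copy_stream); // upload on the side stream
      scale<<<(n + 255)/256, 256>>>(d_work, n);    // compute on the default stream meanwhile
      cudaDeviceSynchronize();                     // join both, as ucl_device->sync() does
      printf("copy and compute overlapped\n");
      cudaStreamDestroy(copy_stream);
      cudaFree(d_theta); cudaFree(d_work); cudaFreeHost(h_theta);
      return 0;
    }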
@@ -507,6 +514,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double
   *fieldp_ptr=_fieldp.host.begin();

+  // specify the correct cutoff and alpha values
   _off2_polar = off2_polar;
   _aewald = aewald;

   const int red_blocks=udirect2b(_eflag,_vflag);
@@ -525,18 +533,20 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
                                     double **host_uind, double **host_uinp, double *host_pval,
                                     const double aewald, const double off2_polar,
                                     void** fieldp_ptr) {
-  // all the necessary data arrays are already copied from host to device
-  //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
+  // only copy the necessary data arrays that are updated over the iterations
+  // use nullptr for the other arrays that are already copied from host to device
   cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr);
   atom->add_extra_data();

+  // set the correct cutoff and alpha
   _off2_polar = off2_polar;
   _aewald = aewald;

+  // launch the kernel
   const int red_blocks=umutual2b(_eflag,_vflag);

   // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
   // NOTE: move this step to update_fieldp() to delay device-host transfer
+  //       after umutual1 and self are done on the GPU
   // *fieldp_ptr=_fieldp.host.begin();
   // _fieldp.update_host(_max_fieldp_size*8,false);
 }
@@ -547,7 +557,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
 //   host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
 //   host_igrid is allocated with nmax by 4
 // - transfer extra data from host to device
-// NOTE: can be re-used for fphi_mpole() (already allocate 2x grid points)
+// NOTE: can be re-used for fphi_mpole() but with a different bsorder value
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
@@ -588,6 +598,12 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
     }
   }

+#ifdef ASYNC_DEVICE_COPY
+  _thetai1.cq(ucl_device->cq(_end_command_queue));
+  _thetai2.cq(ucl_device->cq(_end_command_queue));
+  _thetai3.cq(ucl_device->cq(_end_command_queue));
+#endif
+
   // pack host data to device
   for (int i = 0; i < inum_full; i++)
@@ -634,6 +650,8 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
   }
   _igrid.update_device(true);

+  // _cgrid_brick holds the grid-based potential
+
   _nzlo_out = nzlo_out;
   _nzhi_out = nzhi_out;
   _nylo_out = nylo_out;
@@ -679,14 +697,21 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
         _cgrid_brick[n] = v;
         n++;
       }
-  _cgrid_brick.update_device(_num_grid_points, false);
+  _cgrid_brick.update_device(_num_grid_points, true);
+
+#ifdef ASYNC_DEVICE_COPY
+  ucl_device->sync();
+#endif

+  // launch the kernel with its execution configuration (see below)
   const int red_blocks = fphi_uind();

-  _fdip_phi1.update_host(_max_thetai_size*10);
-  _fdip_phi2.update_host(_max_thetai_size*10);
-  _fdip_sum_phi.update_host(_max_thetai_size*20);
+  // copy data from device to host asynchronously
+  _fdip_phi1.update_host(_max_thetai_size*10, true);
+  _fdip_phi2.update_host(_max_thetai_size*10, true);
+  _fdip_sum_phi.update_host(_max_thetai_size*20, true);

+  // return the pointers to the host-side arrays
   *host_fdip_phi1 = _fdip_phi1.host.begin();
   *host_fdip_phi2 = _fdip_phi2.host.begin();
   *host_fdip_sum_phi = _fdip_sum_phi.host.begin();
@@ -701,12 +726,14 @@ int BaseAmoebaT::fphi_uind() {
   if (ainum == 0)
     return 0;

-  int _nall=atom->nall();
-  int nbor_pitch=nbor->nbor_pitch();
   // Compute the block size and grid size to keep all cores busy
-  const int BX=block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
+  const int max_cus = device->max_cus();
+  int BX=block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  while (GX < max_cus) {
+    BX /= 2;
+    GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  }

   time_pair.start();

   int ngridxy = _ngridx * _ngridy;
@@ -766,8 +793,13 @@ int BaseAmoebaT::fphi_mpole() {
   int nbor_pitch=nbor->nbor_pitch();

   // Compute the block size and grid size to keep all cores busy
-  const int BX=block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
+  const int max_cus = device->max_cus();
+  int BX=block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  while (GX < max_cus) {
+    BX /= 2;
+    GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  }

   time_pair.start();

   int ngridxy = _ngridx * _ngridy;

View File

@@ -31,6 +31,8 @@
 #include "geryon/nvd_texture.h"
 #endif

+//#define ASYNC_DEVICE_COPY
+
 #if !defined(USE_OPENCL) && !defined(USE_HIP)
 // temporary workaround for int2 also defined in cufft
 #ifdef int2
@@ -263,6 +265,8 @@ class BaseAmoeba {
   int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
   int _ngridx, _ngridy, _ngridz, _num_grid_points;

+  int _end_command_queue;
+
   // ------------------------ FORCE/ENERGY DATA -----------------------
   Answer<numtyp,acctyp> *ans;

View File

@@ -214,6 +214,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
     }
   }
   _first_device = _last_device = best_device;
+  _max_cus = best_cus;
   type = gpu->device_type(_first_device);

   if (ndevices > 0) {

View File

@@ -241,6 +241,8 @@ class Device {
   inline int shuffle_avail() const { return _shuffle_avail; }
   /// For OpenCL, 0 if fast-math options disabled, 1 enabled
   inline int fast_math() const { return _fast_math; }
+  /// return the max number of CUs among the devices
+  inline int max_cus() const { return _max_cus; }
   /// Return the number of threads per atom for pair styles
   inline int threads_per_atom() const { return _threads_per_atom; }
@@ -324,7 +326,7 @@ class Device {
  private:
   std::queue<Answer<numtyp,acctyp> *> ans_queue;
-  int _init_count;
+  int _init_count, _max_cus;
   bool _device_init, _host_timer_started, _time_device;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,

View File

@@ -619,9 +619,14 @@ int HippoT::polar_real(const int eflag, const int vflag) {
   int nbor_pitch=this->nbor->nbor_pitch();

   // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
+  const int max_cus = this->device->max_cus();
+  int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  while (GX < max_cus) {
+    BX /= 2;
+    GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  }

   this->time_pair.start();

   // Build the short neighbor list if not done yet

View File

@@ -901,14 +901,22 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
     }
   }

+  double time0, time1;
+
   // gridpre = my portion of 4d grid in brick decomp w/ ghost values

   double ****gridpre = (double ****) ic_kspace->zero();

   // map 2 values to grid

+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
   grid_uind(fuind,fuinp,gridpre);

+  time1 = MPI_Wtime();
+  time_grid_uind += (time1 - time0);
+
   // pre-convolution operations including forward FFT
   // gridfft = my portion of complex 3d grid in FFT decomposition

@@ -945,7 +953,6 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
   double ****gridpost = (double ****) ic_kspace->post_convolution();

   // get potential

-  double time0, time1;
   MPI_Barrier(world);
   time0 = MPI_Wtime();

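The timing added around grid_uind() above follows the pattern the file already uses for the FFT and interpolation phases: barrier, bracket the call with MPI_Wtime(), accumulate into a per-phase counter, and let finish() average it across ranks. A self-contained sketch of that pattern, where work() is a hypothetical stand-in for grid_uind():

    #include <mpi.h>
    #include <cstdio>

    static void work() { /* stand-in for grid_uind(fuind,fuinp,gridpre) */ }

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      double time_grid_uind = 0.0;        // per-phase accumulator, as in the diff
      MPI_Barrier(MPI_COMM_WORLD);        // align ranks so the timing is comparable
      double time0 = MPI_Wtime();
      work();
      double time1 = MPI_Wtime();
      time_grid_uind += (time1 - time0);  // accumulates over the induce iterations
      // finish() then averages the per-rank totals:
      double ave;
      MPI_Allreduce(&time_grid_uind, &ave, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      int nprocs;
      MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
      time_grid_uind = ave/nprocs;
      printf("grid_uind time: %g s\n", time_grid_uind);
      MPI_Finalize();
      return 0;
    }
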
View File

@@ -367,7 +367,7 @@ void PairAmoeba::compute(int eflag, int vflag)
     time_mutual_rspace = time_mutual_kspace = 0.0;
     time_polar_rspace = time_polar_kspace = 0.0;
-    time_fphi_uind = 0.0;
+    time_grid_uind = time_fphi_uind = 0.0;
     if (ic_kspace) {
       ic_kspace->time_fft = 0.0;
     }
@@ -566,6 +566,9 @@ void PairAmoeba::finish()
   MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
   time_polar_kspace = ave/comm->nprocs;

+  MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
+  time_grid_uind = ave/comm->nprocs;
+
   MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
   time_fphi_uind = ave/comm->nprocs;
@@ -592,15 +595,19 @@ void PairAmoeba::finish()
   utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total);
   utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0);

-  utils::logmesg(lmp," Real-space timing breakdown:\n");
+  double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace;
+  double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace;
+
+  utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total);
   utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total);
   utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total);
   utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total);
   utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total);
-  utils::logmesg(lmp," K-space timing breakdown:\n");
+  utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total);
   utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total);
   utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total);
   utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total);
+  utils::logmesg(lmp," - Grid   : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total);
   utils::logmesg(lmp," - FFT    : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total);
   utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total);
   utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total);

View File

@@ -80,11 +80,11 @@ class PairAmoeba : public Pair {
   double time_init, time_hal, time_repulse, time_disp;
   double time_mpole, time_induce, time_polar, time_qxfer;
-  double time_mpole_rspace,time_mpole_kspace;
-  double time_direct_rspace,time_direct_kspace;
-  double time_mutual_rspace,time_mutual_kspace;
-  double time_polar_rspace,time_polar_kspace;
-  double time_fphi_uind;
+  double time_mpole_rspace, time_mpole_kspace;
+  double time_direct_rspace, time_direct_kspace;
+  double time_mutual_rspace, time_mutual_kspace;
+  double time_polar_rspace, time_polar_kspace;
+  double time_grid_uind, time_fphi_uind;

   // energy/virial components

View File

@@ -930,15 +930,6 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp)
   memset(&field[0][0], 0, 3*nall *sizeof(double));
   memset(&fieldp[0][0], 0, 3*nall *sizeof(double));

-  /*
-  for (int i = 0; i < nall; i++) {
-    for (int j = 0; j < 3; j++) {
-      field[i][j] = 0.0;
-      fieldp[i][j] = 0.0;
-    }
-  }
-  */
-
   // get the real space portion of the mutual field first

   MPI_Barrier(world);
@@ -960,19 +951,13 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp)
     field[i][1] += term*uind[i][1];
     field[i][2] += term*uind[i][2];
   }
   for (int i = 0; i < nlocal; i++) {
     fieldp[i][0] += term*uinp[i][0];
     fieldp[i][1] += term*uinp[i][1];
     fieldp[i][2] += term*uinp[i][2];
   }

-  /*
-  for (i = 0; i < nlocal; i++) {
-    for (j = 0; j < 3; j++) {
-      field[i][j] += term*uind[i][j];
-      fieldp[i][j] += term*uinp[i][j];
-    }
-  }
-  */
-
   // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU
   // field and fieldp may already have some nonzero values from kspace (umutual1 and self)
@@ -1029,7 +1014,6 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp)
   }

   int nlocal = atom->nlocal;
-
   for (int i = 0; i < nlocal; i++) {
     fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2];
     fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2];
@@ -1041,22 +1025,23 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp)
     fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2];
     fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2];
   }

-  /*
-  for (i = 0; i < nlocal; i++) {
-    for (j = 0; j < 3; j++) {
-      fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2];
-      fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2];
-    }
-  }
-  */
+  double time0, time1;

   // gridpre = my portion of 4d grid in brick decomp w/ ghost values

   double ****gridpre = (double ****) ic_kspace->zero();

   // map 2 values to grid

+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
   grid_uind(fuind,fuinp,gridpre);

+  time1 = MPI_Wtime();
+  time_grid_uind += (time1 - time0);
+
   // pre-convolution operations including forward FFT
   // gridfft = my portion of complex 3d grid in FFT decomposition
@@ -1093,9 +1078,6 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp)
   double ****gridpost = (double ****) ic_kspace->post_convolution();

   // get potential

-  double time0, time1;
-  MPI_Barrier(world);
-
   time0 = MPI_Wtime();

   fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
@@ -1114,14 +1096,6 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp)
     }
   }

-  // convert the dipole fields from fractional to Cartesian
-
-  for (int i = 0; i < 3; i++) {
-    a[0][i] = nfft1 * recip[0][i];
-    a[1][i] = nfft2 * recip[1][i];
-    a[2][i] = nfft3 * recip[2][i];
-  }
-
   for (int i = 0; i < nlocal; i++) {
     double dfx = a[0][0]*fdip_phi1[i][1] +
       a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3];
@@ -1145,25 +1119,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp)
     fieldp[i][1] -= dfy;
     fieldp[i][2] -= dfz;
   }

-  /*
-  for (int i = 0; i < nlocal; i++) {
-    for (j = 0; j < 3; j++) {
-      dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] +
-        a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3];
-      dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] +
-        a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3];
-    }
-  }
-
-  // increment the field at each multipole site
-  for (i = 0; i < nlocal; i++) {
-    for (j = 0; j < 3; j++) {
-      field[i][j] -= dipfield1[i][j];
-      fieldp[i][j] -= dipfield2[i][j];
-    }
-  }
-  */
 }

 /* ----------------------------------------------------------------------