Expose fphi_uind to the GPU pair style, but keep it disabled for now because that part is not ready yet
This commit is contained in:
@ -162,9 +162,17 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
|
||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp,
|
||||
void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, double ***grid) {
|
||||
double ***host_thetai3, int** igrid,
|
||||
double *host_grid_brick_start, void **host_fdip_phi1,
|
||||
void **host_fdip_phi2, void **host_fdip_sum_phi,
|
||||
int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out, bool& first_iteration) {
|
||||
AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2,
|
||||
host_thetai3, igrid, host_grid_brick_start, host_fdip_phi1,
|
||||
host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out,
|
||||
nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration);
|
||||
}
|
||||
|
||||
void amoeba_setup_fft(const int numel, const int element_type) {
|
||||
|
||||
@ -144,7 +144,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
_max_fieldp_size = _max_tep_size;
|
||||
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
|
||||
_max_thetai_size = _max_tep_size;
|
||||
_max_thetai_size = 0;
|
||||
|
||||
_nmax = nall;
|
||||
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
@ -441,81 +441,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Prepare for umutual1: bspline_fill
|
||||
// - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed
|
||||
// - transfer extra data from host to device
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, int** host_igrid,
|
||||
double* grid_brick_start, int nzlo_out,
|
||||
int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out) {
|
||||
|
||||
_bsorder = bsorder;
|
||||
|
||||
// allocate or resize per-atom arrays
|
||||
// _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
|
||||
// will be consolidated once all terms are ready
|
||||
|
||||
if (_max_thetai_size == 0) {
|
||||
_max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
_thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
_fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
_fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
_fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
|
||||
} else {
|
||||
if (inum_full>_max_thetai_size) {
|
||||
_max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
_thetai1.resize(_max_thetai_size*bsorder*4);
|
||||
_thetai2.resize(_max_thetai_size*bsorder*4);
|
||||
_thetai3.resize(_max_thetai_size*bsorder*4);
|
||||
_igrid.resize(_max_thetai_size*4);
|
||||
|
||||
_fdip_phi1.resize(_max_thetai_size*10);
|
||||
_fdip_phi2.resize(_max_thetai_size*10);
|
||||
_fdip_sum_phi.resize(_max_thetai_size*20);
|
||||
}
|
||||
}
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
|
||||
// copy from host to device
|
||||
|
||||
dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device));
|
||||
ucl_copy(_thetai1,dview,false);
|
||||
dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device));
|
||||
ucl_copy(_thetai2,dview,false);
|
||||
dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device));
|
||||
ucl_copy(_thetai3,dview,false);
|
||||
|
||||
UCL_H_Vec<int> dview_int;
|
||||
dview_int.view(&host_igrid[0][0],inum_full*4,*(this->ucl_device));
|
||||
ucl_copy(_igrid,dview_int,false);
|
||||
|
||||
_nzlo_out = nzlo_out;
|
||||
_nzhi_out = nzhi_out;
|
||||
_nylo_out = nylo_out;
|
||||
_nyhi_out = nyhi_out;
|
||||
_nxlo_out = nxlo_out;
|
||||
_nxhi_out = nxhi_out;
|
||||
_ngridz = nzhi_out - nzlo_out + 1;
|
||||
_ngridy = nyhi_out - nylo_out + 1;
|
||||
_ngridx = nxhi_out - nxlo_out + 1;
|
||||
_num_grid_points = _ngridx*_ngridy*_ngridz*2;
|
||||
dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device));
|
||||
ucl_copy(_cgrid_brick,dview,false);
|
||||
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute multipole real-space
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -626,6 +551,98 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
|
||||
// _fieldp.update_host(_max_fieldp_size*8,false);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Prepare for umutual1() after bspline_fill() is done on host
|
||||
// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed
|
||||
// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
|
||||
// host_igrid is allocated with nmax by 4
|
||||
// - transfer extra data from host to device
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, int** host_igrid,
|
||||
double* grid_brick_start, int nzlo_out,
|
||||
int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out) {
|
||||
|
||||
_bsorder = bsorder;
|
||||
|
||||
// allocate or resize per-atom arrays
|
||||
// _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
|
||||
// will be consolidated once all terms are ready
|
||||
|
||||
if (_max_thetai_size == 0) {
|
||||
_max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
_thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
_fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
_fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
_fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
|
||||
} else {
|
||||
if (inum_full>_max_thetai_size) {
|
||||
_max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
_thetai1.resize(_max_thetai_size*bsorder*4);
|
||||
_thetai2.resize(_max_thetai_size*bsorder*4);
|
||||
_thetai3.resize(_max_thetai_size*bsorder*4);
|
||||
_igrid.resize(_max_thetai_size*4);
|
||||
|
||||
_fdip_phi1.resize(_max_thetai_size*10);
|
||||
_fdip_phi2.resize(_max_thetai_size*10);
|
||||
_fdip_sum_phi.resize(_max_thetai_size*20);
|
||||
}
|
||||
}
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
dview.alloc(inum_full*bsorder*4,*(this->ucl_device));
|
||||
|
||||
// pack host data to device
|
||||
|
||||
for (int i = 0; i < inum_full; i++)
|
||||
for (int j = 0; j < bsorder; j++) {
|
||||
int idx = i*4*bsorder + 4*j;
|
||||
dview[idx+0] = host_thetai1[i][j][0];
|
||||
dview[idx+1] = host_thetai1[i][j][1];
|
||||
dview[idx+2] = host_thetai1[i][j][2];
|
||||
dview[idx+3] = host_thetai1[i][j][3];
|
||||
}
|
||||
ucl_copy(_thetai1,dview,false);
|
||||
|
||||
for (int i = 0; i < inum_full; i++)
|
||||
for (int j = 0; j < bsorder; j++) {
|
||||
int idx = i*4*bsorder + 4*j;
|
||||
dview[idx+0] = host_thetai2[i][j][0];
|
||||
dview[idx+1] = host_thetai2[i][j][1];
|
||||
dview[idx+2] = host_thetai2[i][j][2];
|
||||
dview[idx+3] = host_thetai2[i][j][3];
|
||||
}
|
||||
ucl_copy(_thetai2,dview,false);
|
||||
|
||||
for (int i = 0; i < inum_full; i++)
|
||||
for (int j = 0; j < bsorder; j++) {
|
||||
int idx = i*4*bsorder + 4*j;
|
||||
dview[idx+0] = host_thetai3[i][j][0];
|
||||
dview[idx+1] = host_thetai3[i][j][1];
|
||||
dview[idx+2] = host_thetai3[i][j][2];
|
||||
dview[idx+3] = host_thetai3[i][j][3];
|
||||
}
|
||||
ucl_copy(_thetai3,dview,false);
|
||||
|
||||
UCL_H_Vec<int> dview_int;
|
||||
for (int i = 0; i < inum_full; i++) {
|
||||
int idx = i*4;
|
||||
dview_int[idx+0] = host_igrid[i][0];
|
||||
dview_int[idx+1] = host_igrid[i][1];
|
||||
dview_int[idx+2] = host_igrid[i][2];
|
||||
}
|
||||
ucl_copy(_igrid,dview_int,false);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// fphi_uind = induced potential from grid
|
||||
// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
||||
@ -633,19 +650,22 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, int** igrid,
|
||||
double *host_grid_brick_start, double **host_fdip_phi1,
|
||||
double **host_fdip_phi2, double **host_fdip_sum_phi,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, int** igrid,
|
||||
double *host_grid_brick_start, void** host_fdip_phi1,
|
||||
void **host_fdip_phi2, void **host_fdip_sum_phi,
|
||||
int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out)
|
||||
int nxlo_out, int nxhi_out, bool& first_iteration)
|
||||
{
|
||||
// allocation/resize and transfers (do this right after udirect?)
|
||||
// allocation/resize and transfers before the first iteration
|
||||
|
||||
precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
|
||||
igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out,
|
||||
nxlo_out, nxhi_out);
|
||||
|
||||
if (first_iteration) {
|
||||
precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
|
||||
igrid, host_grid_brick_start, nzlo_out, nzhi_out,
|
||||
nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
||||
if (first_iteration) first_iteration = false;
|
||||
}
|
||||
|
||||
// update the cgrid_brick with data from the host
|
||||
|
||||
_nzlo_out = nzlo_out;
|
||||
@ -664,6 +684,14 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
|
||||
ucl_copy(_cgrid_brick,dview,false);
|
||||
|
||||
const int red_blocks = fphi_uind();
|
||||
|
||||
_fdip_phi1.update_host(_max_thetai_size*10);
|
||||
_fdip_phi2.update_host(_max_thetai_size*10);
|
||||
_fdip_sum_phi.update_host(_max_thetai_size*20);
|
||||
|
||||
*host_fdip_phi1 = _fdip_phi1.host.begin();
|
||||
*host_fdip_phi2 = _fdip_phi2.host.begin();
|
||||
*host_fdip_sum_phi = _fdip_sum_phi.host.begin();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@ -151,8 +151,8 @@ class BaseAmoeba {
|
||||
double *charge, double *boxlo, double *prd);
|
||||
|
||||
virtual void precompute_induce(const int inum_full, const int bsorder,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, int** igrid,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, int** igrid,
|
||||
double* grid_brick_start, int nzlo_out,
|
||||
int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out);
|
||||
@ -180,12 +180,12 @@ class BaseAmoeba {
|
||||
const double aewald, const double off2_polar, void **fieldp_ptr);
|
||||
|
||||
virtual void compute_fphi_uind(const int inum_full, const int bsorder,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, int** igrid,
|
||||
double *host_grid_brick_start, double **host_fdip_phi1,
|
||||
double **host_fdip_phi2, double **host_fdip_sum_phi,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, int** igrid,
|
||||
double *host_grid_brick_start, void **host_fdip_phi1,
|
||||
void **host_fdip_phi2, void **host_fdip_sum_phi,
|
||||
int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out);
|
||||
int nxlo_out, int nxhi_out, bool& first_iteration);
|
||||
|
||||
/// Compute polar real-space with device neighboring
|
||||
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
|
||||
@ -407,7 +407,7 @@ class PairAmoeba : public Pair {
|
||||
void grid_mpole(double **, double ***);
|
||||
void fphi_mpole(double ***, double **);
|
||||
void grid_uind(double **, double **, double ****);
|
||||
void fphi_uind(double ****, double **, double **, double **);
|
||||
virtual void fphi_uind(double ****, double **, double **, double **);
|
||||
void grid_disp(double ***);
|
||||
|
||||
void kewald();
|
||||
|
||||
@ -88,9 +88,13 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
|
||||
|
||||
void amoeba_gpu_update_fieldp(void **fieldp_ptr);
|
||||
|
||||
void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp,
|
||||
double** host_thetai1, double** host_thetai2,
|
||||
double** host_thetai3, double ***grid);
|
||||
void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, int** igrid,
|
||||
double *host_grid_brick_start, void **host_fdip_phi1,
|
||||
void **host_fdip_phi2, void **host_fdip_sum_phi,
|
||||
int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out, bool& first_iteration);
|
||||
|
||||
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
|
||||
double **host_rpole, double **host_uind, double **host_uinp,
|
||||
@ -117,6 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
|
||||
gpu_multipole_real_ready = true; // need to be true for precompute()
|
||||
gpu_udirect2b_ready = true;
|
||||
gpu_umutual1_ready = true;
|
||||
gpu_fphi_uind_ready = false;
|
||||
gpu_umutual2b_ready = true;
|
||||
gpu_polar_real_ready = true; // need to be true for copying data from device back to host
|
||||
|
||||
@ -481,6 +486,8 @@ void PairAmoebaGPU::induce()
|
||||
|
||||
// conjugate gradient iteration of the mutual induced dipoles
|
||||
|
||||
first_induce_iteration = true;
|
||||
|
||||
while (!done) {
|
||||
iter++;
|
||||
|
||||
@ -1115,6 +1122,53 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp)
|
||||
*/
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
fphi_uind = induced potential from grid
|
||||
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
|
||||
double **fdip_phi2, double **fdip_sum_phi)
|
||||
{
|
||||
if (!gpu_fphi_uind_ready) {
|
||||
PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi);
|
||||
return;
|
||||
}
|
||||
|
||||
void* fdip_phi1_pinned = nullptr;
|
||||
void* fdip_phi2_pinned = nullptr;
|
||||
void* fdip_sum_phi_pinned = nullptr;
|
||||
amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3,
|
||||
igrid, ic_kspace->grid_brick_start,
|
||||
&fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned,
|
||||
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
|
||||
ic_kspace->nylo_out, ic_kspace->nyhi_out,
|
||||
ic_kspace->nxlo_out, ic_kspace->nxhi_out,
|
||||
first_induce_iteration);
|
||||
|
||||
int nlocal = atom->nlocal;
|
||||
double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned;
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
int idx = 10 * i;
|
||||
for (int m = 0; m < 10; m++)
|
||||
fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m];
|
||||
}
|
||||
|
||||
double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned;
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
int idx = 10 * i;
|
||||
for (int m = 0; m < 10; m++)
|
||||
fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m];
|
||||
}
|
||||
|
||||
double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
int idx = 20 * i;
|
||||
for (int m = 0; m < 20; m++)
|
||||
fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m];
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
umutual2b = Ewald real mutual field via list
|
||||
umutual2b computes the real space contribution of the induced
|
||||
|
||||
@ -39,6 +39,7 @@ class PairAmoebaGPU : public PairAmoeba {
|
||||
virtual void multipole_real();
|
||||
virtual void udirect2b(double **, double **);
|
||||
virtual void umutual1(double **, double **);
|
||||
virtual void fphi_uind(double ****, double **, double **, double **);
|
||||
virtual void umutual2b(double **, double **);
|
||||
virtual void ufield0c(double **, double **);
|
||||
virtual void polar_real();
|
||||
@ -56,9 +57,12 @@ class PairAmoebaGPU : public PairAmoeba {
|
||||
bool gpu_multipole_real_ready;
|
||||
bool gpu_udirect2b_ready;
|
||||
bool gpu_umutual1_ready;
|
||||
bool gpu_fphi_uind_ready;
|
||||
bool gpu_umutual2b_ready;
|
||||
bool gpu_polar_real_ready;
|
||||
|
||||
bool first_induce_iteration;
|
||||
|
||||
void udirect2b_cpu();
|
||||
|
||||
template<class numtyp>
|
||||
|
||||
Reference in New Issue
Block a user