Both udirect2b and polar_real are working correctly on the GPU

This commit is contained in:
Trung Nguyen
2021-09-09 00:57:21 -05:00
parent 8c5a116d30
commit 6f6fd0999c
5 changed files with 121 additions and 89 deletions

View File

@ -284,7 +284,11 @@ class Atom {
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; }
/// Signal that we need to transfer atom extra data for next kernel call
inline void extra_data_unavail()
{ _extra_avail=false; }
typedef struct { double x,y,z; } vec3d;
typedef struct { numtyp x,y,z,w; } vec4d_t;

View File

@ -317,8 +317,10 @@ void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const
// ---------------------------------------------------------------------------
// Prepare for multiple kernel calls in a time step:
// - reallocate per-atom arrays, if needed
// - transfer extra data from host to device
// - build the full neighbor lists for use by different kernels
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
@ -402,75 +404,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
return nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd, void **tep_ptr) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
// reallocate per-atom arrays and build the neighbor lists if needed
int** firstneigh = nullptr;
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole,
host_uind, host_uinp, sublo, subhi, tag,
nspecial, special, nspecial15, special15,
eflag_in, vflag_in, eatom, vatom,
host_start, ilist, jnum, cpu_time,
success, host_q, boxlo, prd);
// ------------------- Resize _tep array ------------------------
if (nall>_max_tep_size) {
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
_tep.resize(_max_tep_size*4);
}
*tep_ptr=_tep.host.begin();
const int red_blocks=polar_real(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
// copy tep from device to host
_tep.update_host(_max_tep_size*4,false);
/*
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
for (int i = 0; i < 10; i++) {
numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
}
*/
return firstneigh; // nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute the direct real space part
// of the permanent field
@ -504,7 +437,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
set_kernel(eflag,vflag);
// reallocate per-atom arrays and build the neighbor lists if needed
// reallocate per-atom arrays, transfer data from the host
// and build the neighbor lists if needed
int** firstneigh = nullptr;
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
@ -539,6 +473,85 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
return firstneigh; //nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd, void **tep_ptr) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
// reallocate per-atom arrays, transfer data from the host
// and build the neighbor lists if needed
// NOTE:
// For now we invoke precompute() again here,
// to be able to turn on/off the udirect2b kernel (which comes before this)
// Once all the kernels are ready, precompute() is needed only once
// in the first kernel in a time step.
// We only need to cast uind and uinp from host to device here
// if the neighbor lists are rebuilt and other per-atom arrays
// (x, type, amtype, amgroup, rpole) are ready on the device.
int** firstneigh = nullptr;
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole,
host_uind, host_uinp, sublo, subhi, tag,
nspecial, special, nspecial15, special15,
eflag_in, vflag_in, eatom, vatom,
host_start, ilist, jnum, cpu_time,
success, host_q, boxlo, prd);
// ------------------- Resize _tep array ------------------------
if (nall>_max_tep_size) {
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
_tep.resize(_max_tep_size*4);
}
*tep_ptr=_tep.host.begin();
const int red_blocks=polar_real(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
// copy tep from device to host
_tep.update_host(_max_tep_size*4,false);
/*
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
for (int i = 0; i < 10; i++) {
numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
}
*/
return firstneigh; // nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseAmoebaT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
@ -548,6 +561,11 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
template <class numtyp, class acctyp>
void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
double** uind, double** uinp) {
// signal that we need to transfer extra data from the host
atom->extra_data_unavail();
int _nall=atom->nall();
numtyp *pextra=reinterpret_cast<numtyp*>(&(atom->extra[0]));

View File

@ -597,15 +597,6 @@ void PairAmoeba::induce()
memory->destroy(usum);
memory->destroy(usump);
if (comm->me == 0) {
printf("CPU: iter = %d\n", iter);
for (i = 0; i < 20; i++) {
printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n",
i, uind[i][0], uind[i][1], uind[i][2],
uinp[i][0], uinp[i][1], uinp[i][2]);
}
}
// update the lists of previous induced dipole values
// shift previous m values up to m+1, add new values at m = 0
// only when preconditioner is used

View File

@ -98,6 +98,10 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
suffix_flag |= Suffix::GPU;
fieldp_pinned = nullptr;
tep_pinned = nullptr;
gpu_udirect2b_ready = true;
gpu_polar_real_ready = true;
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
}
@ -114,7 +118,6 @@ PairAmoebaGPU::~PairAmoebaGPU()
void PairAmoebaGPU::polar_real()
{
bool gpu_polar_real_ready = true;
if (!gpu_polar_real_ready) {
PairAmoeba::polar_real();
return;
@ -139,7 +142,16 @@ void PairAmoebaGPU::polar_real()
domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
}
inum = atom->nlocal;
/*
if (comm->me == 0) {
printf("GPU: polar real\n");
for (int i = 0; i < 20; i++) {
printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n",
i, uind[i][0], uind[i][1], uind[i][2],
uinp[i][0], uinp[i][1], uinp[i][2]);
}
}
*/
firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
atom->type, amtype, amgroup,
rpole, uind, uinp, sublo, subhi,
@ -200,6 +212,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr)
tep[0] = tep_ptr[4*i];
tep[1] = tep_ptr[4*i+1];
tep[2] = tep_ptr[4*i+2];
torque2force(i,tep,fix,fiy,fiz,fpolar);
iz = zaxis2local[i];
@ -365,10 +378,14 @@ void PairAmoebaGPU::induce()
// get the electrostatic field due to permanent multipoles
dfield0c(field,fieldp);
/*
crstyle = FIELD;
comm->reverse_comm_pair(this);
*/
// need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only
if (!gpu_udirect2b_ready) {
crstyle = FIELD;
comm->reverse_comm_pair(this);
}
// set induced dipoles to polarizability times direct field
for (i = 0; i < nlocal; i++) {
@ -726,7 +743,7 @@ void PairAmoebaGPU::induce()
memory->destroy(udir);
memory->destroy(usum);
memory->destroy(usump);
/*
if (comm->me == 0) {
printf("GPU: iter = %d\n", iter);
for (i = 0; i < 20; i++) {
@ -735,7 +752,7 @@ void PairAmoebaGPU::induce()
uinp[i][0], uinp[i][1], uinp[i][2]);
}
}
*/
// update the lists of previous induced dipole values
// shift previous m values up to m+1, add new values at m = 0
// only when preconditioner is used
@ -825,7 +842,6 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp)
void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
{
bool gpu_udirect2b_ready = true;
if (!gpu_udirect2b_ready) {
PairAmoeba::udirect2b(field, fieldp);
return;

View File

@ -45,6 +45,9 @@ class PairAmoebaGPU : public PairAmoeba {
void *fieldp_pinned;
bool tep_single;
bool gpu_polar_real_ready;
bool gpu_udirect2b_ready;
void udirect2b_cpu();
template<class numtyp>