Both udirect2b and polar_real are working correctly on the GPU
This commit is contained in:
@ -284,7 +284,11 @@ class Atom {
|
||||
|
||||
/// Signal that we need to transfer atom data for next timestep
|
||||
inline void data_unavail()
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; }
|
||||
|
||||
/// Signal that we need to transfer atom extra data for next kernel call
|
||||
inline void extra_data_unavail()
|
||||
{ _extra_avail=false; }
|
||||
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { numtyp x,y,z,w; } vec4d_t;
|
||||
|
||||
@ -317,8 +317,10 @@ void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const
|
||||
// ---------------------------------------------------------------------------
|
||||
// Prepare for multiple kernel calls in a time step:
|
||||
// - reallocate per-atom arrays, if needed
|
||||
// - transfer extra data from host to device
|
||||
// - build the full neighbor lists for use by different kernels
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
@ -402,75 +404,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute polar real-space
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
int *nspecial15, tagint **special15,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, double *host_q, double *boxlo,
|
||||
double *prd, void **tep_ptr) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
|
||||
// reallocate per-atom arrays and build the neighbor lists if needed
|
||||
|
||||
int** firstneigh = nullptr;
|
||||
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole,
|
||||
host_uind, host_uinp, sublo, subhi, tag,
|
||||
nspecial, special, nspecial15, special15,
|
||||
eflag_in, vflag_in, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time,
|
||||
success, host_q, boxlo, prd);
|
||||
|
||||
// ------------------- Resize _tep array ------------------------
|
||||
|
||||
if (nall>_max_tep_size) {
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_tep.resize(_max_tep_size*4);
|
||||
}
|
||||
*tep_ptr=_tep.host.begin();
|
||||
|
||||
const int red_blocks=polar_real(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
// copy tep from device to host
|
||||
|
||||
_tep.update_host(_max_tep_size*4,false);
|
||||
/*
|
||||
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
|
||||
printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
|
||||
}
|
||||
*/
|
||||
return firstneigh; // nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute the direct real space part
|
||||
// of the permanent field
|
||||
@ -504,7 +437,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
|
||||
// reallocate per-atom arrays and build the neighbor lists if needed
|
||||
// reallocate per-atom arrays, transfer data from the host
|
||||
// and build the neighbor lists if needed
|
||||
|
||||
int** firstneigh = nullptr;
|
||||
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
|
||||
@ -539,6 +473,85 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
|
||||
return firstneigh; //nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute polar real-space
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
int *nspecial15, tagint **special15,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, double *host_q, double *boxlo,
|
||||
double *prd, void **tep_ptr) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
|
||||
// reallocate per-atom arrays, transfer data from the host
|
||||
// and build the neighbor lists if needed
|
||||
// NOTE:
|
||||
// For now we invoke precompute() again here,
|
||||
// to be able to turn on/off the udirect2b kernel (which comes before this)
|
||||
// Once all the kernels are ready, precompute() is needed only once
|
||||
// in the first kernel in a time step.
|
||||
// We only need to cast uind and uinp from host to device here
|
||||
// if the neighbor lists are rebuilt and other per-atom arrays
|
||||
// (x, type, amtype, amgroup, rpole) are ready on the device.
|
||||
|
||||
int** firstneigh = nullptr;
|
||||
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole,
|
||||
host_uind, host_uinp, sublo, subhi, tag,
|
||||
nspecial, special, nspecial15, special15,
|
||||
eflag_in, vflag_in, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time,
|
||||
success, host_q, boxlo, prd);
|
||||
|
||||
// ------------------- Resize _tep array ------------------------
|
||||
|
||||
if (nall>_max_tep_size) {
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_tep.resize(_max_tep_size*4);
|
||||
}
|
||||
*tep_ptr=_tep.host.begin();
|
||||
|
||||
const int red_blocks=polar_real(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
// copy tep from device to host
|
||||
|
||||
_tep.update_host(_max_tep_size*4,false);
|
||||
/*
|
||||
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
|
||||
printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
|
||||
}
|
||||
*/
|
||||
return firstneigh; // nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double BaseAmoebaT::host_memory_usage_atomic() const {
|
||||
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
||||
@ -548,6 +561,11 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
|
||||
double** uind, double** uinp) {
|
||||
|
||||
// signal that we need to transfer extra data from the host
|
||||
|
||||
atom->extra_data_unavail();
|
||||
|
||||
int _nall=atom->nall();
|
||||
numtyp *pextra=reinterpret_cast<numtyp*>(&(atom->extra[0]));
|
||||
|
||||
|
||||
@ -597,15 +597,6 @@ void PairAmoeba::induce()
|
||||
memory->destroy(usum);
|
||||
memory->destroy(usump);
|
||||
|
||||
if (comm->me == 0) {
|
||||
printf("CPU: iter = %d\n", iter);
|
||||
for (i = 0; i < 20; i++) {
|
||||
printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n",
|
||||
i, uind[i][0], uind[i][1], uind[i][2],
|
||||
uinp[i][0], uinp[i][1], uinp[i][2]);
|
||||
}
|
||||
}
|
||||
|
||||
// update the lists of previous induced dipole values
|
||||
// shift previous m values up to m+1, add new values at m = 0
|
||||
// only when preconditioner is used
|
||||
|
||||
@ -98,6 +98,10 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
|
||||
suffix_flag |= Suffix::GPU;
|
||||
fieldp_pinned = nullptr;
|
||||
tep_pinned = nullptr;
|
||||
|
||||
gpu_udirect2b_ready = true;
|
||||
gpu_polar_real_ready = true;
|
||||
|
||||
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
||||
}
|
||||
|
||||
@ -114,7 +118,6 @@ PairAmoebaGPU::~PairAmoebaGPU()
|
||||
|
||||
void PairAmoebaGPU::polar_real()
|
||||
{
|
||||
bool gpu_polar_real_ready = true;
|
||||
if (!gpu_polar_real_ready) {
|
||||
PairAmoeba::polar_real();
|
||||
return;
|
||||
@ -139,7 +142,16 @@ void PairAmoebaGPU::polar_real()
|
||||
domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
|
||||
}
|
||||
inum = atom->nlocal;
|
||||
|
||||
/*
|
||||
if (comm->me == 0) {
|
||||
printf("GPU: polar real\n");
|
||||
for (int i = 0; i < 20; i++) {
|
||||
printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n",
|
||||
i, uind[i][0], uind[i][1], uind[i][2],
|
||||
uinp[i][0], uinp[i][1], uinp[i][2]);
|
||||
}
|
||||
}
|
||||
*/
|
||||
firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
|
||||
atom->type, amtype, amgroup,
|
||||
rpole, uind, uinp, sublo, subhi,
|
||||
@ -200,6 +212,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr)
|
||||
tep[0] = tep_ptr[4*i];
|
||||
tep[1] = tep_ptr[4*i+1];
|
||||
tep[2] = tep_ptr[4*i+2];
|
||||
|
||||
torque2force(i,tep,fix,fiy,fiz,fpolar);
|
||||
|
||||
iz = zaxis2local[i];
|
||||
@ -365,10 +378,14 @@ void PairAmoebaGPU::induce()
|
||||
// get the electrostatic field due to permanent multipoles
|
||||
|
||||
dfield0c(field,fieldp);
|
||||
/*
|
||||
crstyle = FIELD;
|
||||
comm->reverse_comm_pair(this);
|
||||
*/
|
||||
|
||||
// need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only
|
||||
|
||||
if (!gpu_udirect2b_ready) {
|
||||
crstyle = FIELD;
|
||||
comm->reverse_comm_pair(this);
|
||||
}
|
||||
|
||||
// set induced dipoles to polarizability times direct field
|
||||
|
||||
for (i = 0; i < nlocal; i++) {
|
||||
@ -726,7 +743,7 @@ void PairAmoebaGPU::induce()
|
||||
memory->destroy(udir);
|
||||
memory->destroy(usum);
|
||||
memory->destroy(usump);
|
||||
|
||||
/*
|
||||
if (comm->me == 0) {
|
||||
printf("GPU: iter = %d\n", iter);
|
||||
for (i = 0; i < 20; i++) {
|
||||
@ -735,7 +752,7 @@ void PairAmoebaGPU::induce()
|
||||
uinp[i][0], uinp[i][1], uinp[i][2]);
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
// update the lists of previous induced dipole values
|
||||
// shift previous m values up to m+1, add new values at m = 0
|
||||
// only when preconditioner is used
|
||||
@ -825,7 +842,6 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp)
|
||||
|
||||
void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
|
||||
{
|
||||
bool gpu_udirect2b_ready = true;
|
||||
if (!gpu_udirect2b_ready) {
|
||||
PairAmoeba::udirect2b(field, fieldp);
|
||||
return;
|
||||
|
||||
@ -45,6 +45,9 @@ class PairAmoebaGPU : public PairAmoeba {
|
||||
void *fieldp_pinned;
|
||||
bool tep_single;
|
||||
|
||||
bool gpu_polar_real_ready;
|
||||
bool gpu_udirect2b_ready;
|
||||
|
||||
void udirect2b_cpu();
|
||||
|
||||
template<class numtyp>
|
||||
|
||||
Reference in New Issue
Block a user