Reverted the binsize function call from the GPU package in Atom, instead added atom_modify sort with a binsize to ensure matching virial values, enabled the udirect2b kernel, need more work to override dfield0c, and induce() to bypass reverse_comm() for field and fieldp (line amoeba_induce.cpp:111-112)

This commit is contained in:
Trung Nguyen
2021-09-03 13:43:22 -05:00
parent 745c7089f0
commit 7d69a870a4
7 changed files with 115 additions and 68 deletions

View File

@ -4,7 +4,7 @@ units real
boundary p p p
atom_style amoeba
#atom_modify sort 1000 7.0
bond_style class2
angle_style amoeba
dihedral_style none

View File

@ -57,7 +57,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
const double polar_uscale) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar");
cell_size,gpu_split,_screen,amoeba,
"k_amoeba_polar", "k_amoeba_udirect2b");
if (success!=0)
return success;
@ -164,15 +165,14 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
/*
this->k_polar.set_size(GX,BX);
this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom,
&_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
*/
this->k_udirect2b.set_size(GX,BX);
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &_aewald, &_off2,
&_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}

View File

@ -91,8 +91,8 @@ _texture( q_tex,int2);
tep[i]=t; \
}
#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \
i, field, fieldp) \
#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i, \
fieldp) \
if (t_per_atom>1) { \
red_acc[0][tid]=_fieldp[0]; \
red_acc[1][tid]=_fieldp[1]; \
@ -118,8 +118,8 @@ _texture( q_tex,int2);
numtyp4 f, fp; \
f.x = _fieldp[0]; f.y = _fieldp[0]; f.z = _fieldp[2]; \
fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5]; \
field[i] = f; \
fieldp[i] = fp; \
fieldp[ii] = f; \
fieldp[ii+inum] = fp; \
}
#else
@ -152,8 +152,8 @@ _texture( q_tex,int2);
tep[i]=t; \
}
#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \
i, field, fieldp) \
#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i, \
fieldp) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
_fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \
@ -168,8 +168,8 @@ _texture( q_tex,int2);
numtyp4 f, fp; \
f.x = _fieldp[0]; f.y = _fieldp[0]; f.z = _fieldp[2]; \
fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5]; \
field[i] = f; \
fieldp[i] = fp; \
fieldp[ii] = f; \
fieldp[ii+inum] = fp; \
}
#endif
@ -177,6 +177,11 @@ _texture( q_tex,int2);
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define MY_PIS (acctyp)1.77245385090551602729
/* ----------------------------------------------------------------------
polar_real = real-space portion of induced dipole polarization
adapted from Tinker epreal1d() routine
------------------------------------------------------------------------- */
__kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
const __global numtyp *restrict extra,
const __global numtyp4 *restrict damping,
@ -468,7 +473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0];
term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr;
numtyp tixx = ci*term3 + dix*term4 + dir*term5 +
(numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6;
(numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 +
(numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
@ -684,19 +689,23 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
offset,eflag,vflag,ans,engv);
}
/* ----------------------------------------------------------------------
udirect2b = Ewald real direct field via list
udirect2b computes the real space contribution of the permanent
atomic multipole moments to the field via a neighbor list
------------------------------------------------------------------------- */
__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
const __global numtyp *restrict extra,
const __global numtyp4 *restrict damping,
const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor,
const __global int *dev_packed,
__global numtyp4 *restrict field,
__global numtyp4 *restrict fieldp,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp felec,
const numtyp off2, const numtyp polar_dscale,
const numtyp polar_uscale)
const __global numtyp *restrict extra,
const __global numtyp4 *restrict damping,
const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor,
const __global int *dev_packed,
__global numtyp4 *restrict fieldp,
const int inum, const int nall,
const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp off2,
const numtyp polar_dscale, const numtyp polar_uscale)
{
int tid, ii, offset, i;
atom_info(t_per_atom,ii,tid,offset);
@ -771,7 +780,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
numtyp r = ucl_sqrt(r2);
numtyp rinv = ucl_recip(r);
numtyp r2inv = rinv*rinv;
numtyp rr1 = felec * rinv;
numtyp rr1 = rinv;
numtyp rr3 = rr1 * r2inv;
numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
@ -888,7 +897,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
// accumulate field and fieldp
store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,field,fieldp);
store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
}
/* ----------------------------------------------------------------------

View File

@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() {
delete ans;
delete nbor;
k_polar.clear();
k_udirect2b.clear();
k_special15.clear();
if (pair_program) delete pair_program;
}
@ -53,7 +54,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
const int maxspecial15,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name) {
const char *k_name_polar,
const char *k_name_udirect2b) {
screen=_screen;
int gpu_nbor=0;
@ -85,7 +87,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name);
compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
@ -118,9 +120,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
if (ef_nall==0)
ef_nall=2000;
_max_alloc_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_fieldp.alloc(_max_alloc_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_tep.alloc(_max_alloc_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_max_fieldp_size = _max_tep_size;
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
@ -224,7 +227,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
@ -252,9 +255,9 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
// ------------------- Resize _tep array ------------------------
if (nall>_max_alloc_size) {
_max_alloc_size=static_cast<int>(static_cast<double>(nall)*1.10);
_tep.resize(_max_alloc_size*4);
if (nall>_max_tep_size) {
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
_tep.resize(_max_tep_size*4);
dev_nspecial15.clear();
dev_special15.clear();
@ -302,6 +305,10 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
// copy tep from device to host
_tep.update_host(_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
@ -338,9 +345,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const
// ------------------- Resize _tep array ------------------------
if (nall>_max_alloc_size) {
_max_alloc_size=static_cast<int>(static_cast<double>(nall)*1.10);
_tep.resize(_max_alloc_size*4);
if (nall>_max_tep_size) {
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
_tep.resize(_max_tep_size*4);
dev_nspecial15.clear();
dev_special15.clear();
@ -397,9 +404,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const
// copy tep from device to host
_tep.update_host(_max_alloc_size*4,false);
_tep.update_host(_max_tep_size*4,false);
/*
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_alloc_size);
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
for (int i = 0; i < 10; i++) {
numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
@ -442,9 +449,9 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
// ------------------- Resize _fieldp array ------------------------
if (nall>_max_alloc_size) {
_max_alloc_size=static_cast<int>(static_cast<double>(nall)*1.10);
_fieldp.resize(_max_alloc_size*8);
if (nall>_max_fieldp_size) {
_max_fieldp_size=static_cast<int>(static_cast<double>(nall)*1.10);
_fieldp.resize(_max_fieldp_size*8);
dev_nspecial15.clear();
dev_special15.clear();
@ -492,13 +499,18 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
*jnum=nbor->host_acc.begin();
const int red_blocks=udirect2b(eflag,vflag);
//ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
//device->add_ans_object(ans);
hd_balancer.stop_timer();
// copy field and fieldp from device to host
// copy field and fieldp from device to host (_fieldp store both arrays, one after another)
//_fieldp.update_host(_max_field_size*8,false);
_fieldp.update_host(_max_fieldp_size*8,false);
/*
printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", this->_field.cols(), _max_fieldp_size);
for (int i = 0; i < 10; i++) {
numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]);
printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z);
}
*/
return nbor->host_jlist.begin()-host_start;
}
@ -566,7 +578,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
template <class numtyp, class acctyp>
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
const char *kname_polar,
const char *kname_udirect2b) {
if (_compiled)
return;
@ -575,7 +588,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_polar.set_function(*pair_program,kname);
k_polar.set_function(*pair_program,kname_polar);
k_udirect2b.set_function(*pair_program,kname_udirect2b);
k_special15.set_function(*pair_program,"k_special15");
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
@ -593,6 +607,10 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
}
// ---------------------------------------------------------------------------
// Specify 1-5 neighbors from the current neighbor list
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::add_onefive_neighbors() {
// Compute the block size and grid size to keep all cores busy

View File

@ -53,8 +53,8 @@ class BaseAmoeba {
* - -5 Double precision is not supported on card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name);
const double gpu_split, FILE *screen, const void *pair_program,
const char *kname_polar, const char *kname_udirect2b);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(const int add_kernels=0);
@ -129,7 +129,7 @@ class BaseAmoeba {
bool &success);
/// Compute polar real-space with host neighboring (not active for now)
void compute(const int f_ago, const int inum_full, const int nall,
void compute_polar_real(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double **host_uind,
double **host_uinp, int *ilist, int *numj,
@ -190,8 +190,8 @@ class BaseAmoeba {
double** uind, double** uinp);
/// Per-atom arrays
UCL_Vector<numtyp,numtyp> _tep,_fieldp;
int _max_alloc_size;
UCL_Vector<numtyp,numtyp> _tep, _fieldp;
int _max_tep_size, _max_fieldp_size;
// ------------------------ FORCE/ENERGY DATA -----------------------
@ -210,7 +210,7 @@ class BaseAmoeba {
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_polar,k_special15;
UCL_Kernel k_polar, k_udirect2b, k_special15;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {}
@ -226,7 +226,8 @@ class BaseAmoeba {
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *kname_polar, const char *kname_udirect2b);
virtual int polar_real(const int eflag, const int vflag) = 0;
virtual int udirect2b(const int eflag, const int vflag) = 0;

View File

@ -298,7 +298,7 @@ void PairAmoebaGPU::init_style()
void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
{
bool gpu_udirect2b_ready = false;
bool gpu_udirect2b_ready = true;
if (!gpu_udirect2b_ready) {
PairAmoeba::udirect2b(field, fieldp);
return;
@ -335,6 +335,27 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
if (!success)
error->one(FLERR,"Insufficient memory on accelerator");
// get field and fieldp values from the GPU lib
int nlocal = atom->nlocal;
double *field_ptr = (double *)fieldp_pinned;
for (int i = 0; i < nlocal; i++) {
int idx = 4*i;
field[i][0] = field_ptr[idx];
field[i][1] = field_ptr[idx+1];
field[i][2] = field_ptr[idx+2];
}
double* fieldp_ptr = (double *)fieldp_pinned;
fieldp_ptr += 4*inum;
for (int i = 0; i < nlocal; i++) {
int idx = 4*i;
fieldp[i][0] = fieldp_ptr[idx];
fieldp[i][1] = fieldp_ptr[idx+1];
fieldp[i][2] = fieldp_ptr[idx+2];
}
// rebuild dipole-dipole pair list and store pairwise dipole matrices
// done one atom at a time in real-space double loop over atoms & neighs

View File

@ -2274,7 +2274,6 @@ void Atom::setup_sort_bins()
#ifdef LMP_GPU
if (userbinsize == 0.0) {
int ifix = modify->find_fix("package_gpu");
/*
if (ifix >= 0) {
const double subx = domain->subhi[0] - domain->sublo[0];
const double suby = domain->subhi[1] - domain->sublo[1];
@ -2298,7 +2297,6 @@ void Atom::setup_sort_bins()
bininvy = bininv;
bininvz = bininv;
}
*/
}
#endif