Fixed bugs in the multipole real-space part on the GPU. Run separately, multipole real and polar real each work correctly (along with udirect2b and umutual2b), but
when run together they conflict because both use `ans` to copy forces back from device to host. The other two kernels (the induce part) do not touch forces or energies.
This commit is contained in:
@ -53,8 +53,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
|
|||||||
const int nlocal, const int nall, const int max_nbors,
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
const int maxspecial, const int maxspecial15,
|
const int maxspecial, const int maxspecial15,
|
||||||
const double cell_size, const double gpu_split, FILE *_screen,
|
const double cell_size, const double gpu_split, FILE *_screen,
|
||||||
const double aewald, const double polar_dscale,
|
const double polar_dscale, const double polar_uscale) {
|
||||||
const double polar_uscale) {
|
|
||||||
int success;
|
int success;
|
||||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||||
cell_size,gpu_split,_screen,amoeba,
|
cell_size,gpu_split,_screen,amoeba,
|
||||||
@ -97,7 +96,6 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
|
|||||||
}
|
}
|
||||||
ucl_copy(sp_polar,dview,5,false);
|
ucl_copy(sp_polar,dview,5,false);
|
||||||
|
|
||||||
_aewald = aewald;
|
|
||||||
_polar_dscale = polar_dscale;
|
_polar_dscale = polar_dscale;
|
||||||
_polar_uscale = polar_uscale;
|
_polar_uscale = polar_uscale;
|
||||||
|
|
||||||
@ -158,7 +156,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) {
|
|||||||
&this->dev_short_nbor,
|
&this->dev_short_nbor,
|
||||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||||
&this->_threads_per_atom, &_aewald, &this->_felec,
|
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||||
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
|
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
|
||||||
this->time_pair.stop();
|
this->time_pair.stop();
|
||||||
|
|
||||||
@ -198,7 +196,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
|
|||||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
&this->dev_short_nbor,
|
&this->dev_short_nbor,
|
||||||
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
|
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
|
||||||
&this->_threads_per_atom, &_aewald, &this->_off2_polar,
|
&this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
|
||||||
&_polar_dscale, &_polar_uscale);
|
&_polar_dscale, &_polar_uscale);
|
||||||
|
|
||||||
this->time_pair.stop();
|
this->time_pair.stop();
|
||||||
@ -237,7 +235,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) {
|
|||||||
this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
|
this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
|
||||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
&this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
|
&this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
|
||||||
&nbor_pitch, &this->_threads_per_atom, &_aewald,
|
&nbor_pitch, &this->_threads_per_atom, &this->_aewald,
|
||||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||||
|
|
||||||
this->time_pair.stop();
|
this->time_pair.stop();
|
||||||
@ -278,7 +276,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
|
|||||||
&this->dev_short_nbor,
|
&this->dev_short_nbor,
|
||||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||||
&this->_threads_per_atom, &_aewald, &this->_felec,
|
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||||
this->time_pair.stop();
|
this->time_pair.stop();
|
||||||
|
|
||||||
|
|||||||
@ -225,20 +225,20 @@ _texture( q_tex,int2);
|
|||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
__kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
__kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
||||||
const __global numtyp *restrict extra,
|
const __global numtyp *restrict extra,
|
||||||
const __global numtyp4 *restrict damping,
|
const __global numtyp4 *restrict damping,
|
||||||
const __global numtyp4 *restrict sp_polar,
|
const __global numtyp4 *restrict sp_polar,
|
||||||
const __global int *dev_nbor,
|
const __global int *dev_nbor,
|
||||||
const __global int *dev_packed,
|
const __global int *dev_packed,
|
||||||
const __global int *dev_short_nbor,
|
const __global int *dev_short_nbor,
|
||||||
__global acctyp4 *restrict ans,
|
__global acctyp4 *restrict ans,
|
||||||
__global acctyp *restrict engv,
|
__global acctyp *restrict engv,
|
||||||
__global numtyp4 *restrict tep,
|
__global numtyp4 *restrict tep,
|
||||||
const int eflag, const int vflag, const int inum,
|
const int eflag, const int vflag, const int inum,
|
||||||
const int nall, const int nbor_pitch, const int t_per_atom,
|
const int nall, const int nbor_pitch, const int t_per_atom,
|
||||||
const numtyp aewald, const numtyp felec,
|
const numtyp aewald, const numtyp felec,
|
||||||
const numtyp off2, const numtyp polar_dscale,
|
const numtyp off2, const numtyp polar_dscale,
|
||||||
const numtyp polar_uscale)
|
const numtyp polar_uscale)
|
||||||
{
|
{
|
||||||
int tid, ii, offset, i;
|
int tid, ii, offset, i;
|
||||||
atom_info(t_per_atom,ii,tid,offset);
|
atom_info(t_per_atom,ii,tid,offset);
|
||||||
@ -257,7 +257,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
acctyp4 tq;
|
acctyp4 tq;
|
||||||
tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.w=(acctyp)0;
|
tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
|
||||||
|
|
||||||
numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
|
numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
|
||||||
numtyp4* polar1 = (numtyp4*)(&extra[0]);
|
numtyp4* polar1 = (numtyp4*)(&extra[0]);
|
||||||
@ -272,7 +272,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
|||||||
numtyp term1,term2,term3;
|
numtyp term1,term2,term3;
|
||||||
numtyp term4,term5;
|
numtyp term4,term5;
|
||||||
numtyp term6,term7;
|
numtyp term6,term7;
|
||||||
numtyp rc3[3],rc5[3],rc7[3];
|
|
||||||
numtyp bn[6];
|
numtyp bn[6];
|
||||||
numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
|
numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
|
||||||
|
|
||||||
@ -309,9 +308,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
|||||||
// debug:
|
// debug:
|
||||||
// xi__ = ix; xi__.w = itype;
|
// xi__ = ix; xi__.w = itype;
|
||||||
|
|
||||||
numtyp pdi = damping[itype].x;
|
|
||||||
numtyp pti = damping[itype].y;
|
|
||||||
|
|
||||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||||
|
|
||||||
int jextra=nbor_mem[nbor];
|
int jextra=nbor_mem[nbor];
|
||||||
@ -326,10 +322,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
|||||||
numtyp zr = jx.z - ix.z;
|
numtyp zr = jx.z - ix.z;
|
||||||
numtyp r2 = xr*xr + yr*yr + zr*zr;
|
numtyp r2 = xr*xr + yr*yr + zr*zr;
|
||||||
|
|
||||||
//if (r2>off2) continue;
|
if (r2>off2) continue;
|
||||||
|
|
||||||
numtyp r = ucl_sqrt(r2);
|
numtyp r = ucl_sqrt(r2);
|
||||||
|
|
||||||
numtyp ck = polar1[j].x; // rpole[j][0];
|
numtyp ck = polar1[j].x; // rpole[j][0];
|
||||||
numtyp dkx = polar1[j].y; // rpole[j][1];
|
numtyp dkx = polar1[j].y; // rpole[j][1];
|
||||||
numtyp dky = polar1[j].z; // rpole[j][2];
|
numtyp dky = polar1[j].z; // rpole[j][2];
|
||||||
@ -363,7 +358,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
|||||||
numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
|
numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
|
||||||
numtyp diqk = dix*qkx + diy*qky + diz*qkz;
|
numtyp diqk = dix*qkx + diy*qky + diz*qkz;
|
||||||
numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
|
numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
|
||||||
numtyp qiqk = (numtyp )2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
|
numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
|
||||||
qixx*qkxx + qiyy*qkyy + qizz*qkzz;
|
qixx*qkxx + qiyy*qkyy + qizz*qkzz;
|
||||||
|
|
||||||
// additional intermediates involving moments and distance
|
// additional intermediates involving moments and distance
|
||||||
@ -452,8 +447,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
|||||||
term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk);
|
term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk);
|
||||||
term4 = dir*qkr - dkr*qir - 4.0*qik;
|
term4 = dir*qkr - dkr*qir - 4.0*qik;
|
||||||
term5 = qir*qkr;
|
term5 = qir*qkr;
|
||||||
|
numtyp scalek = (numtyp)1.0 - factor_mpole;
|
||||||
numtyp scalek = 1.0 - factor_mpole;
|
|
||||||
rr1 = bn[0] - scalek*rr1;
|
rr1 = bn[0] - scalek*rr1;
|
||||||
rr3 = bn[1] - scalek*rr3;
|
rr3 = bn[1] - scalek*rr3;
|
||||||
rr5 = bn[2] - scalek*rr5;
|
rr5 = bn[2] - scalek*rr5;
|
||||||
@ -485,11 +479,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
// compute the torque components for this interaction
|
// compute the torque components for this interaction
|
||||||
|
|
||||||
numtyp tix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) -
|
numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) -
|
||||||
term4*qirx - term6*(qikrx+qikx);
|
term4*qirx - term6*(qikrx+qikx);
|
||||||
numtyp tiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) -
|
numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) -
|
||||||
term4*qiry - term6*(qikry+qiky);
|
term4*qiry - term6*(qikry+qiky);
|
||||||
numtyp tiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) -
|
numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) -
|
||||||
term4*qirz - term6*(qikrz+qikz);
|
term4*qirz - term6*(qikrz+qikz);
|
||||||
|
|
||||||
// increment force-based gradient and torque on first site
|
// increment force-based gradient and torque on first site
|
||||||
@ -497,16 +491,16 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
|||||||
f.x += frcx;
|
f.x += frcx;
|
||||||
f.y += frcy;
|
f.y += frcy;
|
||||||
f.z += frcz;
|
f.z += frcz;
|
||||||
tq.x += tix;
|
tq.x += ttmix;
|
||||||
tq.y += tiy;
|
tq.y += ttmiy;
|
||||||
tq.z += tiz;
|
tq.z += ttmiz;
|
||||||
|
|
||||||
if (EVFLAG && vflag) {
|
if (EVFLAG && vflag) {
|
||||||
numtyp vxx = -xr * frcx;
|
numtyp vxx = -xr * frcx;
|
||||||
numtyp vxy = (numtyp )-0.5 * (yr*frcx+xr*frcy);
|
numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy);
|
||||||
numtyp vxz = (numtyp )-0.5 * (zr*frcx+xr*frcz);
|
numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz);
|
||||||
numtyp vyy = -yr * frcy;
|
numtyp vyy = -yr * frcy;
|
||||||
numtyp vyz = (numtyp )-0.5 * (zr*frcy+yr*frcz);
|
numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz);
|
||||||
numtyp vzz = -zr * frcz;
|
numtyp vzz = -zr * frcz;
|
||||||
|
|
||||||
virial[0] += vxx;
|
virial[0] += vxx;
|
||||||
@ -520,7 +514,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
} // ii<inum
|
} // ii<inum
|
||||||
|
|
||||||
// accumulate ufld and dufld to compute tep
|
// accumulate tq
|
||||||
store_answers_amoeba_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
|
store_answers_amoeba_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
|
||||||
|
|
||||||
// accumulate force, energy and virial
|
// accumulate force, energy and virial
|
||||||
|
|||||||
@ -46,8 +46,7 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
|
|||||||
const int nlocal, const int nall, const int max_nbors,
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||||
const double gpu_split, FILE *_screen,
|
const double gpu_split, FILE *_screen,
|
||||||
const double aewald, const double polar_dscale,
|
const double polar_dscale, const double polar_uscale);
|
||||||
const double polar_uscale);
|
|
||||||
|
|
||||||
/// Clear all host and device data
|
/// Clear all host and device data
|
||||||
/** \note This is called at the beginning of the init() routine **/
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
@ -75,7 +74,7 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
|
|||||||
/// Number of atom types
|
/// Number of atom types
|
||||||
int _lj_types;
|
int _lj_types;
|
||||||
|
|
||||||
numtyp _aewald, _polar_dscale, _polar_uscale;
|
numtyp _polar_dscale, _polar_uscale;
|
||||||
numtyp _qqrd2e;
|
numtyp _qqrd2e;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|||||||
@ -37,8 +37,8 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
|
|||||||
const int nlocal, const int nall, const int max_nbors,
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
const int maxspecial, const int maxspecial15,
|
const int maxspecial, const int maxspecial15,
|
||||||
const double cell_size, int &gpu_mode, FILE *screen,
|
const double cell_size, int &gpu_mode, FILE *screen,
|
||||||
const double aewald, const double polar_dscale,
|
const double polar_dscale, const double polar_uscale,
|
||||||
const double polar_uscale, int& tep_size) {
|
int& tep_size) {
|
||||||
AMOEBAMF.clear();
|
AMOEBAMF.clear();
|
||||||
gpu_mode=AMOEBAMF.device->gpu_mode();
|
gpu_mode=AMOEBAMF.device->gpu_mode();
|
||||||
double gpu_split=AMOEBAMF.device->particle_split();
|
double gpu_split=AMOEBAMF.device->particle_split();
|
||||||
@ -67,7 +67,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
|
|||||||
host_special_mpole, host_special_polar_wscale,
|
host_special_mpole, host_special_polar_wscale,
|
||||||
host_special_polar_piscale, host_special_polar_pscale,
|
host_special_polar_piscale, host_special_polar_pscale,
|
||||||
nlocal, nall, max_nbors, maxspecial, maxspecial15,
|
nlocal, nall, max_nbors, maxspecial, maxspecial15,
|
||||||
cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale);
|
cell_size, gpu_split, screen, polar_dscale, polar_uscale);
|
||||||
|
|
||||||
AMOEBAMF.device->world_barrier();
|
AMOEBAMF.device->world_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
@ -87,7 +87,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
|
|||||||
host_special_mpole, host_special_polar_wscale,
|
host_special_mpole, host_special_polar_wscale,
|
||||||
host_special_polar_piscale, host_special_polar_pscale,
|
host_special_polar_piscale, host_special_polar_pscale,
|
||||||
nlocal, nall, max_nbors, maxspecial, maxspecial15,
|
nlocal, nall, max_nbors, maxspecial, maxspecial15,
|
||||||
cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale);
|
cell_size, gpu_split, screen, polar_dscale, polar_uscale);
|
||||||
|
|
||||||
AMOEBAMF.device->gpu_barrier();
|
AMOEBAMF.device->gpu_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
@ -113,13 +113,13 @@ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
|
|||||||
const bool eflag, const bool vflag, const bool eatom,
|
const bool eflag, const bool vflag, const bool eatom,
|
||||||
const bool vatom, int &host_start,
|
const bool vatom, int &host_start,
|
||||||
int **ilist, int **jnum, const double cpu_time,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double felec, const double off2,
|
bool &success, const double aewald, const double felec, const double off2,
|
||||||
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
|
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
|
||||||
return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
|
return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
|
||||||
host_amtype, host_amgroup, host_rpole, sublo, subhi,
|
host_amtype, host_amgroup, host_rpole, sublo, subhi,
|
||||||
tag, nspecial, special, nspecial15, special15,
|
tag, nspecial, special, nspecial15, special15,
|
||||||
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
||||||
cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr);
|
cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
|
int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
|
||||||
@ -131,13 +131,13 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
|
|||||||
const bool eflag, const bool vflag, const bool eatom,
|
const bool eflag, const bool vflag, const bool eatom,
|
||||||
const bool vatom, int &host_start,
|
const bool vatom, int &host_start,
|
||||||
int **ilist, int **jnum, const double cpu_time,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double off2, double *host_q,
|
bool &success, const double aewald, const double off2, double *host_q,
|
||||||
double *boxlo, double *prd, void **fieldp_ptr) {
|
double *boxlo, double *prd, void **fieldp_ptr) {
|
||||||
return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type,
|
return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type,
|
||||||
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
|
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
|
||||||
sublo, subhi, tag, nspecial, special, nspecial15, special15,
|
sublo, subhi, tag, nspecial, special, nspecial15, special15,
|
||||||
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
||||||
cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr);
|
cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full,
|
int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full,
|
||||||
@ -149,13 +149,13 @@ int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full,
|
|||||||
const bool eflag, const bool vflag,
|
const bool eflag, const bool vflag,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int **ilist, int **jnum, const double cpu_time,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double off2, double *host_q,
|
bool &success, const double aewald, const double off2, double *host_q,
|
||||||
double *boxlo, double *prd, void **fieldp_ptr) {
|
double *boxlo, double *prd, void **fieldp_ptr) {
|
||||||
return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type,
|
return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type,
|
||||||
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
|
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
|
||||||
sublo, subhi, tag, nspecial, special, nspecial15, special15,
|
sublo, subhi, tag, nspecial, special, nspecial15, special15,
|
||||||
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
||||||
cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr);
|
cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full,
|
int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full,
|
||||||
@ -167,13 +167,13 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full,
|
|||||||
const bool eflag, const bool vflag, const bool eatom,
|
const bool eflag, const bool vflag, const bool eatom,
|
||||||
const bool vatom, int &host_start,
|
const bool vatom, int &host_start,
|
||||||
int **ilist, int **jnum, const double cpu_time,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double felec, const double off2,
|
bool &success, const double aewald, const double felec, const double off2,
|
||||||
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
|
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
|
||||||
return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type,
|
return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type,
|
||||||
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
|
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
|
||||||
sublo, subhi, tag, nspecial, special, nspecial15, special15,
|
sublo, subhi, tag, nspecial, special, nspecial15, special15,
|
||||||
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
||||||
cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr);
|
cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
double amoeba_gpu_bytes() {
|
double amoeba_gpu_bytes() {
|
||||||
|
|||||||
@ -252,8 +252,8 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f
|
|||||||
const bool eflag_in, const bool vflag_in,
|
const bool eflag_in, const bool vflag_in,
|
||||||
const bool eatom, const bool vatom,
|
const bool eatom, const bool vatom,
|
||||||
int &host_start, const double cpu_time,
|
int &host_start, const double cpu_time,
|
||||||
bool &success, const double off2_polar, const double felec,
|
bool &success, const double aewald, const double felec,
|
||||||
double *host_q, const int nlocal,
|
const double off2_polar, double *host_q, const int nlocal,
|
||||||
double *boxlo, double *prd, void **tep_ptr) {
|
double *boxlo, double *prd, void **tep_ptr) {
|
||||||
acc_timers();
|
acc_timers();
|
||||||
int eflag, vflag;
|
int eflag, vflag;
|
||||||
@ -440,7 +440,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co
|
|||||||
const bool eflag_in, const bool vflag_in,
|
const bool eflag_in, const bool vflag_in,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int **ilist, int **jnum, const double cpu_time,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double felec, const double off2_mpole,
|
bool &success, const double aewald, const double felec, const double off2_mpole,
|
||||||
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
|
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
|
||||||
acc_timers();
|
acc_timers();
|
||||||
int eflag, vflag;
|
int eflag, vflag;
|
||||||
@ -488,6 +488,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co
|
|||||||
|
|
||||||
_off2_mpole = off2_mpole;
|
_off2_mpole = off2_mpole;
|
||||||
_felec = felec;
|
_felec = felec;
|
||||||
|
_aewald = aewald;
|
||||||
const int red_blocks=multipole_real(eflag,vflag);
|
const int red_blocks=multipole_real(eflag,vflag);
|
||||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||||
device->add_ans_object(ans);
|
device->add_ans_object(ans);
|
||||||
@ -521,8 +522,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
|
|||||||
const bool eflag_in, const bool vflag_in,
|
const bool eflag_in, const bool vflag_in,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int **ilist, int **jnum, const double cpu_time,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double off2_polar, double *host_q,
|
bool &success, const double aewald, const double off2_polar,
|
||||||
double *boxlo, double *prd, void** fieldp_ptr) {
|
double *host_q, double *boxlo, double *prd, void** fieldp_ptr) {
|
||||||
acc_timers();
|
acc_timers();
|
||||||
int eflag, vflag;
|
int eflag, vflag;
|
||||||
if (eatom) eflag=2;
|
if (eatom) eflag=2;
|
||||||
@ -560,6 +561,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
|
|||||||
*fieldp_ptr=_fieldp.host.begin();
|
*fieldp_ptr=_fieldp.host.begin();
|
||||||
|
|
||||||
_off2_polar = off2_polar;
|
_off2_polar = off2_polar;
|
||||||
|
_aewald = aewald;
|
||||||
const int red_blocks=udirect2b(eflag,vflag);
|
const int red_blocks=udirect2b(eflag,vflag);
|
||||||
|
|
||||||
// copy field and fieldp from device to host (_fieldp stores both arrays, one after another)
|
// copy field and fieldp from device to host (_fieldp stores both arrays, one after another)
|
||||||
@ -591,8 +593,8 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i
|
|||||||
const bool eflag_in, const bool vflag_in,
|
const bool eflag_in, const bool vflag_in,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int **ilist, int **jnum, const double cpu_time,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double off2_polar, double *host_q,
|
bool &success, const double aewald, const double off2_polar,
|
||||||
double *boxlo, double *prd, void** fieldp_ptr) {
|
double *host_q, double *boxlo, double *prd, void** fieldp_ptr) {
|
||||||
acc_timers();
|
acc_timers();
|
||||||
int eflag, vflag;
|
int eflag, vflag;
|
||||||
if (eatom) eflag=2;
|
if (eatom) eflag=2;
|
||||||
@ -630,6 +632,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i
|
|||||||
*fieldp_ptr=_fieldp.host.begin();
|
*fieldp_ptr=_fieldp.host.begin();
|
||||||
|
|
||||||
_off2_polar = off2_polar;
|
_off2_polar = off2_polar;
|
||||||
|
_aewald = aewald;
|
||||||
const int red_blocks=umutual2b(eflag,vflag);
|
const int red_blocks=umutual2b(eflag,vflag);
|
||||||
|
|
||||||
// copy field and fieldp from device to host (_fieldp stores both arrays, one after another)
|
// copy field and fieldp from device to host (_fieldp stores both arrays, one after another)
|
||||||
@ -660,8 +663,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const
|
|||||||
const bool eflag_in, const bool vflag_in,
|
const bool eflag_in, const bool vflag_in,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int **ilist, int **jnum, const double cpu_time,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double felec, const double off2_polar,
|
bool &success, const double aewald, const double felec,
|
||||||
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
|
const double off2_polar, double *host_q, double *boxlo,
|
||||||
|
double *prd, void **tep_ptr) {
|
||||||
acc_timers();
|
acc_timers();
|
||||||
int eflag, vflag;
|
int eflag, vflag;
|
||||||
if (eatom) eflag=2;
|
if (eatom) eflag=2;
|
||||||
@ -708,6 +712,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const
|
|||||||
|
|
||||||
_off2_polar = off2_polar;
|
_off2_polar = off2_polar;
|
||||||
_felec = felec;
|
_felec = felec;
|
||||||
|
_aewald = aewald;
|
||||||
const int red_blocks=polar_real(eflag,vflag);
|
const int red_blocks=polar_real(eflag,vflag);
|
||||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||||
device->add_ans_object(ans);
|
device->add_ans_object(ans);
|
||||||
|
|||||||
@ -151,7 +151,7 @@ class BaseAmoeba {
|
|||||||
const bool eflag, const bool vflag,
|
const bool eflag, const bool vflag,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||||
const double felec, const double off2_mpole, double *charge,
|
const double aewald, const double felec, const double off2_mpole, double *charge,
|
||||||
double *boxlo, double *prd, void **tep_ptr);
|
double *boxlo, double *prd, void **tep_ptr);
|
||||||
|
|
||||||
/// Compute the real space part of the permanent field (udirect2b) with device neighboring
|
/// Compute the real space part of the permanent field (udirect2b) with device neighboring
|
||||||
@ -165,7 +165,8 @@ class BaseAmoeba {
|
|||||||
const bool eflag, const bool vflag,
|
const bool eflag, const bool vflag,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||||
const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr);
|
const double aewald, const double off2_polar, double *charge,
|
||||||
|
double *boxlo, double *prd, void **fieldp_ptr);
|
||||||
|
|
||||||
/// Compute the real space part of the induced field (umutual2b) with device neighboring
|
/// Compute the real space part of the induced field (umutual2b) with device neighboring
|
||||||
int** compute_umutual2b(const int ago, const int inum_full, const int nall,
|
int** compute_umutual2b(const int ago, const int inum_full, const int nall,
|
||||||
@ -178,7 +179,8 @@ class BaseAmoeba {
|
|||||||
const bool eflag, const bool vflag,
|
const bool eflag, const bool vflag,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||||
const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr);
|
const double aewald, const double off2_polar, double *charge,
|
||||||
|
double *boxlo, double *prd, void **fieldp_ptr);
|
||||||
|
|
||||||
/// Compute polar real-space with device neighboring
|
/// Compute polar real-space with device neighboring
|
||||||
int** compute_polar_real(const int ago, const int inum_full, const int nall,
|
int** compute_polar_real(const int ago, const int inum_full, const int nall,
|
||||||
@ -190,7 +192,7 @@ class BaseAmoeba {
|
|||||||
const bool eflag, const bool vflag,
|
const bool eflag, const bool vflag,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||||
const double felec, const double off2_polar, double *charge,
|
const double aewald, const double felec, const double off2_polar, double *charge,
|
||||||
double *boxlo, double *prd, void **tep_ptr);
|
double *boxlo, double *prd, void **tep_ptr);
|
||||||
|
|
||||||
/// Compute polar real-space with host neighboring (not active for now)
|
/// Compute polar real-space with host neighboring (not active for now)
|
||||||
@ -200,7 +202,7 @@ class BaseAmoeba {
|
|||||||
double **host_uinp, int *ilist, int *numj,
|
double **host_uinp, int *ilist, int *numj,
|
||||||
int **firstneigh, const bool eflag, const bool vflag,
|
int **firstneigh, const bool eflag, const bool vflag,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
const double cpu_time, bool &success, const double felec, const double off2_polar,
|
const double cpu_time, bool &success, const double aewald, const double felec, const double off2_polar,
|
||||||
double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr);
|
double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr);
|
||||||
|
|
||||||
// -------------------------- DEVICE DATA -------------------------
|
// -------------------------- DEVICE DATA -------------------------
|
||||||
@ -272,7 +274,8 @@ class BaseAmoeba {
|
|||||||
bool short_nbor_avail;
|
bool short_nbor_avail;
|
||||||
UCL_D_Vec<int> *_nbor_data;
|
UCL_D_Vec<int> *_nbor_data;
|
||||||
|
|
||||||
numtyp _felec,_off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar;
|
numtyp _aewald,_felec;
|
||||||
|
numtyp _off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar;
|
||||||
|
|
||||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||||
const char *kname_multipole, const char *kname_udirect2b,
|
const char *kname_multipole, const char *kname_udirect2b,
|
||||||
|
|||||||
@ -369,6 +369,9 @@ void PairAmoeba::multipole_real()
|
|||||||
bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2;
|
bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2;
|
||||||
}
|
}
|
||||||
for (k = 0; k < 6; k++) bn[k] *= felec;
|
for (k = 0; k < 6; k++) bn[k] *= felec;
|
||||||
|
//if (i == 0 && j < 10) {
|
||||||
|
// printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]);
|
||||||
|
//}
|
||||||
|
|
||||||
// find damped multipole intermediates and energy value
|
// find damped multipole intermediates and energy value
|
||||||
|
|
||||||
@ -447,6 +450,10 @@ void PairAmoeba::multipole_real()
|
|||||||
rr9 = bn[4] - scalek*rr9;
|
rr9 = bn[4] - scalek*rr9;
|
||||||
rr11 = bn[5] - scalek*rr11;
|
rr11 = bn[5] - scalek*rr11;
|
||||||
e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9;
|
e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9;
|
||||||
|
if (i == 0 && j < 10) {
|
||||||
|
//printf("j = %d: scalek = %f; rr11 = %f; terms: %f %f %f %f %f\n", j, scalek, rr11, term1, term2, term3, term4, term5);
|
||||||
|
//printf("j = %d: felec = %f; rr1 = %f; bn0 = %f\n", j, felec, rr1, bn[0]);
|
||||||
|
}
|
||||||
|
|
||||||
// find standard multipole intermediates for force and torque
|
// find standard multipole intermediates for force and torque
|
||||||
|
|
||||||
@ -457,6 +464,7 @@ void PairAmoeba::multipole_real()
|
|||||||
term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
|
term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
|
||||||
term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9);
|
term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9);
|
||||||
term6 = 4.0 * rr7;
|
term6 = 4.0 * rr7;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
empole += e;
|
empole += e;
|
||||||
@ -515,16 +523,20 @@ void PairAmoeba::multipole_real()
|
|||||||
tq[i][0] += ttmi[0];
|
tq[i][0] += ttmi[0];
|
||||||
tq[i][1] += ttmi[1];
|
tq[i][1] += ttmi[1];
|
||||||
tq[i][2] += ttmi[2];
|
tq[i][2] += ttmi[2];
|
||||||
|
//if (i == 0 && j < 10) {
|
||||||
|
// printf("j = %d: erfc = %f; f: %f %f %f; tq = %f %f %f\n", j, erfc(ralpha), frcx, frcy, frcz, ttmi[0], ttmi[1], ttmi[2]);
|
||||||
|
//printf("j = %d: terms: %f %f %f; tq = %f %f %f\n", j, term1, term2, term3, qikrx, qikry, qikrz);
|
||||||
|
//}
|
||||||
// increment force-based gradient and torque on second site
|
// increment force-based gradient and torque on second site
|
||||||
|
// commenting out j parts for DEBUGGING
|
||||||
|
|
||||||
fmpole[j][0] -= frcx;
|
fmpole[j][0] -= frcx;
|
||||||
fmpole[j][1] -= frcy;
|
fmpole[j][1] -= frcy;
|
||||||
fmpole[j][2] -= frcz;
|
fmpole[j][2] -= frcz;
|
||||||
tq[j][0] += ttmk[0];
|
tq[j][0] += ttmk[0];
|
||||||
tq[j][1] += ttmk[1];
|
tq[j][1] += ttmk[1];
|
||||||
tq[j][2] += ttmk[2];
|
tq[j][2] += ttmk[2];
|
||||||
|
|
||||||
// increment the virial due to pairwise Cartesian forces
|
// increment the virial due to pairwise Cartesian forces
|
||||||
|
|
||||||
vxx = -xr * frcx;
|
vxx = -xr * frcx;
|
||||||
@ -556,10 +568,11 @@ void PairAmoeba::multipole_real()
|
|||||||
comm->reverse_comm_pair(this);
|
comm->reverse_comm_pair(this);
|
||||||
|
|
||||||
// resolve site torques then increment forces and virial
|
// resolve site torques then increment forces and virial
|
||||||
|
printf("compute multipole real\n");
|
||||||
for (i = 0; i < nlocal; i++) {
|
for (i = 0; i < nlocal; i++) {
|
||||||
torque2force(i,tq[i],fix,fiy,fiz,fmpole);
|
if (i == 0) printf("before fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]);
|
||||||
|
torque2force(i,tq[i],fix,fiy,fiz,fmpole);
|
||||||
|
if (i == 0) printf("after fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]);
|
||||||
iz = zaxis2local[i];
|
iz = zaxis2local[i];
|
||||||
ix = xaxis2local[i];
|
ix = xaxis2local[i];
|
||||||
iy = yaxis2local[i];
|
iy = yaxis2local[i];
|
||||||
@ -575,15 +588,16 @@ void PairAmoeba::multipole_real()
|
|||||||
ziy = x[iy][2] - x[i][2];
|
ziy = x[iy][2] - x[i][2];
|
||||||
|
|
||||||
vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
|
vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
|
||||||
|
vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
|
||||||
|
vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
|
||||||
vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
|
vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
|
||||||
xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
|
xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
|
||||||
vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
|
vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
|
||||||
xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
|
xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
|
||||||
vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
|
|
||||||
vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
|
vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
|
||||||
yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
|
yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
|
||||||
vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
|
|
||||||
|
|
||||||
|
//if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz);
|
||||||
virmpole[0] += vxx;
|
virmpole[0] += vxx;
|
||||||
virmpole[1] += vyy;
|
virmpole[1] += vyy;
|
||||||
virmpole[2] += vzz;
|
virmpole[2] += vzz;
|
||||||
|
|||||||
@ -1176,7 +1176,7 @@ void PairAmoeba::polar_real()
|
|||||||
comm->reverse_comm_pair(this);
|
comm->reverse_comm_pair(this);
|
||||||
|
|
||||||
// torque is induced field and gradient cross permanent moments
|
// torque is induced field and gradient cross permanent moments
|
||||||
|
printf("compute polar real\n");
|
||||||
for (i = 0; i < nlocal; i++) {
|
for (i = 0; i < nlocal; i++) {
|
||||||
dix = rpole[i][1];
|
dix = rpole[i][1];
|
||||||
diy = rpole[i][2];
|
diy = rpole[i][2];
|
||||||
@ -1197,8 +1197,10 @@ void PairAmoeba::polar_real()
|
|||||||
qiyz*dufld[i][3] - qixz*dufld[i][4] +
|
qiyz*dufld[i][3] - qixz*dufld[i][4] +
|
||||||
2.0*qixy*(dufld[i][0]-dufld[i][2]) + (qiyy-qixx)*dufld[i][1];
|
2.0*qixy*(dufld[i][0]-dufld[i][2]) + (qiyy-qixx)*dufld[i][1];
|
||||||
|
|
||||||
|
if (i == 0) printf("before fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]);
|
||||||
torque2force(i,tep,fix,fiy,fiz,fpolar);
|
torque2force(i,tep,fix,fiy,fiz,fpolar);
|
||||||
|
if (i == 0) printf("after fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]);
|
||||||
|
|
||||||
iz = zaxis2local[i];
|
iz = zaxis2local[i];
|
||||||
ix = xaxis2local[i];
|
ix = xaxis2local[i];
|
||||||
iy = yaxis2local[i];
|
iy = yaxis2local[i];
|
||||||
@ -1222,7 +1224,7 @@ void PairAmoeba::polar_real()
|
|||||||
xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
|
xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
|
||||||
vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
|
vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
|
||||||
yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
|
yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
|
||||||
|
//if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz);
|
||||||
virpolar[0] += vxx;
|
virpolar[0] += vxx;
|
||||||
virpolar[1] += vyy;
|
virpolar[1] += vyy;
|
||||||
virpolar[2] += vzz;
|
virpolar[2] += vzz;
|
||||||
|
|||||||
@ -972,6 +972,9 @@ void PairAmoeba::init_style()
|
|||||||
// request neighbor lists
|
// request neighbor lists
|
||||||
|
|
||||||
int irequest = neighbor->request(this,instance_me);
|
int irequest = neighbor->request(this,instance_me);
|
||||||
|
// for DEBUGGING with GPU
|
||||||
|
//neighbor->requests[irequest]->half = 0;
|
||||||
|
//neighbor->requests[irequest]->full = 1;
|
||||||
|
|
||||||
// open debug output files
|
// open debug output files
|
||||||
// names are hard-coded
|
// names are hard-coded
|
||||||
|
|||||||
@ -60,8 +60,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
|
|||||||
const int nlocal, const int nall, const int max_nbors,
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
const int maxspecial, const int maxspecial15,
|
const int maxspecial, const int maxspecial15,
|
||||||
const double cell_size, int &gpu_mode, FILE *screen,
|
const double cell_size, int &gpu_mode, FILE *screen,
|
||||||
const double aewald, const double polar_dscale,
|
const double polar_dscale, const double polar_uscale, int& tep_size);
|
||||||
const double polar_uscale, int& tep_size);
|
|
||||||
void amoeba_gpu_clear();
|
void amoeba_gpu_clear();
|
||||||
|
|
||||||
int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
|
int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
|
||||||
@ -70,8 +69,8 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in
|
|||||||
int **nspecial, tagint **special, int* nspecial15, tagint** special15,
|
int **nspecial, tagint **special, int* nspecial15, tagint** special15,
|
||||||
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
||||||
int &host_start, int **ilist, int **jnum, const double cpu_time,
|
int &host_start, int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double felec, const double off2, double *host_q,
|
bool &success, const double aewald, const double felec, const double off2,
|
||||||
double *boxlo, double *prd, void **tep_ptr);
|
double *host_q, double *boxlo, double *prd, void **tep_ptr);
|
||||||
|
|
||||||
int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
|
int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
|
||||||
double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
|
double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
|
||||||
@ -80,7 +79,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal
|
|||||||
tagint **special, int* nspecial15, tagint** special15,
|
tagint **special, int* nspecial15, tagint** special15,
|
||||||
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
||||||
int &host_start, int **ilist, int **jnum, const double cpu_time,
|
int &host_start, int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double off2, double *host_q,
|
bool &success, const double aewald, const double off2, double *host_q,
|
||||||
double *boxlo, double *prd, void **fieldp_ptr);
|
double *boxlo, double *prd, void **fieldp_ptr);
|
||||||
|
|
||||||
int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall,
|
int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall,
|
||||||
@ -90,7 +89,7 @@ int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nal
|
|||||||
tagint **special, int* nspecial15, tagint** special15,
|
tagint **special, int* nspecial15, tagint** special15,
|
||||||
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
||||||
int &host_start, int **ilist, int **jnum, const double cpu_time,
|
int &host_start, int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double off2, double *host_q,
|
bool &success, const double aewald, const double off2, double *host_q,
|
||||||
double *boxlo, double *prd, void **fieldp_ptr);
|
double *boxlo, double *prd, void **fieldp_ptr);
|
||||||
|
|
||||||
int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall,
|
int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall,
|
||||||
@ -100,8 +99,8 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na
|
|||||||
tagint **special, int* nspecial15, tagint** special15,
|
tagint **special, int* nspecial15, tagint** special15,
|
||||||
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
||||||
int &host_start, int **ilist, int **jnum, const double cpu_time,
|
int &host_start, int **ilist, int **jnum, const double cpu_time,
|
||||||
bool &success, const double felec, const double off2, double *host_q,
|
bool &success, const double aewald, const double felec, const double off2,
|
||||||
double *boxlo, double *prd, void **tep_ptr);
|
double *host_q, double *boxlo, double *prd, void **tep_ptr);
|
||||||
|
|
||||||
double amoeba_gpu_bytes();
|
double amoeba_gpu_bytes();
|
||||||
|
|
||||||
@ -119,7 +118,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
|
|||||||
gpu_multipole_real_ready = true;
|
gpu_multipole_real_ready = true;
|
||||||
gpu_udirect2b_ready = true;
|
gpu_udirect2b_ready = true;
|
||||||
gpu_umutual2b_ready = true;
|
gpu_umutual2b_ready = true;
|
||||||
gpu_polar_real_ready = true;
|
gpu_polar_real_ready = false;
|
||||||
|
|
||||||
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
||||||
}
|
}
|
||||||
@ -174,7 +173,7 @@ void PairAmoebaGPU::init_style()
|
|||||||
special_polar_pscale, atom->nlocal,
|
special_polar_pscale, atom->nlocal,
|
||||||
atom->nlocal+atom->nghost, mnf, maxspecial,
|
atom->nlocal+atom->nghost, mnf, maxspecial,
|
||||||
maxspecial15, cell_size, gpu_mode, screen,
|
maxspecial15, cell_size, gpu_mode, screen,
|
||||||
aewald, polar_dscale, polar_uscale, tep_size);
|
polar_dscale, polar_uscale, tep_size);
|
||||||
GPU_EXTRA::check_flag(success,error,world);
|
GPU_EXTRA::check_flag(success,error,world);
|
||||||
|
|
||||||
if (gpu_mode == GPU_FORCE)
|
if (gpu_mode == GPU_FORCE)
|
||||||
@ -231,14 +230,14 @@ void PairAmoebaGPU::multipole_real()
|
|||||||
atom->nspecial15, atom->special15,
|
atom->nspecial15, atom->special15,
|
||||||
eflag, vflag, eflag_atom, vflag_atom,
|
eflag, vflag, eflag_atom, vflag_atom,
|
||||||
host_start, &ilist, &numneigh, cpu_time,
|
host_start, &ilist, &numneigh, cpu_time,
|
||||||
success, felec, off2, atom->q, domain->boxlo,
|
success, aewald, felec, off2, atom->q,
|
||||||
domain->prd, &tep_pinned);
|
domain->boxlo, domain->prd, &tep_pinned);
|
||||||
|
|
||||||
if (!success)
|
if (!success)
|
||||||
error->one(FLERR,"Insufficient memory on accelerator");
|
error->one(FLERR,"Insufficient memory on accelerator");
|
||||||
|
|
||||||
// reference to the tep array from GPU lib
|
// reference to the tep array from GPU lib
|
||||||
|
printf("compute multipole real\n");
|
||||||
if (tep_single) {
|
if (tep_single) {
|
||||||
float *tep_ptr = (float *)tep_pinned;
|
float *tep_ptr = (float *)tep_pinned;
|
||||||
compute_force_from_tep<float>(tep_ptr, fmpole, virmpole);
|
compute_force_from_tep<float>(tep_ptr, fmpole, virmpole);
|
||||||
@ -727,7 +726,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
|
|||||||
atom->nspecial15, atom->special15,
|
atom->nspecial15, atom->special15,
|
||||||
eflag, vflag, eflag_atom, vflag_atom,
|
eflag, vflag, eflag_atom, vflag_atom,
|
||||||
host_start, &ilist, &numneigh, cpu_time,
|
host_start, &ilist, &numneigh, cpu_time,
|
||||||
success, off2, atom->q, domain->boxlo,
|
success, aewald, off2, atom->q, domain->boxlo,
|
||||||
domain->prd, &fieldp_pinned);
|
domain->prd, &fieldp_pinned);
|
||||||
if (!success)
|
if (!success)
|
||||||
error->one(FLERR,"Insufficient memory on accelerator");
|
error->one(FLERR,"Insufficient memory on accelerator");
|
||||||
@ -951,7 +950,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
|
|||||||
atom->nspecial15, atom->special15,
|
atom->nspecial15, atom->special15,
|
||||||
eflag, vflag, eflag_atom, vflag_atom,
|
eflag, vflag, eflag_atom, vflag_atom,
|
||||||
host_start, &ilist, &numneigh, cpu_time,
|
host_start, &ilist, &numneigh, cpu_time,
|
||||||
success, off2, atom->q, domain->boxlo,
|
success,aewald, off2, atom->q, domain->boxlo,
|
||||||
domain->prd, &fieldp_pinned);
|
domain->prd, &fieldp_pinned);
|
||||||
if (!success)
|
if (!success)
|
||||||
error->one(FLERR,"Insufficient memory on accelerator");
|
error->one(FLERR,"Insufficient memory on accelerator");
|
||||||
@ -1008,7 +1007,7 @@ void PairAmoebaGPU::polar_real()
|
|||||||
}
|
}
|
||||||
inum = atom->nlocal;
|
inum = atom->nlocal;
|
||||||
|
|
||||||
// select the correct cutoff for the term
|
// select the correct cutoff and aewald for the term
|
||||||
|
|
||||||
if (use_ewald) choose(POLAR_LONG);
|
if (use_ewald) choose(POLAR_LONG);
|
||||||
else choose(POLAR);
|
else choose(POLAR);
|
||||||
@ -1024,14 +1023,14 @@ void PairAmoebaGPU::polar_real()
|
|||||||
atom->nspecial15, atom->special15,
|
atom->nspecial15, atom->special15,
|
||||||
eflag, vflag, eflag_atom, vflag_atom,
|
eflag, vflag, eflag_atom, vflag_atom,
|
||||||
host_start, &ilist, &numneigh, cpu_time,
|
host_start, &ilist, &numneigh, cpu_time,
|
||||||
success, felec, off2, atom->q, domain->boxlo,
|
success, aewald, felec, off2, atom->q, domain->boxlo,
|
||||||
domain->prd, &tep_pinned);
|
domain->prd, &tep_pinned);
|
||||||
|
|
||||||
if (!success)
|
if (!success)
|
||||||
error->one(FLERR,"Insufficient memory on accelerator");
|
error->one(FLERR,"Insufficient memory on accelerator");
|
||||||
|
|
||||||
// reference to the tep array from GPU lib
|
// reference to the tep array from GPU lib
|
||||||
|
printf("compute polar real\n");
|
||||||
if (tep_single) {
|
if (tep_single) {
|
||||||
float *tep_ptr = (float *)tep_pinned;
|
float *tep_ptr = (float *)tep_pinned;
|
||||||
compute_force_from_tep<float>(tep_ptr, fpolar, virpolar);
|
compute_force_from_tep<float>(tep_ptr, fpolar, virpolar);
|
||||||
@ -1066,7 +1065,9 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
|
|||||||
tep[1] = tep_ptr[4*i+1];
|
tep[1] = tep_ptr[4*i+1];
|
||||||
tep[2] = tep_ptr[4*i+2];
|
tep[2] = tep_ptr[4*i+2];
|
||||||
|
|
||||||
|
if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]);
|
||||||
torque2force(i,tep,fix,fiy,fiz,force_comp);
|
torque2force(i,tep,fix,fiy,fiz,force_comp);
|
||||||
|
if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]);
|
||||||
|
|
||||||
iz = zaxis2local[i];
|
iz = zaxis2local[i];
|
||||||
ix = xaxis2local[i];
|
ix = xaxis2local[i];
|
||||||
@ -1086,12 +1087,12 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
|
|||||||
vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
|
vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
|
||||||
vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
|
vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
|
||||||
vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
|
vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
|
||||||
xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
|
xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
|
||||||
vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
|
vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
|
||||||
xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
|
xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
|
||||||
vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
|
vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
|
||||||
yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
|
yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
|
||||||
|
//if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz);
|
||||||
virial_comp[0] += vxx;
|
virial_comp[0] += vxx;
|
||||||
virial_comp[1] += vyy;
|
virial_comp[1] += vyy;
|
||||||
virial_comp[2] += vzz;
|
virial_comp[2] += vzz;
|
||||||
|
|||||||
Reference in New Issue
Block a user