Working hippo multipole real-space term, added helper functions in a separate file

This commit is contained in:
Trung Nguyen
2021-09-24 16:44:43 -05:00
parent ad8164dfc0
commit e77df80ce2
7 changed files with 797 additions and 34 deletions

View File

@ -143,7 +143,7 @@ class BaseAmoeba {
double *charge, double *boxlo, double *prd);
/// Compute multipole real-space with device neighboring
int** compute_multipole_real(const int ago, const int inum_full, const int nall,
virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double *sublo, double *subhi,
tagint *tag, int **nspecial, tagint **special,
@ -155,7 +155,7 @@ class BaseAmoeba {
double *boxlo, double *prd, void **tep_ptr);
/// Compute the real space part of the permanent field (udirect2b) with device neighboring
int** compute_udirect2b(const int ago, const int inum_full, const int nall,
virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
@ -169,7 +169,7 @@ class BaseAmoeba {
double *boxlo, double *prd, void **fieldp_ptr);
/// Compute the real space part of the induced field (umutual2b) with device neighboring
int** compute_umutual2b(const int ago, const int inum_full, const int nall,
virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
@ -183,7 +183,7 @@ class BaseAmoeba {
double *boxlo, double *prd, void **fieldp_ptr);
/// Compute polar real-space with device neighboring
int** compute_polar_real(const int ago, const int inum_full, const int nall,
virtual int** compute_polar_real(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double **host_uind,
double **host_uinp, double *sublo, double *subhi,

View File

@ -56,6 +56,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_csix, const double *host_adisp,
const double *host_pcore, const double *host_palpha,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, const double gpu_split, FILE *_screen,
@ -69,7 +70,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
if (success!=0)
return success;
// specific to HIPPO
k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion");
_pval.alloc(this->_max_tep_size,*(this->ucl_device),UCL_READ_ONLY,UCL_READ_ONLY);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
@ -98,8 +101,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
for (int i = 0; i < max_amclass; i++) {
host_write2[i].x = host_csix[i];
host_write2[i].y = host_adisp[i];
host_write2[i].z = (numtyp)0;
host_write2[i].w = (numtyp)0;
host_write2[i].z = host_pcore[i];
host_write2[i].w = host_palpha[i];
}
coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
@ -262,6 +265,93 @@ int HippoT::dispersion_real(const int eflag, const int vflag) {
return GX;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute multipole real-space
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** HippoT::compute_multipole_real(const int ago, const int inum_full,
const int nall, double **host_x,
int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
const double aewald, const double felec,
const double off2_mpole, double *host_q,
double *boxlo, double *prd, void **tep_ptr) {
this->acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
this->set_kernel(eflag,vflag);
// reallocate per-atom arrays, transfer data from the host
// and build the neighbor lists if needed
// NOTE:
// For now we invoke precompute() again here,
// to be able to turn on/off the udirect2b kernel (which comes before this)
// Once all the kernels are ready, precompute() is needed only once
// in the first kernel in a time step.
// We only need to cast uind and uinp from host to device here
// if the neighbor lists are rebuilt and other per-atom arrays
// (x, type, amtype, amgroup, rpole) are ready on the device.
int** firstneigh = nullptr;
firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole,
nullptr, nullptr, sublo, subhi, tag,
nspecial, special, nspecial15, special15,
eflag_in, vflag_in, eatom, vatom,
host_start, ilist, jnum, cpu_time,
success, host_q, boxlo, prd);
// ------------------- Resize _tep array ------------------------
if (inum_full>this->_max_tep_size) {
this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
this->_tep.resize(this->_max_tep_size*4);
}
*tep_ptr=this->_tep.host.begin();
this->_off2_mpole = off2_mpole;
this->_felec = felec;
this->_aewald = aewald;
const int red_blocks=multipole_real(eflag,vflag);
// leave the answers (forces, energies and virial) on the device,
// only copy them back in the last kernel (polar_real)
//ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
//device->add_ans_object(ans);
this->hd_balancer.stop_timer();
// copy tep from device to host
this->_tep.update_host(this->_max_tep_size*4,false);
/*
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
for (int i = 0; i < 10; i++) {
numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
}
*/
return firstneigh; // nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate the multipole real-space term, returning tep
// ---------------------------------------------------------------------------
@ -290,13 +380,14 @@ int HippoT::multipole_real(const int eflag, const int vflag) {
&nbor_pitch, &this->_threads_per_atom);
this->k_multipole.set_size(GX,BX);
this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_felec,
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
this->k_multipole.run(&this->atom->x, &this->atom->extra, &_pval,
&coeff_amtype, &coeff_amclass, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_felec,
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;

View File

@ -15,7 +15,8 @@
#if defined(NV_KERNEL) || defined(USE_HIP)
#include <stdio.h>
#include "lal_aux_fun1.h"
#include "lal_hippo_extra.h"
//#include "lal_aux_fun1.h"
#ifdef LAMMPS_SMALLBIG
#define tagint int
#endif
@ -404,6 +405,318 @@ _texture( q_tex,int2);
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define MY_PIS (acctyp)1.77245385090551602729
/* ----------------------------------------------------------------------
repulsion = Pauli repulsion interactions
adapted from Tinker erepel1b() routine
------------------------------------------------------------------------- */
__kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
const __global numtyp *restrict extra,
const __global numtyp4 *restrict coeff,
const __global numtyp4 *restrict sp_nonpolar,
const __global int *dev_nbor,
const __global int *dev_packed,
const __global int *dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global acctyp4 *restrict tep,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
const int t_per_atom, const numtyp aewald,
const numtyp off2, const numtyp cut2,
const numtyp c0, const numtyp c1, const numtyp c2,
const numtyp c3, const numtyp c4, const numtyp c5)
{
int tid, ii, offset, i;
atom_info(t_per_atom,ii,tid,offset);
int n_stride;
local_allocate_store_charge();
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int l=0; l<6; l++) virial[l]=(acctyp)0;
}
acctyp4 tq;
tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
numtyp4* polar1 = (numtyp4*)(&extra[0]);
numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
if (ii<inum) {
int numj, nbor, nbor_end;
const __global int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//numtyp qtmp; fetch(qtmp,i,q_tex);
//int itype=ix.w;
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
const numtyp4 pol1i = polar1[i];
numtyp ci = pol1i.x; // rpole[i][0];
numtyp dix = pol1i.y; // rpole[i][1];
numtyp diy = pol1i.z; // rpole[i][2];
numtyp diz = pol1i.w; // rpole[i][3];
const numtyp4 pol2i = polar2[i];
numtyp qixx = pol2i.x; // rpole[i][4];
numtyp qixy = pol2i.y; // rpole[i][5];
numtyp qixz = pol2i.z; // rpole[i][6];
numtyp qiyy = pol2i.w; // rpole[i][8];
const numtyp4 pol3i = polar3[i];
numtyp qiyz = pol3i.x; // rpole[i][9];
numtyp qizz = pol3i.y; // rpole[i][12];
int itype = pol3i.z; // amtype[i];
numtyp sizi = coeff[itype].x; // sizpr[itype];
numtyp dmpi = coeff[itype].y; // dmppr[itype];
numtyp vali = coeff[itype].z; // elepr[itype];
for ( ; nbor<nbor_end; nbor+=n_stride) {
int jextra=nbor_mem[nbor];
int j = jextra & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int jtype=jx.w;
// Compute r12
numtyp xr = ix.x - jx.x;
numtyp yr = ix.y - jx.y;
numtyp zr = ix.z - jx.z;
numtyp r2 = xr*xr + yr*yr + zr*zr;
if (r2>off2) continue;
const numtyp4 pol1j = polar1[j];
numtyp ck = pol1j.x; // rpole[j][0];
numtyp dkx = pol1j.y; // rpole[j][1];
numtyp dky = pol1j.z; // rpole[j][2];
numtyp dkz = pol1j.w; // rpole[j][3];
const numtyp4 pol2j = polar2[j];
numtyp qkxx = pol2j.x; // rpole[j][4];
numtyp qkxy = pol2j.y; // rpole[j][5];
numtyp qkxz = pol2j.z; // rpole[j][6];
numtyp qkyy = pol2j.w; // rpole[j][8];
const numtyp4 pol3j = polar3[j];
numtyp qkyz = pol3j.x; // rpole[j][9];
numtyp qkzz = pol3j.y; // rpole[j][12];
int jtype = pol3j.z; // amtype[j];
numtyp sizk = coeff[jtype].x; // sizpr[jtype];
numtyp dmpk = coeff[jtype].y; // dmppr[jtype];
numtyp valk = coeff[jtype].z; // elepr[jtype];
const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)];
numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)];
// intermediates involving moments and separation distance
numtyp dir = dix*xr + diy*yr + diz*zr;
numtyp qix = qixx*xr + qixy*yr + qixz*zr;
numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
numtyp qir = qix*xr + qiy*yr + qiz*zr;
numtyp dkr = dkx*xr + dky*yr + dkz*zr;
numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
numtyp qkr = qkx*xr + qky*yr + qkz*zr;
numtyp dik = dix*dkx + diy*dky + diz*dkz;
numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
numtyp diqk = dix*qkx + diy*qky + diz*qkz;
numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
qixx*qkxx + qiyy*qkyy + qizz*qkzz;
// additional intermediates involving moments and distance
numtyp dirx = diy*zr - diz*yr;
numtyp diry = diz*xr - dix*zr;
numtyp dirz = dix*yr - diy*xr;
numtyp dkrx = dky*zr - dkz*yr;
numtyp dkry = dkz*xr - dkx*zr;
numtyp dkrz = dkx*yr - dky*xr;
numtyp dikx = diy*dkz - diz*dky;
numtyp diky = diz*dkx - dix*dkz;
numtyp dikz = dix*dky - diy*dkx;
numtyp qirx = qiz*yr - qiy*zr;
numtyp qiry = qix*zr - qiz*xr;
numtyp qirz = qiy*xr - qix*yr;
numtyp qkrx = qkz*yr - qky*zr;
numtyp qkry = qkx*zr - qkz*xr;
numtyp qkrz = qky*xr - qkx*yr;
numtyp qikx = qky*qiz - qkz*qiy;
numtyp qiky = qkz*qix - qkx*qiz;
numtyp qikz = qkx*qiy - qky*qix;
numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz;
numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz;
numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz;
numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz;
numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz;
numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz;
numtyp qikrx = qizk*yr - qiyk*zr;
numtyp qikry = qixk*zr - qizk*xr;
numtyp qikrz = qiyk*xr - qixk*yr;
numtyp qkirx = qkzi*yr - qkyi*zr;
numtyp qkiry = qkxi*zr - qkzi*xr;
numtyp qkirz = qkyi*xr - qkxi*yr;
numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
numtyp diqkrx = diqkz*yr - diqky*zr;
numtyp diqkry = diqkx*zr - diqkz*xr;
numtyp diqkrz = diqky*xr - diqkx*yr;
numtyp dkqirx = dkqiz*yr - dkqiy*zr;
numtyp dkqiry = dkqix*zr - dkqiz*xr;
numtyp dkqirz = dkqiy*xr - dkqix*yr;
numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy -
(numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz);
numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz -
(numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz);
numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix -
(numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz);
// get reciprocal distance terms for this interaction
numtyp r = ucl_sqrt(r2);
numtyp rinv = ucl_recip(r);
numtyp r2inv = rinv*rinv;
numtyp rr1 = rinv;
numtyp rr3 = rr1 * r2inv;
numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
numtyp rr11 = (numtyp)9.0 * rr9 * r2inv;
// get damping coefficients for the Pauli repulsion energy
numtyp dmpik[11];
damprep(r,r2,rr1,rr3,rr5,rr7,rr9,rr11,11,dmpi,dmpk,dmpik);
// calculate intermediate terms needed for the energy
numtyp term1 = vali*valk;
numtyp term2 = valk*dir - vali*dkr + dik;
numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk);
numtyp term4 = dir*qkr - dkr*qir - 4.0*qik;
numtyp term5 = qir*qkr;
numtyp eterm = term1*dmpik[0] + term2*dmpik[2] +
term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8];
// compute the Pauli repulsion energy for this interaction
numtyp sizik = sizi * sizk * factor_repel;
numtyp e = sizik * eterm * rr1;
// calculate intermediate terms for force and torque
numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] +
term4*dmpik[8] + term5*dmpik[10];
term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6];
term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6];
term3 = (numtyp)2.0 * dmpik[4];
term4 = (numtyp)2.0 * (-valk*dmpik[4] + dkr*dmpik[6] - qkr*dmpik[8]);
term5 = (numtyp)2.0 * (-vali*dmpik[4] - dir*dmpik[6] - qir*dmpik[8]);
numtyp term6 = (numtyp)4.0 * dmpik[6];
// compute the force components for this interaction
numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) +
term4*qix + term5*qkx + term6*(qixk+qkxi);
numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) +
term4*qiy + term5*qky + term6*(qiyk+qkyi);
numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) +
term4*qiz + term5*qkz + term6*(qizk+qkzi);
frcx = frcx*rr1 + eterm*rr3*xr;
frcy = frcy*rr1 + eterm*rr3*yr;
frcz = frcz*rr1 + eterm*rr3*zr;
// compute the torque components for this interaction
numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) -
term4*qirx - term6*(qikrx+qikx);
numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) -
term4*qiry - term6*(qikry+qiky);
numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) -
term4*qirz - term6*(qikrz+qikz);
ttmix = sizik * ttmix * rr1;
ttmiy = sizik * ttmiy * rr1;
ttmiz = sizik * ttmiz * rr1;
// use energy switching if near the cutoff distance
if (r2 > cut2) {
numtyp r3 = r2 * r;
numtyp r4 = r2 * r2;
numtyp r5 = r2 * r3;
numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0;
numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp).0*c4*r3 +
(numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1;
dtaper *= e * rr1;
e *= taper;
frcx = frcx*taper - dtaper*xr;
frcy = frcy*taper - dtaper*yr;
frcz = frcz*taper - dtaper*zr;
ttmix *= taper;
ttmiy *= taper;
ttmiz *= taper;
}
energy += e;
// increment force-based gradient and torque on atom I
f.x += frcx;
f.y += frcy;
f.z += frcz;
tq.x += ttmix;
tq.y += ttmiy;
tq.z += ttmiz;
// increment the internal virial tensor components
if (EVFLAG && vflag) {
numtyp vxx = -xr * frcx;
numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy);
numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz);
numtyp vyy = -yr * frcy;
numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz);
numtyp vzz = -zr * frcz;
virial[0] += vxx;
virial[1] += vyy;
virial[2] += vzz;
virial[3] += vxy;
virial[4] += vxz;
virial[5] += vyz;
}
} // nbor
} // ii<inum
// accumulate tq
store_answers_hippo_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
// accumate force, energy and virial
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
offset,eflag,vflag,ans,engv);
}
/* ----------------------------------------------------------------------
dispersion = real-space portion of Ewald dispersion
adapted from Tinker edreal1d() routine
@ -594,20 +907,22 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
------------------------------------------------------------------------- */
__kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
const __global numtyp *restrict extra,
const __global numtyp4 *restrict coeff,
const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor,
const __global int *dev_packed,
const __global int *dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global acctyp4 *restrict tep,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
const int t_per_atom, const numtyp aewald,
const numtyp felec, const numtyp off2,
const numtyp polar_dscale, const numtyp polar_uscale)
const __global numtyp *restrict extra,
const __global numtyp *restrict pval,
const __global numtyp4 *restrict coeff_amtype,
const __global numtyp4 *restrict coeff_amclass,
const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor,
const __global int *dev_packed,
const __global int *dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global acctyp4 *restrict tep,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
const int t_per_atom, const numtyp aewald,
const numtyp felec, const numtyp off2,
const numtyp polar_dscale, const numtyp polar_uscale)
{
int tid, ii, offset, i;
atom_info(t_per_atom,ii,tid,offset);
@ -633,6 +948,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
if (ii<inum) {
int m;
int itype,iclass;
numtyp bfac;
numtyp term1,term2,term3;
numtyp term4,term5,term6;
@ -668,6 +984,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
const numtyp4 pol3i = polar3[i];
numtyp qiyz = pol3i.x; // rpole[i][9];
numtyp qizz = pol3i.y; // rpole[i][12];
itype = pol3i.z; // amtype[i];
iclass = coeff_amtype[itype].w; // amtype2class[itype];
numtyp corei = coeff_amclass[itype].z; // pcore[iclass];
numtyp alphai = coeff_amclass[itype].w; // palpha[iclass];
numtyp vali = pval[i];
for ( ; nbor<nbor_end; nbor+=n_stride) {
@ -701,10 +1023,15 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
numtyp qkzz = pol3j.y; // rpole[j][12];
int jtype = pol3j.z; // amtype[j];
int jgroup = pol3j.w; // amgroup[j];
int jclass = coeff_amtype[jtype].w; // amtype2class[jtype];
const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)];
numtyp corek = coeff_amclass[jtype].z; // pcore[jclass];
numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass];
numtyp valk = pval[j];
// intermediates involving moments and separation distance
numtyp dir = dix*xr + diy*yr + diz*zr;

View File

@ -48,6 +48,7 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_csix, const double *host_adisp,
const double *host_pcore, const double *host_palpha,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15, const double cell_size,
const double gpu_split, FILE *_screen,
@ -65,6 +66,18 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
const double aewald, const double off2_disp, double *charge,
double *boxlo, double *prd);
/// Compute multipole real-space with device neighboring
virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double *sublo, double *subhi,
tagint *tag, int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
const double aewald, const double felec, const double off2_mpole, double *charge,
double *boxlo, double *prd, void **tep_ptr);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -105,6 +118,8 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
UCL_Kernel k_dispersion;
UCL_Vector<acctyp,acctyp> _pval;
protected:
bool _allocated;
int dispersion_real(const int eflag, const int vflag);

View File

@ -38,6 +38,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_csix, const double *host_adisp,
const double *host_pcore, const double *host_palpha,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, int &gpu_mode, FILE *screen,
@ -73,7 +74,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass
host_special_repel, host_special_disp,
host_special_mpole, host_special_polar_wscale,
host_special_polar_piscale, host_special_polar_pscale,
host_csix, host_adisp, nlocal, nall, max_nbors,
host_csix, host_adisp, host_pcore, host_palpha,
nlocal, nall, max_nbors,
maxspecial, maxspecial15, cell_size, gpu_split,
screen, polar_dscale, polar_uscale);
@ -97,7 +99,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass
host_special_repel, host_special_disp,
host_special_mpole, host_special_polar_wscale,
host_special_polar_piscale, host_special_polar_pscale,
host_csix, host_adisp, nlocal, nall, max_nbors,
host_csix, host_adisp, host_pcore, host_palpha,
nlocal, nall, max_nbors,
maxspecial, maxspecial15, cell_size, gpu_split,
screen, polar_dscale, polar_uscale);

326
lib/gpu/lal_hippo_extra.h Normal file
View File

@ -0,0 +1,326 @@
/// **************************************************************************
// hippo_extra.h
// -------------------
// Trung Dac Nguyen
//
// Device code for hippo math routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : ndactrung@gmail.com
// ***************************************************************************/*
#ifndef LAL_HIPPO_EXTRA_H
#define LAL_HIPPO_EXTRA_H
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif
#define MY_PI2 (numtyp)1.57079632679489661923
#define MY_PI4 (numtyp)0.78539816339744830962
/* ----------------------------------------------------------------------
damprep generates coefficients for the Pauli repulsion
damping function for powers of the interatomic distance
literature reference:
J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An
Anisotropic, Atomic Multipole Model", Journal of Chemical Physics,
150, 084104 (2019)
------------------------------------------------------------------------- */
ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
const numtyp rr3, const numtyp rr5, const numtyp rr7,
const numtyp rr9, const numtyp rr11, const int rorder,
const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11])
{
numtyp r3,r4;
numtyp r5,r6,r7,r8;
numtyp s,ds,d2s;
numtyp d3s,d4s,d5s;
numtyp dmpi2,dmpk2;
numtyp dmpi22,dmpi23;
numtyp dmpi24,dmpi25;
numtyp dmpi26,dmpi27;
numtyp dmpk22,dmpk23;
numtyp dmpk24,dmpk25;
numtyp dmpk26;
numtyp eps,diff;
numtyp expi,expk;
numtyp dampi,dampk;
numtyp pre,term,tmp;
// compute tolerance value for damping exponents
eps = (numtyp)0.001;
diff = dmpi-dmpk;
if (diff < (numtyp)0) diff = -diff;
// treat the case where alpha damping exponents are equal
if (diff < eps) {
r3 = r2 * r;
r4 = r3 * r;
r5 = r4 * r;
r6 = r5 * r;
r7 = r6 * r;
dmpi2 = (numtyp)0.5 * dmpi;
dampi = dmpi2 * r;
expi = ucl_exp(-dampi);
dmpi22 = dmpi2 * dmpi2;
dmpi23 = dmpi22 * dmpi2;
dmpi24 = dmpi23 * dmpi2;
dmpi25 = dmpi24 * dmpi2;
dmpi26 = dmpi25 * dmpi2;
pre = (numtyp)128.0;
s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi;
ds = (dmpi22*r3 + dmpi23*r4) * expi / 3.0;
d2s = dmpi24 * expi * r5 / 9.0;
d3s = dmpi25 * expi * r6 / 45.0;
d4s = (dmpi25*r6 + dmpi26*r7) * expi / 315.0;
if (rorder >= 11) {
r8 = r7 * r;
dmpi27 = dmpi2 * dmpi26;
d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0;
}
// treat the case where alpha damping exponents are unequal
} else {
r3 = r2 * r;
r4 = r3 * r;
r5 = r4 * r;
dmpi2 = 0.5 * dmpi;
dmpk2 = 0.5 * dmpk;
dampi = dmpi2 * r;
dampk = dmpk2 * r;
expi = exp(-dampi);
expk = exp(-dampk);
dmpi22 = dmpi2 * dmpi2;
dmpi23 = dmpi22 * dmpi2;
dmpi24 = dmpi23 * dmpi2;
dmpi25 = dmpi24 * dmpi2;
dmpk22 = dmpk2 * dmpk2;
dmpk23 = dmpk22 * dmpk2;
dmpk24 = dmpk23 * dmpk2;
dmpk25 = dmpk24 * dmpk2;
term = dmpi22 - dmpk22;
pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0);
tmp = 4.0 * dmpi2 * dmpk2 / term;
s = (dampi-tmp)*expk + (dampk+tmp)*expi;
ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term -
4.0*dmpi2*dmpk2/term) * expk +
(dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi;
d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 -
(4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term -
4.0*dmpi2*dmpk2/term) * expk +
(dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 +
(4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term +
4.0*dmpi2*dmpk2/term) * expi;
d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 -
(4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term -
4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk +
(dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 +
(4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term +
4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi;
d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 +
dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 -
(4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term -
(12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term -
4.0*dmpi2*dmpk2/term) * expk +
(dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 +
dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 +
(4.0/105.0)*dmpi25*dmpk2*r4/term + (8.0/21.0)*dmpi24*dmpk2*r3/term +
(12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term +
4.0*dmpi2*dmpk2/term) * expi;
if (rorder >= 11) {
r6 = r5 * r;
dmpi26 = dmpi25 * dmpi2;
dmpk26 = dmpk25 * dmpk2;
d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 +
dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 -
(4.0/945.0)*dmpi2*dmpk26*r5/term -
(4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term -
(16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term -
4.0*dmpi2*dmpk2/term) * expk +
(dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 +
dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 +
(4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term +
(4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term +
4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi;
}
}
// convert partial derivatives into full derivatives
s = s * rr1;
ds = ds * rr3;
d2s = d2s * rr5;
d3s = d3s * rr7;
d4s = d4s * rr9;
d5s = d5s * rr11;
dmpik[0] = 0.5 * pre * s * s;
dmpik[2] = pre * s * ds;
dmpik[4] = pre * (s*d2s + ds*ds);
dmpik[6] = pre * (s*d3s + 3.0*ds*d2s);
dmpik[8] = pre * (s*d4s + 4.0*ds*d3s + 3.0*d2s*d2s);
if (rorder >= 11) dmpik[10] = pre * (s*d5s + 5.0*ds*d4s + 10.0*d2s*d3s);
}
/* ----------------------------------------------------------------------
damppole generates coefficients for the charge penetration
damping function for powers of the interatomic distance
literature references:
L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the
Effective Fragment Potential Method: Theory and Application to
the Benzene Dimer", Journal of Computational Chemistry, 28,
276-291 (2007) [Gordon f1 and f2 models]
J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and
J. W. Ponder, "An Optimized Charge Penetration Model for Use with
the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19,
276-291 (2017)
------------------------------------------------------------------------- */
ucl_inline void damppole(const numtyp r, const int rorder,
const numtyp alphai, const numtyp alphak,
numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11])
{
numtyp termi,termk;
numtyp termi2,termk2;
numtyp alphai2,alphak2;
numtyp eps,diff;
numtyp expi,expk;
numtyp dampi,dampk;
numtyp dampi2,dampi3;
numtyp dampi4,dampi5;
numtyp dampi6,dampi7;
numtyp dampi8;
numtyp dampk2,dampk3;
numtyp dampk4,dampk5;
numtyp dampk6;
// compute tolerance and exponential damping factors
eps = 0.001;
diff = fabs(alphai-alphak);
dampi = alphai * r;
dampk = alphak * r;
expi = exp(-dampi);
expk = exp(-dampk);
// core-valence charge penetration damping for Gordon f1
dampi2 = dampi * dampi;
dampi3 = dampi * dampi2;
dampi4 = dampi2 * dampi2;
dampi5 = dampi2 * dampi3;
dmpi[0] = 1.0 - (1.0 + 0.5*dampi)*expi;
dmpi[2] = 1.0 - (1.0 + dampi + 0.5*dampi2)*expi;
dmpi[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi;
dmpi[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + dampi4/30.0)*expi;
dmpi[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 +
4.0*dampi4/105.0 + dampi5/210.0)*expi;
if (diff < eps) {
dmpk[0] = dmpi[0];
dmpk[2] = dmpi[2];
dmpk[4] = dmpi[4];
dmpk[6] = dmpi[6];
dmpk[8] = dmpi[8];
} else {
dampk2 = dampk * dampk;
dampk3 = dampk * dampk2;
dampk4 = dampk2 * dampk2;
dampk5 = dampk2 * dampk3;
dmpk[0] = 1.0 - (1.0 + 0.5*dampk)*expk;
dmpk[2] = 1.0 - (1.0 + dampk + 0.5*dampk2)*expk;
dmpk[4] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk;
dmpk[6] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk;
dmpk[8] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 +
4.0*dampk4/105.0 + dampk5/210.0)*expk;
}
// valence-valence charge penetration damping for Gordon f1
if (diff < eps) {
dampi6 = dampi3 * dampi3;
dampi7 = dampi3 * dampi4;
dmpik[0] = 1.0 - (1.0 + 11.0*dampi/16.0 + 3.0*dampi2/16.0 +
dampi3/48.0)*expi;
dmpik[2] = 1.0 - (1.0 + dampi + 0.5*dampi2 +
7.0*dampi3/48.0 + dampi4/48.0)*expi;
dmpik[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 +
dampi4/24.0 + dampi5/144.0)*expi;
dmpik[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 +
dampi4/24.0 + dampi5/120.0 + dampi6/720.0)*expi;
dmpik[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 +
dampi4/24.0 + dampi5/120.0 + dampi6/720.0 +
dampi7/5040.0)*expi;
if (rorder >= 11) {
dampi8 = dampi4 * dampi4;
dmpik[10] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 +
dampi4/24.0 + dampi5/120.0 + dampi6/720.0 +
dampi7/5040.0 + dampi8/45360.0)*expi;
}
} else {
alphai2 = alphai * alphai;
alphak2 = alphak * alphak;
termi = alphak2 / (alphak2-alphai2);
termk = alphai2 / (alphai2-alphak2);
termi2 = termi * termi;
termk2 = termk * termk;
dmpik[0] = 1.0 - termi2*(1.0 + 2.0*termk + 0.5*dampi)*expi -
termk2*(1.0 + 2.0*termi + 0.5*dampk)*expk;
dmpik[2] = 1.0 - termi2*(1.0+dampi+0.5*dampi2)*expi -
termk2*(1.0+dampk+0.5*dampk2)*expk -
2.0*termi2*termk*(1.0+dampi)*expi -
2.0*termk2*termi*(1.0+dampk)*expk;
dmpik[4] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi -
termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk -
2.0*termi2*termk*(1.0 + dampi + dampi2/3.0)*expi -
2.0*termk2*termi*(1.0 + dampk + dampk2/3.0)*expk;
dmpik[6] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 +
dampi3/6.0 + dampi4/30.0)*expi -
termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk -
2.0*termi2*termk*(1.0 + dampi + 2.0*dampi2/5.0 + dampi3/15.0)*expi -
2.0*termk2*termi*(1.0 + dampk + 2.0*dampk2/5.0 + dampk3/15.0)*expk;
dmpik[8] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 +
4.0*dampi4/105.0 + dampi5/210.0)*expi -
termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 +
4.0*dampk4/105.0 + dampk5/210.0)*expk -
2.0*termi2*termk*(1.0 + dampi + 3.0*dampi2/7.0 +
2.0*dampi3/21.0 + dampi4/105.0)*expi -
2.0*termk2*termi*(1.0 + dampk + 3.0*dampk2/7.0 +
2.0*dampk3/21.0 + dampk4/105.0)*expk;
if (rorder >= 11) {
dampi6 = dampi3 * dampi3;
dampk6 = dampk3 * dampk3;
dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 +
5.0*dampi4/126.0 + 2.0*dampi5/315.0 +
dampi6/1890.0)*expi -
termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 +
2.0*dampk5/315.0 + dampk6/1890.0)*expk -
2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 +
dampi4/63.0 + dampi5/945.0)*expi -
2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 +
dampk4/63.0 + dampk5/945.0)*expk;
}
}
}
#endif

View File

@ -59,6 +59,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_csix, const double *host_adisp,
const double *host_pcore, const double *host_palpha,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, int &gpu_mode, FILE *screen,
@ -191,10 +192,10 @@ void PairHippoGPU::init_style()
pdamp, thole, dirdamp, amtype2class, special_hal,
special_repel, special_disp, special_mpole,
special_polar_wscale, special_polar_piscale,
special_polar_pscale, csix, adisp, atom->nlocal,
atom->nlocal+atom->nghost, mnf, maxspecial,
maxspecial15, cell_size, gpu_mode, screen,
polar_dscale, polar_uscale, tq_size);
special_polar_pscale, csix, adisp, pcore, palpha,
atom->nlocal, atom->nlocal+atom->nghost, mnf,
maxspecial, maxspecial15, cell_size, gpu_mode,
screen, polar_dscale, polar_uscale, tq_size);
GPU_EXTRA::check_flag(success,error,world);
if (gpu_mode == GPU_FORCE)