Added work on amoeba/gpu, some minor changes to PairAmoeba to allow function overriding in PairAmoebaGPU, added the package AMOEBA to cmake/CMakeLists.txt

This commit is contained in:
Trung Nguyen
2021-08-25 22:57:37 -05:00
parent 9c095e8d76
commit 3825fee8e9
19 changed files with 2228 additions and 6 deletions

View File

@ -140,6 +140,7 @@ option(CMAKE_VERBOSE_MAKEFILE "Generate verbose Makefiles" OFF)
set(STANDARD_PACKAGES
ADIOS
AMOEBA
ASPHERE
ATC
AWPMD
@ -308,6 +309,7 @@ endif()
pkg_depends(ML-IAP ML-SNAP)
pkg_depends(MPIIO MPI)
pkg_depends(ATC MANYBODY)
pkg_depends(AMOEBA KSPACE)
pkg_depends(LATBOLTZ MPI)
pkg_depends(PHONON KSPACE)
pkg_depends(SCAFACOS MPI)

155
lib/gpu/lal_amoeba.cpp Normal file
View File

@ -0,0 +1,155 @@
/***************************************************************************
amoeba.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the amoeba pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#if defined(USE_OPENCL)
#include "amoeba_cl.h"
#elif defined(USE_CUDART)
const char *amoeba=0;
#else
#include "amoeba_cubin.h"
#endif
#include "lal_amoeba.h"
#include <cassert>
namespace LAMMPS_AL {
#define AmoebaT Amoeba<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
AmoebaT::Amoeba() : BaseAmoeba<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
AmoebaT::~Amoeba() {
clear();
}
template <class numtyp, class acctyp>
int AmoebaT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp,
const double *host_thole, const double *host_special_polar_wscale,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, const double gpu_split, FILE *_screen,
const double aewald, const double felec,
const double off2, const double polar_dscale,
const double polar_uscale) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
for (int i = 0; i < max_amtype; i++) {
host_write[i].x = host_pdamp[i];
host_write[i].y = host_thole[i];
host_write[i].z = (numtyp)0;
host_write[i].w = (numtyp)0;
}
damping.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(damping,host_write,false);
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<5; i++) {
dview[i].x=host_special_polar_wscale[i];
dview[i].y=host_special_polar_piscale[i];
dview[i].z=host_special_polar_pscale[i];
dview[i].w=(numtyp)0;
}
ucl_copy(sp_polar,dview,5,false);
_aewald = aewald;
_felec = felec;
_off2 = off2;
_polar_dscale = polar_dscale;
_polar_uscale = polar_uscale;
_allocated=true;
this->_max_bytes=damping.row_bytes()
+ sp_polar.row_bytes()
+ this->_tep.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void AmoebaT::clear() {
if (!_allocated)
return;
_allocated=false;
damping.clear();
sp_polar.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double AmoebaT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int AmoebaT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int _nall=this->atom->nall();
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
this->k_polar.set_size(GX,BX);
this->k_polar.run(&this->atom->x, &this->atom->extra,
&damping, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom,
&_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}
template class Amoeba<PRECISION,ACC_PRECISION>;
}

684
lib/gpu/lal_amoeba.cu Normal file
View File

@ -0,0 +1,684 @@
// **************************************************************************
// amoeba.cu
// -------------------
// Trung Dac Nguyen (Northwestern)
//
// Device code for acceleration of the amoeba pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : trung.nguyen@northwestern.edu
// ***************************************************************************
#if defined(NV_KERNEL) || defined(USE_HIP)
#include <stdio.h>
#include "lal_aux_fun1.h"
#ifdef LAMMPS_SMALLBIG
#define tagint int
#endif
#ifdef LAMMPS_BIGBIG
#include "inttypes.h"
#define tagint int64_t
#endif
#ifdef LAMMPS_SMALLSMALL
#define tagint int
#endif
#ifndef _DOUBLE_DOUBLE
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
#if (SHUFFLE_AVAIL == 0)
#define local_allocate_store_ufld() \
__local acctyp red_acc[6][BLOCK_PAIR];
#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset, \
i, tep) \
if (t_per_atom>1) { \
red_acc[0][tid]=ufld[0]; \
red_acc[1][tid]=ufld[1]; \
red_acc[2][tid]=ufld[2]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
simdsync(); \
if (offset < s) { \
for (int r=0; r<3; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
ufld[0]=red_acc[0][tid]; \
ufld[1]=red_acc[1][tid]; \
ufld[2]=red_acc[2][tid]; \
red_acc[0][tid]=dufld[0]; \
red_acc[1][tid]=dufld[1]; \
red_acc[2][tid]=dufld[2]; \
red_acc[3][tid]=dufld[3]; \
red_acc[4][tid]=dufld[4]; \
red_acc[5][tid]=dufld[5]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
simdsync(); \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
dufld[0]=red_acc[0][tid]; \
dufld[1]=red_acc[1][tid]; \
dufld[2]=red_acc[2][tid]; \
dufld[3]=red_acc[3][tid]; \
dufld[4]=red_acc[4][tid]; \
dufld[5]=red_acc[5][tid]; \
} \
if (offset==0 && ii<inum) { \
numtyp4 t; \
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
(numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3]; \
t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] + \
(numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1]; \
tep[i]=t; \
}
#else
#define local_allocate_store_ufld()
#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset, \
i, tep) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
ufld[0] += shfl_down(ufld[0], s, t_per_atom); \
ufld[1] += shfl_down(ufld[1], s, t_per_atom); \
ufld[2] += shfl_down(ufld[2], s, t_per_atom); \
dufld[0] += shfl_down(dufld[0], s, t_per_atom); \
dufld[1] += shfl_down(dufld[1], s, t_per_atom); \
dufld[2] += shfl_down(dufld[2], s, t_per_atom); \
dufld[3] += shfl_down(dufld[3], s, t_per_atom); \
dufld[4] += shfl_down(dufld[4], s, t_per_atom); \
dufld[5] += shfl_down(dufld[5], s, t_per_atom); \
} \
} \
if (offset==0 && ii<inum) { \
numtyp4 t; \
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
(numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3]; \
t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] + \
(numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1]; \
tep[i]=t; \
}
#endif
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
const __global numtyp *restrict extra,
const __global numtyp4 *restrict damping,
const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global numtyp4 *restrict tep,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch, const int t_per_atom,
const numtyp aewald, const numtyp felec,
const numtyp off2, const numtyp polar_dscale,
const numtyp polar_uscale)
{
int tid, ii, offset, i;
atom_info(t_per_atom,ii,tid,offset);
int n_stride;
local_allocate_store_ufld();
local_allocate_store_charge();
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int l=0; l<6; l++) virial[l]=(acctyp)0;
}
acctyp ufld[3];
ufld[0] = (acctyp)0; ufld[1]=(acctyp)0; ufld[2]=(acctyp)0;
acctyp dufld[6];
for (int l=0; l<6; l++) dufld[l]=(acctyp)0;
numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
numtyp4* polar1 = (numtyp4*)(&extra[0]);
numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);
//numtyp4 xi__;
if (ii<inum) {
int k,m,itype,igroup;
numtyp bfac;
numtyp psc3,psc5,psc7;
numtyp dsc3,dsc5,dsc7;
numtyp usc3,usc5;
numtyp psr3,psr5,psr7;
numtyp dsr3,dsr5,dsr7;
numtyp usr5;
numtyp term1,term2,term3;
numtyp term4,term5;
numtyp term6,term7;
numtyp rc3[3],rc5[3],rc7[3];
numtyp prc3[3],prc5[3],prc7[3];
numtyp drc3[3],drc5[3],drc7[3];
numtyp urc3[3],urc5[3];
numtyp bn[5];
numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
int numj, nbor, nbor_end;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//numtyp qtmp; fetch(qtmp,i,q_tex);
//int itype=ix.w;
ci = polar1[i].x; // rpole[i][0];
dix = polar1[i].y; // rpole[i][1];
diy = polar1[i].z; // rpole[i][2];
diz = polar1[i].w; // rpole[i][3];
qixx = polar2[i].x; // rpole[i][4];
qixy = polar2[i].y; // rpole[i][5];
qixz = polar2[i].z; // rpole[i][6];
qiyy = polar2[i].w; // rpole[i][8];
qiyz = polar3[i].x; // rpole[i][9];
qizz = polar3[i].y; // rpole[i][12];
itype = polar3[i].z; // amtype[i];
igroup = polar3[i].w; // amgroup[i];
uix = polar4[i].x; // uind[i][0];
uiy = polar4[i].y; // uind[i][1];
uiz = polar4[i].z; // uind[i][2];
uixp = polar5[i].x; // uinp[i][0];
uiyp = polar5[i].y; // uinp[i][1];
uizp = polar5[i].z; // uinp[i][2];
// debug:
// xi__ = ix; xi__.w = itype;
numtyp pdi = damping[itype].x;
numtyp pti = damping[itype].y;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int jextra=dev_packed[nbor];
int j = jextra & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int jtype=jx.w;
// Compute r12
numtyp xr = jx.x - ix.x;
numtyp yr = jx.y - ix.y;
numtyp zr = jx.z - ix.z;
numtyp r2 = xr*xr + yr*yr + zr*zr;
if (r2>off2) continue;
numtyp r = ucl_sqrt(r2);
numtyp ck = polar1[j].x; // rpole[j][0];
numtyp dkx = polar1[j].y; // rpole[j][1];
numtyp dky = polar1[j].z; // rpole[j][2];
numtyp dkz = polar1[j].w; // rpole[j][3];
numtyp qkxx = polar2[j].x; // rpole[j][4];
numtyp qkxy = polar2[j].y; // rpole[j][5];
numtyp qkxz = polar2[j].z; // rpole[j][6];
numtyp qkyy = polar2[j].w; // rpole[j][8];
numtyp qkyz = polar3[j].x; // rpole[j][9];
numtyp qkzz = polar3[j].y; // rpole[j][12];
int jtype = polar3[j].z; // amtype[j];
int jgroup = polar3[j].w; // amgroup[j];
numtyp ukx = polar4[j].x; // uind[j][0];
numtyp uky = polar4[j].y; // uind[j][1];
numtyp ukz = polar4[j].z; // uind[j][2];
numtyp ukxp = polar5[j].x; // uinp[j][0];
numtyp ukyp = polar5[j].y; // uinp[j][1];
numtyp ukzp = polar5[j].z; // uinp[j][2];
numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
if (igroup == jgroup) {
factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
factor_dscale = polar_dscale;
factor_uscale = polar_uscale;
} else {
factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
factor_dscale = factor_uscale = (numtyp)1.0;
}
// intermediates involving moments and separation distance
numtyp dir = dix*xr + diy*yr + diz*zr;
numtyp qix = qixx*xr + qixy*yr + qixz*zr;
numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
numtyp qir = qix*xr + qiy*yr + qiz*zr;
numtyp dkr = dkx*xr + dky*yr + dkz*zr;
numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
numtyp qkr = qkx*xr + qky*yr + qkz*zr;
numtyp uir = uix*xr + uiy*yr + uiz*zr;
numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
numtyp ukr = ukx*xr + uky*yr + ukz*zr;
numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr;
// get reciprocal distance terms for this interaction
numtyp rinv = ucl_recip(r);
numtyp r2inv = rinv*rinv;
numtyp rr1 = felec * rinv;
numtyp rr3 = rr1 * r2inv;
numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
// calculate the real space Ewald error function terms
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
//bn[0] = erfc(ralpha) / r;
bn[0] = _erfc * rinv;
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
numtyp alsq2n = (numtyp)0.0;
if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
for (m = 1; m <= 4; m++) {
bfac = (numtyp) (m+m-1);
alsq2n = alsq2 * alsq2n;
bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
}
for (m = 0; m < 5; m++) bn[m] *= felec;
// apply Thole polarization damping to scale factors
numtyp sc3 = (numtyp)1.0;
numtyp sc5 = (numtyp)1.0;
numtyp sc7 = (numtyp)1.0;
for (k = 0; k < 3; k++) {
rc3[k] = (numtyp)0.0;
rc5[k] = (numtyp)0.0;
rc7[k] = (numtyp)0.0;
}
// apply Thole polarization damping to scale factors
numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
if (damp != (numtyp)0.0) {
numtyp pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
if (damp < (numtyp)50.0) {
numtyp expdamp = ucl_exp(-damp);
sc3 = (numtyp)1.0 - expdamp;
sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp;
sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp;
numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv;
numtyp temp5 = damp;
numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp;
rc3[0] = xr * temp3;
rc3[1] = yr * temp3;
rc3[2] = zr * temp3;
rc5[0] = rc3[0] * temp5;
rc5[1] = rc3[1] * temp5;
rc5[2] = rc3[2] * temp5;
rc7[0] = rc5[0] * temp7;
rc7[1] = rc5[1] * temp7;
rc7[2] = rc5[2] * temp7;
}
psc3 = (numtyp)1.0 - sc3*factor_pscale;
psc5 = (numtyp)1.0 - sc5*factor_pscale;
psc7 = (numtyp)1.0 - sc7*factor_pscale;
dsc3 = (numtyp)1.0 - sc3*factor_dscale;
dsc5 = (numtyp)1.0 - sc5*factor_dscale;
dsc7 = (numtyp)1.0 - sc7*factor_dscale;
usc3 = (numtyp)1.0 - sc3*factor_uscale;
usc5 = (numtyp)1.0 - sc5*factor_uscale;
psr3 = bn[1] - psc3*rr3;
psr5 = bn[2] - psc5*rr5;
psr7 = bn[3] - psc7*rr7;
dsr3 = bn[1] - dsc3*rr3;
dsr5 = bn[2] - dsc5*rr5;
dsr7 = bn[3] - dsc7*rr7;
usr5 = bn[2] - usc5*rr5;
for (k = 0; k < 3; k++) {
prc3[k] = rc3[k] * factor_pscale;
prc5[k] = rc5[k] * factor_pscale;
prc7[k] = rc7[k] * factor_pscale;
drc3[k] = rc3[k] * factor_dscale;
drc5[k] = rc5[k] * factor_dscale;
drc7[k] = rc7[k] * factor_dscale;
urc3[k] = rc3[k] * factor_uscale;
urc5[k] = rc5[k] * factor_uscale;
}
} else { // damp == 0: ???
}
// get the induced dipole field used for dipole torques
numtyp tix3 = psr3*ukx + dsr3*ukxp;
numtyp tiy3 = psr3*uky + dsr3*ukyp;
numtyp tiz3 = psr3*ukz + dsr3*ukzp;
numtyp tuir = -psr5*ukr - dsr5*ukrp;
ufld[0] += tix3 + xr*tuir;
ufld[1] += tiy3 + yr*tuir;
ufld[2] += tiz3 + zr*tuir;
// get induced dipole field gradient used for quadrupole torques
numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp);
numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp);
numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp);
tuir = -psr7*ukr - dsr7*ukrp;
dufld[0] += xr*tix5 + xr*xr*tuir;
dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir;
dufld[2] += yr*tiy5 + yr*yr*tuir;
dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir;
dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir;
dufld[5] += zr*tiz5 + zr*zr*tuir;
// get the dEd/dR terms used for direct polarization force
term1 = bn[2] - dsc3*rr5;
term2 = bn[3] - dsc5*rr7;
term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0];
term4 = rr3*drc3[0] - term1*xr - dsr5*xr;
term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0];
term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0];
term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr;
numtyp tixx = ci*term3 + dix*term4 + dir*term5 +
(numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6;
numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 +
(numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1];
term4 = rr3*drc3[1] - term1*yr - dsr5*yr;
term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1];
term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1];
term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr;
numtyp tiyy = ci*term3 + diy*term4 + dir*term5 +
(numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 +
(numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;
term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2];
term4 = rr3*drc3[2] - term1*zr - dsr5*zr;
term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2];
term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2];
term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr;
numtyp tizz = ci*term3 + diz*term4 + dir*term5 +
(numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 +
(numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;
term3 = term1*xr*yr - rr3*yr*drc3[0];
term4 = rr3*drc3[0] - term1*xr;
term5 = term2*xr*yr - rr5*yr*drc5[0];
term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0];
term7 = rr5*drc5[0] - term2*xr;
numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 +
(numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 +
(numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6;
term3 = term1*xr*zr - rr3*zr*drc3[0];
term5 = term2*xr*zr - rr5*zr*drc5[0];
term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0];
numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 +
(numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 +
(numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;
term3 = term1*yr*zr - rr3*zr*drc3[1];
term4 = rr3*drc3[1] - term1*yr;
term5 = term2*yr*zr - rr5*zr*drc5[1];
term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1];
term7 = rr5*drc5[1] - term2*yr;
numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 +
(numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 +
(numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;
numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp;
numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp;
numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp;
numtyp frcx = depx;
numtyp frcy = depy;
numtyp frcz = depz;
// get the dEp/dR terms used for direct polarization force
// tixx and tkxx
term1 = bn[2] - psc3*rr5;
term2 = bn[3] - psc5*rr7;
term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0];
term4 = rr3*prc3[0] - term1*xr - psr5*xr;
term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0];
term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0];
term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr;
tixx = ci*term3 + dix*term4 + dir*term5 +
(numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
tkxx = ck*term3 - dkx*term4 - dkr*term5 +
(numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
// tiyy and tkyy
term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1];
term4 = rr3*prc3[1] - term1*yr - psr5*yr;
term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1];
term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1];
term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr;
tiyy = ci*term3 + diy*term4 + dir*term5 +
(numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
tkyy = ck*term3 - dky*term4 - dkr*term5 +
(numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;
// tizz and tkzz
term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2];
term4 = rr3*prc3[2] - term1*zr - psr5*zr;
term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2];
term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2];
term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr;
tizz = ci*term3 + diz*term4 + dir*term5 +
(numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
tkzz = ck*term3 - dkz*term4 - dkr*term5 +
(numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;
// tixy and tkxy
term3 = term1*xr*yr - rr3*yr*prc3[0];
term4 = rr3*prc3[0] - term1*xr;
term5 = term2*xr*yr - rr5*yr*prc5[0];
term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0];
term7 = rr5*prc5[0] - term2*xr;
tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 +
(numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 +
(numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6;
// tixz and tkxz
term3 = term1*xr*zr - rr3*zr*prc3[0];
term5 = term2*xr*zr - rr5*zr*prc5[0];
term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0];
tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 +
(numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 +
(numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;
// tiyz and tkyz
term3 = term1*yr*zr - rr3*zr*prc3[1];
term4 = rr3*prc3[1] - term1*yr;
term5 = term2*yr*zr - rr5*zr*prc5[1];
term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1];
term7 = rr5*prc5[1] - term2*yr;
tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 +
(numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 +
(numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;
depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz;
depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz;
depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz;
frcx = frcx + depx;
frcy = frcy + depy;
frcz = frcz + depz;
// get the dtau/dr terms used for mutual polarization force
// poltyp == MUTUAL && amoeba
term1 = bn[2] - usc3*rr5;
term2 = bn[3] - usc5*rr7;
term3 = usr5 + term1;
term4 = rr3 * factor_uscale;
term5 = -xr*term3 + rc3[0]*term4;
term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0];
tixx = uix*term5 + uir*term6;
tkxx = ukx*term5 + ukr*term6;
term5 = -yr*term3 + rc3[1]*term4;
term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1];
tiyy = uiy*term5 + uir*term6;
tkyy = uky*term5 + ukr*term6;
term5 = -zr*term3 + rc3[2]*term4;
term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2];
tizz = uiz*term5 + uir*term6;
tkzz = ukz*term5 + ukr*term6;
term4 = -usr5 * yr;
term5 = -xr*term1 + rr3*urc3[0];
term6 = xr*yr*term2 - rr5*yr*urc5[0];
tixy = uix*term4 + uiy*term5 + uir*term6;
tkxy = ukx*term4 + uky*term5 + ukr*term6;
term4 = -usr5 * zr;
term6 = xr*zr*term2 - rr5*zr*urc5[0];
tixz = uix*term4 + uiz*term5 + uir*term6;
tkxz = ukx*term4 + ukz*term5 + ukr*term6;
term5 = -yr*term1 + rr3*urc3[1];
term6 = yr*zr*term2 - rr5*zr*urc5[1];
tiyz = uiy*term4 + uiz*term5 + uir*term6;
tkyz = uky*term4 + ukz*term5 + ukr*term6;
depx = tixx*ukxp + tixy*ukyp + tixz*ukzp
+ tkxx*uixp + tkxy*uiyp + tkxz*uizp;
depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp
+ tkxy*uixp + tkyy*uiyp + tkyz*uizp;
depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp
+ tkxz*uixp + tkyz*uiyp + tkzz*uizp;
frcx = frcx + depx;
frcy = frcy + depy;
frcz = frcz + depz;
f.x -= frcx;
f.y -= frcy;
f.z -= frcz;
if (EVFLAG && vflag) {
numtyp vxx = xr * frcx;
numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy);
numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz);
numtyp vyy = yr * frcy;
numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz);
numtyp vzz = zr * frcz;
virial[0] += vxx;
virial[1] += vyy;
virial[2] += vzz;
virial[3] += vxy;
virial[4] += vxz;
virial[5] += vyz;
}
} // nbor
} // ii<inum
// accumulate ufld and dufld to compute tep
store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep);
// accumate force, energy and virial
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
offset,eflag,vflag,ans,engv);
}
/* ----------------------------------------------------------------------
scan standard neighbor list and make it compatible with 1-5 neighbors
if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
else scan special15 to see if a 1-5 neighbor and adjust offset to SBBITS15
else do nothing to IJ entry
------------------------------------------------------------------------- */
__kernel void k_special15(__global int * dev_nbor,
const __global int * dev_packed,
const __global tagint *restrict tag,
const __global int *restrict nspecial15,
const __global tagint *restrict special15,
const int inum, const int nall, const int nbor_pitch,
const int t_per_atom) {
int tid, ii, offset, n_stride, i;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int numj, nbor, nbor_end;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
int n15 = nspecial15[ii];
for ( ; nbor<nbor_end; nbor+=n_stride) {
int sj=dev_packed[nbor];
int which = sj >> SBBITS & 3;
int j = sj & NEIGHMASK;
tagint jtag = tag[j];
if (!which) {
int offset=ii;
for (int k=0; k<n15; k++) {
if (special15[offset] == jtag) {
which = 4;
break;
}
offset += nall;
}
}
if (which) dev_nbor[nbor] = j ^ (which << SBBITS15);
} // for nbor
} // if ii
}

87
lib/gpu/lal_amoeba.h Normal file
View File

@ -0,0 +1,87 @@
/***************************************************************************
amoeba.h
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the amoeba pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_AMOEBA_H
#define LAL_AMOEBA_H
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Amoeba : public BaseAmoeba<numtyp, acctyp> {
public:
Amoeba();
~Amoeba();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, const int max_amtype, const double *host_pdamp,
const double *host_thole, const double *host_special_polar_wscale,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15, const double cell_size,
const double gpu_split, FILE *_screen,
const double aewald, const double felec,
const double off2, const double polar_dscale,
const double polar_uscale);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// pdamp = damping.x; thole = damping.y
UCL_D_Vec<numtyp4> damping;
/// Special polar values [0-4]:
/// sp_polar.x = special_polar_wscale
/// sp_polar.y special_polar_pscale,
/// sp_polar.z = special_polar_piscale
UCL_D_Vec<numtyp4> sp_polar;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _aewald, _felec, _off2, _polar_dscale, _polar_uscale;
numtyp _qqrd2e;
private:
bool _allocated;
int loop(const int eflag, const int vflag);
};
}
#endif

142
lib/gpu/lal_amoeba_ext.cpp Normal file
View File

@ -0,0 +1,142 @@
/***************************************************************************
amoeba_ext.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Functions for LAMMPS access to amoeba acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include <iostream>
#include <cassert>
#include <cmath>
#include "lal_amoeba.h"
using namespace std;
using namespace LAMMPS_AL;
static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int amoeba_gpu_init(const int ntypes, const int max_amtype,
const double *host_pdamp, const double *host_thole,
const double *host_special_polar_wscale,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, int &gpu_mode, FILE *screen,
const double aewald, const double felec,
const double off2, const double polar_dscale,
const double polar_uscale, int& tep_size) {
AMOEBAMF.clear();
gpu_mode=AMOEBAMF.device->gpu_mode();
double gpu_split=AMOEBAMF.device->particle_split();
int first_gpu=AMOEBAMF.device->first_device();
int last_gpu=AMOEBAMF.device->last_device();
int world_me=AMOEBAMF.device->world_me();
int gpu_rank=AMOEBAMF.device->gpu_rank();
int procs_per_gpu=AMOEBAMF.device->procs_per_gpu();
tep_size=sizeof(PRECISION);
AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu);
bool message=false;
if (AMOEBAMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole,
host_special_polar_wscale, host_special_polar_piscale,
host_special_polar_pscale, nlocal, nall, max_nbors,
maxspecial, maxspecial15, cell_size, gpu_split, screen,
aewald, felec, off2, polar_dscale, polar_uscale);
AMOEBAMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole,
host_special_polar_wscale, host_special_polar_piscale,
host_special_polar_pscale, nlocal, nall, max_nbors,
maxspecial, maxspecial15, cell_size, gpu_split, screen,
aewald, felec, off2, polar_dscale, polar_uscale);
AMOEBAMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
AMOEBAMF.estimate_gpu_overhead();
return init_ok;
}
void amoeba_gpu_clear() {
AMOEBAMF.clear();
}
int** amoeba_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind, double **host_uinp,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int *nspecial15, tagint** special15,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd, void **tep_ptr) {
return AMOEBAMF.compute(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
sublo, subhi, tag, nspecial, special, nspecial15, special15,
eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd, tep_ptr);
}
void amoeba_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind, double **host_uinp,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd, void **tep_ptr) {
AMOEBAMF.compute(ago,inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
ilist, numj, firstneigh, eflag, vflag, eatom, vatom,
host_start, cpu_time, success, host_q, nlocal, boxlo, prd, tep_ptr);
}
double amoeba_gpu_bytes() {
return AMOEBAMF.host_memory_usage();
}

516
lib/gpu/lal_base_amoeba.cpp Normal file
View File

@ -0,0 +1,516 @@
/***************************************************************************
base_amoeba.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
pair_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
}
template <class numtyp, class acctyp>
BaseAmoebaT::~BaseAmoeba() {
delete ans;
delete nbor;
k_polar.clear();
k_special15.clear();
if (pair_program) delete pair_program;
}
template <class numtyp, class acctyp>
int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const int maxspecial15,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
bool charge = true;
bool rot = false;
bool vel = false;
_extra_fields = 24; // round up to accomodate quadruples of numtyp values
// rpole 13; uind 3; uinp 3; amtype, amgroup
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
_maxspecial=maxspecial;
_maxspecial15=maxspecial15;
// allocate per-atom array tep
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
return success;
}
template <class numtyp, class acctyp>
void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseAmoebaT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
_tep.clear();
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return nullptr;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
// add one-five neighbors
if (_maxspecial15>0) {
UCL_H_Vec<int> view_nspecial15;
UCL_H_Vec<tagint> view_special15;
view_nspecial15.view(nspecial15,nall,*ucl_device);
view_special15.view(special15[0],nall*_maxspecial15,*ucl_device);
ucl_copy(dev_nspecial15,view_nspecial15,nall,false);
ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false);
nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall);
add_onefive_neighbors();
}
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
int *ilist, int *numj, int **firstneigh,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd, void **tep_ptr) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
// ------------------- Resize _tep array ------------------------
if (nall>_max_tep_size) {
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
_tep.resize(_max_tep_size*4);
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
}
*tep_ptr=_tep.host.begin();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
// packing host arrays into host_extra
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_q_data();
atom->add_extra_data();
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd, void **tep_ptr) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
// ------------------- Resize _tep array ------------------------
if (nall>_max_tep_size) {
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
_tep.resize(_max_tep_size*4);
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
}
*tep_ptr=_tep.host.begin();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return nullptr;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, nspecial15, special15,
success);
if (!success)
return nullptr;
atom->cast_q_data(host_q);
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_q_data();
atom->add_extra_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
// copy tep from device to host
_tep.update_host(_max_tep_size*4,false);
/*
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
for (int i = 0; i < 10; i++) {
numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
}
*/
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseAmoebaT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseAmoeba<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
double** uind, double** uinp) {
int _nall=atom->nall();
numtyp *pextra=reinterpret_cast<numtyp*>(&(atom->extra[0]));
int n = 0;
int nstride = 4;
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx] = rpole[i][0];
pextra[idx+1] = rpole[i][1];
pextra[idx+2] = rpole[i][2];
pextra[idx+3] = rpole[i][3];
}
n += nstride*_nall;
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx] = rpole[i][4];
pextra[idx+1] = rpole[i][5];
pextra[idx+2] = rpole[i][6];
pextra[idx+3] = rpole[i][8];
}
n += nstride*_nall;
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx] = rpole[i][9];
pextra[idx+1] = rpole[i][12];
pextra[idx+2] = (numtyp)amtype[i];
pextra[idx+3] = (numtyp)amgroup[i];
}
n += nstride*_nall;
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx] = uind[i][0];
pextra[idx+1] = uind[i][1];
pextra[idx+2] = uind[i][2];
}
n += nstride*_nall;
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx] = uinp[i][0];
pextra[idx+1] = uinp[i][1];
pextra[idx+2] = uinp[i][2];
}
}
template <class numtyp, class acctyp>
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
return;
if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev);
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_polar.set_function(*pair_program,kname);
k_special15.set_function(*pair_program,"k_special15");
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
_compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.has_subgroup_support()) {
size_t mx_subgroup_sz = k_polar.max_subgroup_size(_block_size);
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
}
template <class numtyp, class acctyp>
int BaseAmoebaT::add_onefive_neighbors() {
// Compute the block size and grid size to keep all cores busy
const int BX=block_size();
int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/
(BX/_threads_per_atom)));
int _nall=atom->nall();
int ainum=ans->inum();
int nbor_pitch=nbor->nbor_pitch();
k_special15.set_size(GX,BX);
k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(),
&atom->dev_tag, &dev_nspecial15, &dev_special15,
&ainum, &_nall, &nbor_pitch,
&_threads_per_atom);
return GX;
}
template class BaseAmoeba<PRECISION,ACC_PRECISION>;
}

225
lib/gpu/lal_base_amoeba.h Normal file
View File

@ -0,0 +1,225 @@
/***************************************************************************
base_amoeba.h
-------------------
Trung Dac Nguyen (Northwestern)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_BASE_AMOEBA_H
#define LAL_BASE_AMOEBA_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseAmoeba {
public:
BaseAmoeba();
virtual ~BaseAmoeba();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(const int add_kernels=0);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int *nspecial15, tagint **special15,
bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double **host_uind,
double **host_uinp, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge,
const int nlocal, double *boxlo, double *prd, void **tep_ptr);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double **host_uind,
double **host_uinp, double *sublo, double *subhi,
tagint *tag, int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd, void **tep_ptr);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
Balance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> *atom;
UCL_Vector<numtyp,numtyp> polar1, polar2, polar3, polar4, polar5;
/// cast host arrays into a single array for atom->extra
void cast_extra_data(int* amtype, int* amgroup, double** rpole,
double** uind, double** uinp);
/// Per-atom arrays
UCL_Vector<numtyp,numtyp> _tep;
int _max_tep_size;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
Neighbor *nbor;
/// Device storage for 1-5 special neighbor counts
UCL_D_Vec<int> dev_nspecial15;
/// Device storage for special neighbors
UCL_D_Vec<tagint> dev_special15, dev_special15_t;
int add_onefive_neighbors();
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_polar,k_special15;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {}
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
int _extra_fields;
double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual int loop(const int eflag, const int vflag) = 0;
};
}
#endif

View File

@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
bool charge = false;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
bool charge = true;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
bool charge = true;
bool rot = true;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
bool charge = false;
bool rot = false;
bool vel = true;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel);
if (success!=0)
return success;

View File

@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
else
_threads_per_atom=device->threads_per_three();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
bool charge = false;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -579,6 +579,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_nbor.stop();
if (_time_device)
time_nbor.add_to_total();
// on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial)
// on the device, transpose the matrix (1-d array) for coalesced reads
// dev_special[i][j] = the special i neighbor of atom j
time_transpose.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
@ -682,6 +687,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
if (_cutoff < _cell_size) vadjust*=1.46;
mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
if (mn<33) mn+=3;
resize_max_neighbors<numtyp,acctyp>(mn,success);
set_nbor_block_size(mn/2);
if (!success)
@ -834,6 +840,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_nbor.stop();
}
void Neighbor::transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
const int columns_in, const int rows_in)
{
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(columns_in)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(rows_in)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&out, &in, &columns_in, &rows_in);
}
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
(double **x, const int inum, const int host_inum, const int nall,
Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,

View File

@ -260,6 +260,10 @@ class Neighbor {
return o.str();
}
/// Helper function
void transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
const int columns_in, const int rows_in);
private:
NeighborShared *_shared;
UCL_Device *dev;

View File

@ -44,6 +44,19 @@ _texture_2d( pos_tex,int4);
#define LAL_USE_OLD_NEIGHBOR
#endif
/*
compute the id of the cell where the atoms belong to
x: atom coordinates
cell_id: cell ids
particle_id:
boxlo[0-2]: the lower left corner of the local box
ncell[xyz]: the number of cells in xyz dims
i_cell_size is the inverse cell size
inum = the number of the local atoms that are ported to the device
nall = the number of the local+ghost atoms that are ported to the device
cells_in_cutoff = the number of cells that are within the cutoff
*/
__kernel void calc_cell_id(const numtyp4 *restrict x_,
unsigned *restrict cell_id,
int *restrict particle_id,
@ -86,6 +99,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
}
}
// compute the number of atoms in each cell
__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
int *restrict cell_counts,
int nall, int ncell) {

View File

@ -357,7 +357,7 @@ class PairAmoeba : public Pair {
void polar();
void polar_energy();
void polar_real();
virtual void polar_real();
void polar_kspace();
void damppole(double, int, double, double, double *, double *, double *);

View File

@ -41,6 +41,8 @@ action fix_npt_gpu.cpp
action fix_nve_asphere_gpu.h fix_nve_asphere.h
action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp
action gpu_extra.h
action pair_amoeba_gpu.cpp pair_amoeba.cpp
action pair_amoeba_gpu.h pair_amoeba.h
action pair_beck_gpu.cpp pair_beck.cpp
action pair_beck_gpu.h pair_beck.h
action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp

299
src/GPU/pair_amoeba_gpu.cpp Normal file
View File

@ -0,0 +1,299 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Trung Nguyen (Northwestern)
------------------------------------------------------------------------- */
#include "pair_amoeba_gpu.h"
#include "atom.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
#include "force.h"
#include "gpu_extra.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor.h"
#include "suffix.h"
#include <cmath>
using namespace LAMMPS_NS;
// External functions from cuda library for atom decomposition
int amoeba_gpu_init(const int ntypes, const int max_amtype,
const double *host_pdamp, const double *host_thole,
const double *host_special_polar_wscale,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, int &gpu_mode, FILE *screen,
const double aewald, const double felec,
const double off2, const double polar_dscale,
const double polar_uscale, int& tep_size);
void amoeba_gpu_clear();
int ** amoeba_gpu_compute_n(const int ago, const int inum, const int nall,
double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind, double **host_uinp,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int* nspecial15, tagint** special15,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo, double *prd,
void **tep_ptr);
void amoeba_gpu_compute(const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind, double **host_uinp,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd, void **tep_ptr);
double amoeba_gpu_bytes();
enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG};
/* ---------------------------------------------------------------------- */
PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
{
respa_enable = 0;
reinitflag = 0;
cpu_time = 0.0;
suffix_flag |= Suffix::GPU;
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairAmoebaGPU::~PairAmoebaGPU()
{
amoeba_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairAmoebaGPU::polar_real()
{
int eflag=1, vflag=1;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
int *ilist, *numneigh, **firstneigh;
if (gpu_mode != GPU_FORCE) {
double sublo[3],subhi[3];
if (domain->triclinic == 0) {
sublo[0] = domain->sublo[0];
sublo[1] = domain->sublo[1];
sublo[2] = domain->sublo[2];
subhi[0] = domain->subhi[0];
subhi[1] = domain->subhi[1];
subhi[2] = domain->subhi[2];
} else {
domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
}
inum = atom->nlocal;
firstneigh = amoeba_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
atom->type, amtype, amgroup,
rpole, uind, uinp, sublo, subhi,
atom->tag, atom->nspecial, atom->special,
atom->nspecial15, atom->special15,
eflag, vflag, eflag_atom, vflag_atom,
host_start, &ilist, &numneigh, cpu_time,
success, atom->q, domain->boxlo,
domain->prd, &tep_pinned);
} else {
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
amoeba_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
amtype, amgroup, rpole, uind, uinp,
ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success, atom->q,
atom->nlocal, domain->boxlo, domain->prd, &tep_pinned);
}
if (!success)
error->one(FLERR,"Insufficient memory on accelerator");
// reference to the tep array from GPU lib
if (tep_single) {
float *tep_ptr = (float *)tep_pinned;
compute_force_from_tep<float>(tep_ptr);
} else {
double *tep_ptr = (double *)tep_pinned;
compute_force_from_tep<double>(tep_ptr);
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
template <class numtyp>
void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr)
{
int i,ix,iy,iz;
double ci,dix,diy,diz;
double qixx,qixy,qixz;
double qiyy,qiyz,qizz;
double xix,yix,zix;
double xiy,yiy,ziy;
double xiz,yiz,ziz;
double vxx,vyy,vzz;
double vxy,vxz,vyz;
double fix[3],fiy[3],fiz[3],tep[4];
double** x = atom->x;
int nlocal = atom->nlocal;
for (i = 0; i < nlocal; i++) {
dix = rpole[i][1];
diy = rpole[i][2];
diz = rpole[i][3];
qixx = rpole[i][4];
qixy = rpole[i][5];
qixz = rpole[i][6];
qiyy = rpole[i][8];
qiyz = rpole[i][9];
qizz = rpole[i][12];
tep[0] = tep_ptr[4*i];
tep[1] = tep_ptr[4*i+1];
tep[2] = tep_ptr[4*i+2];
torque2force(i,tep,fix,fiy,fiz,fpolar);
iz = zaxis2local[i];
ix = xaxis2local[i];
iy = yaxis2local[i];
xiz = x[iz][0] - x[i][0];
yiz = x[iz][1] - x[i][1];
ziz = x[iz][2] - x[i][2];
xix = x[ix][0] - x[i][0];
yix = x[ix][1] - x[i][1];
zix = x[ix][2] - x[i][2];
xiy = x[iy][0] - x[i][0];
yiy = x[iy][1] - x[i][1];
ziy = x[iy][2] - x[i][2];
vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
virpolar[0] += vxx;
virpolar[1] += vyy;
virpolar[2] += vzz;
virpolar[3] += vxy;
virpolar[4] += vxz;
virpolar[5] += vyz;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairAmoebaGPU::init_style()
{
PairAmoeba::init_style();
if (gpu_mode == GPU_FORCE) {
if (comm->me == 0)
error->warning(FLERR,"Pair style amoeba/gpu does not support neigh no "
"for now, automatically switching to neigh yes");
gpu_mode = GPU_NEIGH;
}
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
// select the cutoff (off2) for neighbor list builds (the polar term for now)
if (use_ewald) choose(POLAR_LONG);
else choose(POLAR);
double cell_size = sqrt(off2) + neighbor->skin;
int maxspecial=0;
int maxspecial15=0;
if (atom->molecular != Atom::ATOMIC) {
maxspecial=atom->maxspecial;
maxspecial15=atom->maxspecial15;
}
int tep_size;
int mnf = 5e-2 * neighbor->oneatom;
// set the energy unit conversion factor for polar real-space calculation
double felec = 0.5 * electric / am_dielectric;
int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole,
special_polar_wscale, special_polar_piscale,
special_polar_pscale, atom->nlocal,
atom->nlocal+atom->nghost, mnf, maxspecial,
maxspecial15, cell_size, gpu_mode, screen,
aewald, felec, off2, polar_dscale, polar_uscale,
tep_size);
GPU_EXTRA::check_flag(success,error,world);
if (tep_size == sizeof(double))
tep_single = false;
else
tep_single = true;
}
/* ---------------------------------------------------------------------- */
double PairAmoebaGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + amoeba_gpu_bytes();
}

63
src/GPU/pair_amoeba_gpu.h Normal file
View File

@ -0,0 +1,63 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
// clang-format off
PairStyle(amoeba/gpu,PairAmoebaGPU);
// clang-format on
#else
#ifndef LMP_PAIR_AMOEBA_GPU_H
#define LMP_PAIR_AMOEBA_GPU_H
#include "pair_amoeba.h"
namespace LAMMPS_NS {
class PairAmoebaGPU : public PairAmoeba {
public:
PairAmoebaGPU(LAMMPS *lmp);
~PairAmoebaGPU();
void init_style();
double memory_usage();
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
virtual void polar_real();
private:
int gpu_mode;
double cpu_time;
void *tep_pinned;
bool tep_single;
template<class numtyp>
void compute_force_from_tep(const numtyp*);
};
} // namespace LAMMPS_NS
#endif
#endif
/* ERROR/WARNING messages:
E: Insufficient memory on accelerator
There is insufficient memory on one of the devices specified for the gpu
package
E: Pair style amoeba/gpu requires atom attribute q
The atom style defined does not have this attribute.
*/