Added work on amoeba/gpu, some minor changes to PairAmoeba to allow function overriding in PairAmoebaGPU, added the package AMOEBA to cmake/CMakeLists.txt
This commit is contained in:
@ -140,6 +140,7 @@ option(CMAKE_VERBOSE_MAKEFILE "Generate verbose Makefiles" OFF)
|
||||
|
||||
set(STANDARD_PACKAGES
|
||||
ADIOS
|
||||
AMOEBA
|
||||
ASPHERE
|
||||
ATC
|
||||
AWPMD
|
||||
@ -308,6 +309,7 @@ endif()
|
||||
pkg_depends(ML-IAP ML-SNAP)
|
||||
pkg_depends(MPIIO MPI)
|
||||
pkg_depends(ATC MANYBODY)
|
||||
pkg_depends(AMOEBA KSPACE)
|
||||
pkg_depends(LATBOLTZ MPI)
|
||||
pkg_depends(PHONON KSPACE)
|
||||
pkg_depends(SCAFACOS MPI)
|
||||
|
||||
155
lib/gpu/lal_amoeba.cpp
Normal file
155
lib/gpu/lal_amoeba.cpp
Normal file
@ -0,0 +1,155 @@
|
||||
/***************************************************************************
|
||||
amoeba.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the amoeba pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "amoeba_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *amoeba=0;
|
||||
#else
|
||||
#include "amoeba_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_amoeba.h"
|
||||
#include <cassert>
|
||||
namespace LAMMPS_AL {
|
||||
#define AmoebaT Amoeba<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
AmoebaT::Amoeba() : BaseAmoeba<numtyp,acctyp>(),
|
||||
_allocated(false) {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
AmoebaT::~Amoeba() {
|
||||
clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp,
|
||||
const double *host_thole, const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15,
|
||||
const double cell_size, const double gpu_split, FILE *_screen,
|
||||
const double aewald, const double felec,
|
||||
const double off2, const double polar_dscale,
|
||||
const double polar_uscale) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||
cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
int lj_types=ntypes;
|
||||
shared_types=false;
|
||||
int max_shared_types=this->device->max_shared_types();
|
||||
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||
lj_types=max_shared_types;
|
||||
shared_types=true;
|
||||
}
|
||||
_lj_types=lj_types;
|
||||
|
||||
// Allocate a host write buffer for data initialization
|
||||
|
||||
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
for (int i = 0; i < max_amtype; i++) {
|
||||
host_write[i].x = host_pdamp[i];
|
||||
host_write[i].y = host_thole[i];
|
||||
host_write[i].z = (numtyp)0;
|
||||
host_write[i].w = (numtyp)0;
|
||||
}
|
||||
|
||||
damping.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(damping,host_write,false);
|
||||
|
||||
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<5; i++) {
|
||||
dview[i].x=host_special_polar_wscale[i];
|
||||
dview[i].y=host_special_polar_piscale[i];
|
||||
dview[i].z=host_special_polar_pscale[i];
|
||||
dview[i].w=(numtyp)0;
|
||||
}
|
||||
ucl_copy(sp_polar,dview,5,false);
|
||||
|
||||
_aewald = aewald;
|
||||
_felec = felec;
|
||||
_off2 = off2;
|
||||
_polar_dscale = polar_dscale;
|
||||
_polar_uscale = polar_uscale;
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=damping.row_bytes()
|
||||
+ sp_polar.row_bytes()
|
||||
+ this->_tep.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AmoebaT::clear() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
damping.clear();
|
||||
sp_polar.clear();
|
||||
|
||||
this->clear_atomic();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double AmoebaT::host_memory_usage() const {
|
||||
return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::loop(const int eflag, const int vflag) {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int ainum=this->ans->inum();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
|
||||
this->k_polar.set_size(GX,BX);
|
||||
|
||||
this->k_polar.run(&this->atom->x, &this->atom->extra,
|
||||
&damping, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom,
|
||||
&_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
template class Amoeba<PRECISION,ACC_PRECISION>;
|
||||
}
|
||||
684
lib/gpu/lal_amoeba.cu
Normal file
684
lib/gpu/lal_amoeba.cu
Normal file
@ -0,0 +1,684 @@
|
||||
// **************************************************************************
|
||||
// amoeba.cu
|
||||
// -------------------
|
||||
// Trung Dac Nguyen (Northwestern)
|
||||
//
|
||||
// Device code for acceleration of the amoeba pair style
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// email : trung.nguyen@northwestern.edu
|
||||
// ***************************************************************************
|
||||
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include <stdio.h>
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifdef LAMMPS_SMALLBIG
|
||||
#define tagint int
|
||||
#endif
|
||||
#ifdef LAMMPS_BIGBIG
|
||||
#include "inttypes.h"
|
||||
#define tagint int64_t
|
||||
#endif
|
||||
#ifdef LAMMPS_SMALLSMALL
|
||||
#define tagint int
|
||||
#endif
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
#if (SHUFFLE_AVAIL == 0)
|
||||
|
||||
#define local_allocate_store_ufld() \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset, \
|
||||
i, tep) \
|
||||
if (t_per_atom>1) { \
|
||||
red_acc[0][tid]=ufld[0]; \
|
||||
red_acc[1][tid]=ufld[1]; \
|
||||
red_acc[2][tid]=ufld[2]; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<3; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
ufld[0]=red_acc[0][tid]; \
|
||||
ufld[1]=red_acc[1][tid]; \
|
||||
ufld[2]=red_acc[2][tid]; \
|
||||
red_acc[0][tid]=dufld[0]; \
|
||||
red_acc[1][tid]=dufld[1]; \
|
||||
red_acc[2][tid]=dufld[2]; \
|
||||
red_acc[3][tid]=dufld[3]; \
|
||||
red_acc[4][tid]=dufld[4]; \
|
||||
red_acc[5][tid]=dufld[5]; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
dufld[0]=red_acc[0][tid]; \
|
||||
dufld[1]=red_acc[1][tid]; \
|
||||
dufld[2]=red_acc[2][tid]; \
|
||||
dufld[3]=red_acc[3][tid]; \
|
||||
dufld[4]=red_acc[4][tid]; \
|
||||
dufld[5]=red_acc[5][tid]; \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
numtyp4 t; \
|
||||
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
|
||||
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
|
||||
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
|
||||
(numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3]; \
|
||||
t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] + \
|
||||
(numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1]; \
|
||||
tep[i]=t; \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define local_allocate_store_ufld()
|
||||
|
||||
#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset, \
|
||||
i, tep) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
ufld[0] += shfl_down(ufld[0], s, t_per_atom); \
|
||||
ufld[1] += shfl_down(ufld[1], s, t_per_atom); \
|
||||
ufld[2] += shfl_down(ufld[2], s, t_per_atom); \
|
||||
dufld[0] += shfl_down(dufld[0], s, t_per_atom); \
|
||||
dufld[1] += shfl_down(dufld[1], s, t_per_atom); \
|
||||
dufld[2] += shfl_down(dufld[2], s, t_per_atom); \
|
||||
dufld[3] += shfl_down(dufld[3], s, t_per_atom); \
|
||||
dufld[4] += shfl_down(dufld[4], s, t_per_atom); \
|
||||
dufld[5] += shfl_down(dufld[5], s, t_per_atom); \
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) { \
|
||||
numtyp4 t; \
|
||||
t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] + \
|
||||
(numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4]; \
|
||||
t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] + \
|
||||
(numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3]; \
|
||||
t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] + \
|
||||
(numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1]; \
|
||||
tep[i]=t; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define MIN(A,B) ((A) < (B) ? (A) : (B))
|
||||
#define MY_PIS (acctyp)1.77245385090551602729
|
||||
|
||||
__kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict extra,
|
||||
const __global numtyp4 *restrict damping,
|
||||
const __global numtyp4 *restrict sp_polar,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global numtyp4 *restrict tep,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nall, const int nbor_pitch, const int t_per_atom,
|
||||
const numtyp aewald, const numtyp felec,
|
||||
const numtyp off2, const numtyp polar_dscale,
|
||||
const numtyp polar_uscale)
|
||||
{
|
||||
int tid, ii, offset, i;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
int n_stride;
|
||||
local_allocate_store_ufld();
|
||||
local_allocate_store_charge();
|
||||
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, e_coul, virial[6];
|
||||
if (EVFLAG) {
|
||||
energy=(acctyp)0;
|
||||
e_coul=(acctyp)0;
|
||||
for (int l=0; l<6; l++) virial[l]=(acctyp)0;
|
||||
}
|
||||
|
||||
acctyp ufld[3];
|
||||
ufld[0] = (acctyp)0; ufld[1]=(acctyp)0; ufld[2]=(acctyp)0;
|
||||
acctyp dufld[6];
|
||||
for (int l=0; l<6; l++) dufld[l]=(acctyp)0;
|
||||
|
||||
numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
|
||||
numtyp4* polar1 = (numtyp4*)(&extra[0]);
|
||||
numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
|
||||
numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
|
||||
numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
|
||||
numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);
|
||||
|
||||
//numtyp4 xi__;
|
||||
|
||||
if (ii<inum) {
|
||||
int k,m,itype,igroup;
|
||||
numtyp bfac;
|
||||
numtyp psc3,psc5,psc7;
|
||||
numtyp dsc3,dsc5,dsc7;
|
||||
numtyp usc3,usc5;
|
||||
numtyp psr3,psr5,psr7;
|
||||
numtyp dsr3,dsr5,dsr7;
|
||||
numtyp usr5;
|
||||
numtyp term1,term2,term3;
|
||||
numtyp term4,term5;
|
||||
numtyp term6,term7;
|
||||
numtyp rc3[3],rc5[3],rc7[3];
|
||||
numtyp prc3[3],prc5[3],prc7[3];
|
||||
numtyp drc3[3],drc5[3],drc7[3];
|
||||
numtyp urc3[3],urc5[3];
|
||||
numtyp bn[5];
|
||||
numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
|
||||
|
||||
int numj, nbor, nbor_end;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
//numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
//int itype=ix.w;
|
||||
|
||||
ci = polar1[i].x; // rpole[i][0];
|
||||
dix = polar1[i].y; // rpole[i][1];
|
||||
diy = polar1[i].z; // rpole[i][2];
|
||||
diz = polar1[i].w; // rpole[i][3];
|
||||
qixx = polar2[i].x; // rpole[i][4];
|
||||
qixy = polar2[i].y; // rpole[i][5];
|
||||
qixz = polar2[i].z; // rpole[i][6];
|
||||
qiyy = polar2[i].w; // rpole[i][8];
|
||||
qiyz = polar3[i].x; // rpole[i][9];
|
||||
qizz = polar3[i].y; // rpole[i][12];
|
||||
itype = polar3[i].z; // amtype[i];
|
||||
igroup = polar3[i].w; // amgroup[i];
|
||||
uix = polar4[i].x; // uind[i][0];
|
||||
uiy = polar4[i].y; // uind[i][1];
|
||||
uiz = polar4[i].z; // uind[i][2];
|
||||
uixp = polar5[i].x; // uinp[i][0];
|
||||
uiyp = polar5[i].y; // uinp[i][1];
|
||||
uizp = polar5[i].z; // uinp[i][2];
|
||||
|
||||
// debug:
|
||||
// xi__ = ix; xi__.w = itype;
|
||||
|
||||
numtyp pdi = damping[itype].x;
|
||||
numtyp pti = damping[itype].y;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int jextra=dev_packed[nbor];
|
||||
int j = jextra & NEIGHMASK15;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
//int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp xr = jx.x - ix.x;
|
||||
numtyp yr = jx.y - ix.y;
|
||||
numtyp zr = jx.z - ix.z;
|
||||
numtyp r2 = xr*xr + yr*yr + zr*zr;
|
||||
|
||||
if (r2>off2) continue;
|
||||
|
||||
numtyp r = ucl_sqrt(r2);
|
||||
|
||||
numtyp ck = polar1[j].x; // rpole[j][0];
|
||||
numtyp dkx = polar1[j].y; // rpole[j][1];
|
||||
numtyp dky = polar1[j].z; // rpole[j][2];
|
||||
numtyp dkz = polar1[j].w; // rpole[j][3];
|
||||
numtyp qkxx = polar2[j].x; // rpole[j][4];
|
||||
numtyp qkxy = polar2[j].y; // rpole[j][5];
|
||||
numtyp qkxz = polar2[j].z; // rpole[j][6];
|
||||
numtyp qkyy = polar2[j].w; // rpole[j][8];
|
||||
numtyp qkyz = polar3[j].x; // rpole[j][9];
|
||||
numtyp qkzz = polar3[j].y; // rpole[j][12];
|
||||
int jtype = polar3[j].z; // amtype[j];
|
||||
int jgroup = polar3[j].w; // amgroup[j];
|
||||
numtyp ukx = polar4[j].x; // uind[j][0];
|
||||
numtyp uky = polar4[j].y; // uind[j][1];
|
||||
numtyp ukz = polar4[j].z; // uind[j][2];
|
||||
numtyp ukxp = polar5[j].x; // uinp[j][0];
|
||||
numtyp ukyp = polar5[j].y; // uinp[j][1];
|
||||
numtyp ukzp = polar5[j].z; // uinp[j][2];
|
||||
|
||||
numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
|
||||
const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
|
||||
factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
|
||||
if (igroup == jgroup) {
|
||||
factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
|
||||
factor_dscale = polar_dscale;
|
||||
factor_uscale = polar_uscale;
|
||||
} else {
|
||||
factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
|
||||
factor_dscale = factor_uscale = (numtyp)1.0;
|
||||
}
|
||||
|
||||
// intermediates involving moments and separation distance
|
||||
|
||||
numtyp dir = dix*xr + diy*yr + diz*zr;
|
||||
numtyp qix = qixx*xr + qixy*yr + qixz*zr;
|
||||
numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
|
||||
numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
|
||||
numtyp qir = qix*xr + qiy*yr + qiz*zr;
|
||||
numtyp dkr = dkx*xr + dky*yr + dkz*zr;
|
||||
numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
|
||||
numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
|
||||
numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
|
||||
numtyp qkr = qkx*xr + qky*yr + qkz*zr;
|
||||
numtyp uir = uix*xr + uiy*yr + uiz*zr;
|
||||
numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
|
||||
numtyp ukr = ukx*xr + uky*yr + ukz*zr;
|
||||
numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr;
|
||||
|
||||
// get reciprocal distance terms for this interaction
|
||||
|
||||
numtyp rinv = ucl_recip(r);
|
||||
numtyp r2inv = rinv*rinv;
|
||||
numtyp rr1 = felec * rinv;
|
||||
numtyp rr3 = rr1 * r2inv;
|
||||
numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
|
||||
numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
|
||||
numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
|
||||
|
||||
// calculate the real space Ewald error function terms
|
||||
|
||||
numtyp ralpha = aewald * r;
|
||||
numtyp exp2a = ucl_exp(-ralpha*ralpha);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
|
||||
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
|
||||
//bn[0] = erfc(ralpha) / r;
|
||||
bn[0] = _erfc * rinv;
|
||||
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
|
||||
numtyp alsq2n = (numtyp)0.0;
|
||||
if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
|
||||
|
||||
for (m = 1; m <= 4; m++) {
|
||||
bfac = (numtyp) (m+m-1);
|
||||
alsq2n = alsq2 * alsq2n;
|
||||
bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
|
||||
}
|
||||
for (m = 0; m < 5; m++) bn[m] *= felec;
|
||||
|
||||
// apply Thole polarization damping to scale factors
|
||||
|
||||
numtyp sc3 = (numtyp)1.0;
|
||||
numtyp sc5 = (numtyp)1.0;
|
||||
numtyp sc7 = (numtyp)1.0;
|
||||
for (k = 0; k < 3; k++) {
|
||||
rc3[k] = (numtyp)0.0;
|
||||
rc5[k] = (numtyp)0.0;
|
||||
rc7[k] = (numtyp)0.0;
|
||||
}
|
||||
|
||||
// apply Thole polarization damping to scale factors
|
||||
|
||||
numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
|
||||
if (damp != (numtyp)0.0) {
|
||||
numtyp pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
|
||||
damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
|
||||
if (damp < (numtyp)50.0) {
|
||||
numtyp expdamp = ucl_exp(-damp);
|
||||
sc3 = (numtyp)1.0 - expdamp;
|
||||
sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp;
|
||||
sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp;
|
||||
numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv;
|
||||
numtyp temp5 = damp;
|
||||
numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp;
|
||||
rc3[0] = xr * temp3;
|
||||
rc3[1] = yr * temp3;
|
||||
rc3[2] = zr * temp3;
|
||||
rc5[0] = rc3[0] * temp5;
|
||||
rc5[1] = rc3[1] * temp5;
|
||||
rc5[2] = rc3[2] * temp5;
|
||||
rc7[0] = rc5[0] * temp7;
|
||||
rc7[1] = rc5[1] * temp7;
|
||||
rc7[2] = rc5[2] * temp7;
|
||||
}
|
||||
|
||||
psc3 = (numtyp)1.0 - sc3*factor_pscale;
|
||||
psc5 = (numtyp)1.0 - sc5*factor_pscale;
|
||||
psc7 = (numtyp)1.0 - sc7*factor_pscale;
|
||||
dsc3 = (numtyp)1.0 - sc3*factor_dscale;
|
||||
dsc5 = (numtyp)1.0 - sc5*factor_dscale;
|
||||
dsc7 = (numtyp)1.0 - sc7*factor_dscale;
|
||||
usc3 = (numtyp)1.0 - sc3*factor_uscale;
|
||||
usc5 = (numtyp)1.0 - sc5*factor_uscale;
|
||||
psr3 = bn[1] - psc3*rr3;
|
||||
psr5 = bn[2] - psc5*rr5;
|
||||
psr7 = bn[3] - psc7*rr7;
|
||||
dsr3 = bn[1] - dsc3*rr3;
|
||||
dsr5 = bn[2] - dsc5*rr5;
|
||||
dsr7 = bn[3] - dsc7*rr7;
|
||||
usr5 = bn[2] - usc5*rr5;
|
||||
for (k = 0; k < 3; k++) {
|
||||
prc3[k] = rc3[k] * factor_pscale;
|
||||
prc5[k] = rc5[k] * factor_pscale;
|
||||
prc7[k] = rc7[k] * factor_pscale;
|
||||
drc3[k] = rc3[k] * factor_dscale;
|
||||
drc5[k] = rc5[k] * factor_dscale;
|
||||
drc7[k] = rc7[k] * factor_dscale;
|
||||
urc3[k] = rc3[k] * factor_uscale;
|
||||
urc5[k] = rc5[k] * factor_uscale;
|
||||
}
|
||||
} else { // damp == 0: ???
|
||||
}
|
||||
|
||||
// get the induced dipole field used for dipole torques
|
||||
|
||||
numtyp tix3 = psr3*ukx + dsr3*ukxp;
|
||||
numtyp tiy3 = psr3*uky + dsr3*ukyp;
|
||||
numtyp tiz3 = psr3*ukz + dsr3*ukzp;
|
||||
numtyp tuir = -psr5*ukr - dsr5*ukrp;
|
||||
|
||||
ufld[0] += tix3 + xr*tuir;
|
||||
ufld[1] += tiy3 + yr*tuir;
|
||||
ufld[2] += tiz3 + zr*tuir;
|
||||
|
||||
// get induced dipole field gradient used for quadrupole torques
|
||||
|
||||
numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp);
|
||||
numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp);
|
||||
numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp);
|
||||
tuir = -psr7*ukr - dsr7*ukrp;
|
||||
|
||||
dufld[0] += xr*tix5 + xr*xr*tuir;
|
||||
dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir;
|
||||
dufld[2] += yr*tiy5 + yr*yr*tuir;
|
||||
dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir;
|
||||
dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir;
|
||||
dufld[5] += zr*tiz5 + zr*zr*tuir;
|
||||
|
||||
// get the dEd/dR terms used for direct polarization force
|
||||
|
||||
term1 = bn[2] - dsc3*rr5;
|
||||
term2 = bn[3] - dsc5*rr7;
|
||||
term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0];
|
||||
term4 = rr3*drc3[0] - term1*xr - dsr5*xr;
|
||||
term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0];
|
||||
term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0];
|
||||
term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr;
|
||||
numtyp tixx = ci*term3 + dix*term4 + dir*term5 +
|
||||
(numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6;
|
||||
numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 +
|
||||
(numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
|
||||
|
||||
term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1];
|
||||
term4 = rr3*drc3[1] - term1*yr - dsr5*yr;
|
||||
term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1];
|
||||
term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1];
|
||||
term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr;
|
||||
numtyp tiyy = ci*term3 + diy*term4 + dir*term5 +
|
||||
(numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
|
||||
numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 +
|
||||
(numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;
|
||||
|
||||
term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2];
|
||||
term4 = rr3*drc3[2] - term1*zr - dsr5*zr;
|
||||
term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2];
|
||||
term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2];
|
||||
term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr;
|
||||
numtyp tizz = ci*term3 + diz*term4 + dir*term5 +
|
||||
(numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
|
||||
numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 +
|
||||
(numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;
|
||||
|
||||
term3 = term1*xr*yr - rr3*yr*drc3[0];
|
||||
term4 = rr3*drc3[0] - term1*xr;
|
||||
term5 = term2*xr*yr - rr5*yr*drc5[0];
|
||||
term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0];
|
||||
term7 = rr5*drc5[0] - term2*xr;
|
||||
numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 +
|
||||
(numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
|
||||
numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 +
|
||||
(numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6;
|
||||
|
||||
term3 = term1*xr*zr - rr3*zr*drc3[0];
|
||||
term5 = term2*xr*zr - rr5*zr*drc5[0];
|
||||
term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0];
|
||||
numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 +
|
||||
(numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
|
||||
numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 +
|
||||
(numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;
|
||||
|
||||
term3 = term1*yr*zr - rr3*zr*drc3[1];
|
||||
term4 = rr3*drc3[1] - term1*yr;
|
||||
term5 = term2*yr*zr - rr5*zr*drc5[1];
|
||||
term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1];
|
||||
term7 = rr5*drc5[1] - term2*yr;
|
||||
numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 +
|
||||
(numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
|
||||
numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 +
|
||||
(numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;
|
||||
|
||||
numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp;
|
||||
numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp;
|
||||
numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp;
|
||||
|
||||
numtyp frcx = depx;
|
||||
numtyp frcy = depy;
|
||||
numtyp frcz = depz;
|
||||
|
||||
// get the dEp/dR terms used for direct polarization force
|
||||
|
||||
// tixx and tkxx
|
||||
term1 = bn[2] - psc3*rr5;
|
||||
term2 = bn[3] - psc5*rr7;
|
||||
term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0];
|
||||
term4 = rr3*prc3[0] - term1*xr - psr5*xr;
|
||||
term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0];
|
||||
term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0];
|
||||
term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr;
|
||||
tixx = ci*term3 + dix*term4 + dir*term5 +
|
||||
(numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
|
||||
tkxx = ck*term3 - dkx*term4 - dkr*term5 +
|
||||
(numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
|
||||
|
||||
// tiyy and tkyy
|
||||
term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1];
|
||||
term4 = rr3*prc3[1] - term1*yr - psr5*yr;
|
||||
term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1];
|
||||
term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1];
|
||||
term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr;
|
||||
tiyy = ci*term3 + diy*term4 + dir*term5 +
|
||||
(numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
|
||||
tkyy = ck*term3 - dky*term4 - dkr*term5 +
|
||||
(numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;
|
||||
|
||||
// tizz and tkzz
|
||||
term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2];
|
||||
term4 = rr3*prc3[2] - term1*zr - psr5*zr;
|
||||
term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2];
|
||||
term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2];
|
||||
term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr;
|
||||
tizz = ci*term3 + diz*term4 + dir*term5 +
|
||||
(numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
|
||||
tkzz = ck*term3 - dkz*term4 - dkr*term5 +
|
||||
(numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;
|
||||
|
||||
// tixy and tkxy
|
||||
term3 = term1*xr*yr - rr3*yr*prc3[0];
|
||||
term4 = rr3*prc3[0] - term1*xr;
|
||||
term5 = term2*xr*yr - rr5*yr*prc5[0];
|
||||
term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0];
|
||||
term7 = rr5*prc5[0] - term2*xr;
|
||||
tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 +
|
||||
(numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
|
||||
tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 +
|
||||
(numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6;
|
||||
|
||||
// tixz and tkxz
|
||||
term3 = term1*xr*zr - rr3*zr*prc3[0];
|
||||
term5 = term2*xr*zr - rr5*zr*prc5[0];
|
||||
term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0];
|
||||
tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 +
|
||||
(numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
|
||||
tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 +
|
||||
(numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;
|
||||
|
||||
// tiyz and tkyz
|
||||
term3 = term1*yr*zr - rr3*zr*prc3[1];
|
||||
term4 = rr3*prc3[1] - term1*yr;
|
||||
term5 = term2*yr*zr - rr5*zr*prc5[1];
|
||||
term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1];
|
||||
term7 = rr5*prc5[1] - term2*yr;
|
||||
tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 +
|
||||
(numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
|
||||
tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 +
|
||||
(numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;
|
||||
|
||||
depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz;
|
||||
depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz;
|
||||
depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz;
|
||||
|
||||
frcx = frcx + depx;
|
||||
frcy = frcy + depy;
|
||||
frcz = frcz + depz;
|
||||
|
||||
// get the dtau/dr terms used for mutual polarization force
|
||||
// poltyp == MUTUAL && amoeba
|
||||
|
||||
term1 = bn[2] - usc3*rr5;
|
||||
term2 = bn[3] - usc5*rr7;
|
||||
term3 = usr5 + term1;
|
||||
term4 = rr3 * factor_uscale;
|
||||
term5 = -xr*term3 + rc3[0]*term4;
|
||||
term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0];
|
||||
tixx = uix*term5 + uir*term6;
|
||||
tkxx = ukx*term5 + ukr*term6;
|
||||
|
||||
term5 = -yr*term3 + rc3[1]*term4;
|
||||
term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1];
|
||||
tiyy = uiy*term5 + uir*term6;
|
||||
tkyy = uky*term5 + ukr*term6;
|
||||
|
||||
term5 = -zr*term3 + rc3[2]*term4;
|
||||
term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2];
|
||||
tizz = uiz*term5 + uir*term6;
|
||||
tkzz = ukz*term5 + ukr*term6;
|
||||
|
||||
term4 = -usr5 * yr;
|
||||
term5 = -xr*term1 + rr3*urc3[0];
|
||||
term6 = xr*yr*term2 - rr5*yr*urc5[0];
|
||||
tixy = uix*term4 + uiy*term5 + uir*term6;
|
||||
tkxy = ukx*term4 + uky*term5 + ukr*term6;
|
||||
|
||||
term4 = -usr5 * zr;
|
||||
term6 = xr*zr*term2 - rr5*zr*urc5[0];
|
||||
tixz = uix*term4 + uiz*term5 + uir*term6;
|
||||
tkxz = ukx*term4 + ukz*term5 + ukr*term6;
|
||||
|
||||
term5 = -yr*term1 + rr3*urc3[1];
|
||||
term6 = yr*zr*term2 - rr5*zr*urc5[1];
|
||||
tiyz = uiy*term4 + uiz*term5 + uir*term6;
|
||||
tkyz = uky*term4 + ukz*term5 + ukr*term6;
|
||||
|
||||
depx = tixx*ukxp + tixy*ukyp + tixz*ukzp
|
||||
+ tkxx*uixp + tkxy*uiyp + tkxz*uizp;
|
||||
depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp
|
||||
+ tkxy*uixp + tkyy*uiyp + tkyz*uizp;
|
||||
depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp
|
||||
+ tkxz*uixp + tkyz*uiyp + tkzz*uizp;
|
||||
|
||||
frcx = frcx + depx;
|
||||
frcy = frcy + depy;
|
||||
frcz = frcz + depz;
|
||||
|
||||
f.x -= frcx;
|
||||
f.y -= frcy;
|
||||
f.z -= frcz;
|
||||
|
||||
if (EVFLAG && vflag) {
|
||||
numtyp vxx = xr * frcx;
|
||||
numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy);
|
||||
numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz);
|
||||
numtyp vyy = yr * frcy;
|
||||
numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz);
|
||||
numtyp vzz = zr * frcz;
|
||||
|
||||
virial[0] += vxx;
|
||||
virial[1] += vyy;
|
||||
virial[2] += vzz;
|
||||
virial[3] += vxy;
|
||||
virial[4] += vxz;
|
||||
virial[5] += vyz;
|
||||
}
|
||||
} // nbor
|
||||
|
||||
} // ii<inum
|
||||
|
||||
// accumulate ufld and dufld to compute tep
|
||||
store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep);
|
||||
|
||||
// accumate force, energy and virial
|
||||
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
|
||||
offset,eflag,vflag,ans,engv);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
scan standard neighbor list and make it compatible with 1-5 neighbors
|
||||
if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
|
||||
else scan special15 to see if a 1-5 neighbor and adjust offset to SBBITS15
|
||||
else do nothing to IJ entry
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__kernel void k_special15(__global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global tagint *restrict tag,
|
||||
const __global int *restrict nspecial15,
|
||||
const __global tagint *restrict special15,
|
||||
const int inum, const int nall, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
int tid, ii, offset, n_stride, i;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
if (ii<inum) {
|
||||
|
||||
int numj, nbor, nbor_end;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
int n15 = nspecial15[ii];
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int sj=dev_packed[nbor];
|
||||
int which = sj >> SBBITS & 3;
|
||||
int j = sj & NEIGHMASK;
|
||||
tagint jtag = tag[j];
|
||||
|
||||
if (!which) {
|
||||
int offset=ii;
|
||||
for (int k=0; k<n15; k++) {
|
||||
if (special15[offset] == jtag) {
|
||||
which = 4;
|
||||
break;
|
||||
}
|
||||
offset += nall;
|
||||
}
|
||||
}
|
||||
|
||||
if (which) dev_nbor[nbor] = j ^ (which << SBBITS15);
|
||||
} // for nbor
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
87
lib/gpu/lal_amoeba.h
Normal file
87
lib/gpu/lal_amoeba.h
Normal file
@ -0,0 +1,87 @@
|
||||
/***************************************************************************
|
||||
amoeba.h
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the amoeba pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_AMOEBA_H
|
||||
#define LAL_AMOEBA_H
|
||||
|
||||
#include "lal_base_amoeba.h"
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class Amoeba : public BaseAmoeba<numtyp, acctyp> {
|
||||
public:
|
||||
Amoeba();
|
||||
~Amoeba();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, const int max_amtype, const double *host_pdamp,
|
||||
const double *host_thole, const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||
const double gpu_split, FILE *_screen,
|
||||
const double aewald, const double felec,
|
||||
const double off2, const double polar_dscale,
|
||||
const double polar_uscale);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage() const;
|
||||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// pdamp = damping.x; thole = damping.y
|
||||
UCL_D_Vec<numtyp4> damping;
|
||||
/// Special polar values [0-4]:
|
||||
/// sp_polar.x = special_polar_wscale
|
||||
/// sp_polar.y special_polar_pscale,
|
||||
/// sp_polar.z = special_polar_piscale
|
||||
UCL_D_Vec<numtyp4> sp_polar;
|
||||
|
||||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _aewald, _felec, _off2, _polar_dscale, _polar_uscale;
|
||||
numtyp _qqrd2e;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
int loop(const int eflag, const int vflag);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
142
lib/gpu/lal_amoeba_ext.cpp
Normal file
142
lib/gpu/lal_amoeba_ext.cpp
Normal file
@ -0,0 +1,142 @@
|
||||
/***************************************************************************
|
||||
amoeba_ext.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Functions for LAMMPS access to amoeba acceleration routines.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
#include "lal_amoeba.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace LAMMPS_AL;
|
||||
|
||||
static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int amoeba_gpu_init(const int ntypes, const int max_amtype,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15,
|
||||
const double cell_size, int &gpu_mode, FILE *screen,
|
||||
const double aewald, const double felec,
|
||||
const double off2, const double polar_dscale,
|
||||
const double polar_uscale, int& tep_size) {
|
||||
AMOEBAMF.clear();
|
||||
gpu_mode=AMOEBAMF.device->gpu_mode();
|
||||
double gpu_split=AMOEBAMF.device->particle_split();
|
||||
int first_gpu=AMOEBAMF.device->first_device();
|
||||
int last_gpu=AMOEBAMF.device->last_device();
|
||||
int world_me=AMOEBAMF.device->world_me();
|
||||
int gpu_rank=AMOEBAMF.device->gpu_rank();
|
||||
int procs_per_gpu=AMOEBAMF.device->procs_per_gpu();
|
||||
|
||||
tep_size=sizeof(PRECISION);
|
||||
|
||||
AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu);
|
||||
|
||||
bool message=false;
|
||||
if (AMOEBAMF.device->replica_me()==0 && screen)
|
||||
message=true;
|
||||
|
||||
if (message) {
|
||||
fprintf(screen,"Initializing GPU and compiling on process 0...");
|
||||
fflush(screen);
|
||||
}
|
||||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole,
|
||||
host_special_polar_wscale, host_special_polar_piscale,
|
||||
host_special_polar_pscale, nlocal, nall, max_nbors,
|
||||
maxspecial, maxspecial15, cell_size, gpu_split, screen,
|
||||
aewald, felec, off2, polar_dscale, polar_uscale);
|
||||
|
||||
AMOEBAMF.device->world_barrier();
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
|
||||
for (int i=0; i<procs_per_gpu; i++) {
|
||||
if (message) {
|
||||
if (last_gpu-first_gpu==0)
|
||||
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||
else
|
||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||
last_gpu,i);
|
||||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole,
|
||||
host_special_polar_wscale, host_special_polar_piscale,
|
||||
host_special_polar_pscale, nlocal, nall, max_nbors,
|
||||
maxspecial, maxspecial15, cell_size, gpu_split, screen,
|
||||
aewald, felec, off2, polar_dscale, polar_uscale);
|
||||
|
||||
AMOEBAMF.device->gpu_barrier();
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
fprintf(screen,"\n");
|
||||
|
||||
if (init_ok==0)
|
||||
AMOEBAMF.estimate_gpu_overhead();
|
||||
return init_ok;
|
||||
}
|
||||
|
||||
void amoeba_gpu_clear() {
|
||||
AMOEBAMF.clear();
|
||||
}
|
||||
|
||||
int** amoeba_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *host_amtype, int *host_amgroup,
|
||||
double **host_rpole, double **host_uind, double **host_uinp,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, int *nspecial15, tagint** special15,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, double *host_q, double *boxlo,
|
||||
double *prd, void **tep_ptr) {
|
||||
return AMOEBAMF.compute(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
|
||||
sublo, subhi, tag, nspecial, special, nspecial15, special15,
|
||||
eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q, boxlo, prd, tep_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
|
||||
double **host_rpole, double **host_uind, double **host_uinp,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success, double *host_q,
|
||||
const int nlocal, double *boxlo, double *prd, void **tep_ptr) {
|
||||
AMOEBAMF.compute(ago,inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
|
||||
ilist, numj, firstneigh, eflag, vflag, eatom, vatom,
|
||||
host_start, cpu_time, success, host_q, nlocal, boxlo, prd, tep_ptr);
|
||||
}
|
||||
|
||||
double amoeba_gpu_bytes() {
|
||||
return AMOEBAMF.host_memory_usage();
|
||||
}
|
||||
516
lib/gpu/lal_base_amoeba.cpp
Normal file
516
lib/gpu/lal_base_amoeba.cpp
Normal file
@ -0,0 +1,516 @@
|
||||
/***************************************************************************
|
||||
base_amoeba.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Base class for pair styles needing per-particle data for position,
|
||||
charge, and type.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#include "lal_base_amoeba.h"
|
||||
namespace LAMMPS_AL {
|
||||
#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) {
|
||||
device=&global_device;
|
||||
ans=new Answer<numtyp,acctyp>();
|
||||
nbor=new Neighbor();
|
||||
pair_program=nullptr;
|
||||
ucl_device=nullptr;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
pair_program_noev=nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseAmoebaT::~BaseAmoeba() {
|
||||
delete ans;
|
||||
delete nbor;
|
||||
k_polar.clear();
|
||||
k_special15.clear();
|
||||
if (pair_program) delete pair_program;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const {
|
||||
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
|
||||
nbor->bytes_per_atom(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const int maxspecial15,
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_name) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
|
||||
gpu_nbor=1;
|
||||
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
|
||||
int _gpu_host=0;
|
||||
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
||||
if (host_nlocal>0)
|
||||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
|
||||
bool charge = true;
|
||||
bool rot = false;
|
||||
bool vel = false;
|
||||
_extra_fields = 24; // round up to accomodate quadruples of numtyp values
|
||||
// rpole 13; uind 3; uinp 3; amtype, amgroup
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
if (ucl_device!=device->gpu) _compiled=false;
|
||||
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
_block_bio_size=device->block_bio_pair();
|
||||
compile_kernels(*ucl_device,pair_program,k_name);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
|
||||
// Initialize timers for the selected GPU
|
||||
time_pair.init(*ucl_device);
|
||||
time_pair.zero();
|
||||
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
q_tex.bind_float(atom->q,1);
|
||||
|
||||
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
_maxspecial=maxspecial;
|
||||
_maxspecial15=maxspecial15;
|
||||
|
||||
// allocate per-atom array tep
|
||||
|
||||
int ef_nall=nall;
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
|
||||
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) {
|
||||
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::clear_atomic() {
|
||||
// Output any timing information
|
||||
acc_timers();
|
||||
double avg_split=hd_balancer.all_avg_split();
|
||||
_gpu_overhead*=hd_balancer.timestep();
|
||||
_driver_overhead*=hd_balancer.timestep();
|
||||
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
|
||||
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
|
||||
|
||||
time_pair.clear();
|
||||
hd_balancer.clear();
|
||||
|
||||
nbor->clear();
|
||||
ans->clear();
|
||||
|
||||
_tep.clear();
|
||||
dev_nspecial15.clear();
|
||||
dev_special15.clear();
|
||||
dev_special15_t.clear();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy neighbor list from host
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
|
||||
int *numj, int **firstneigh, bool &success) {
|
||||
success=true;
|
||||
|
||||
int mn=nbor->max_nbor_loop(inum,numj,ilist);
|
||||
resize_atom(inum,nall,success);
|
||||
resize_local(inum,mn,success);
|
||||
if (!success)
|
||||
return nullptr;
|
||||
|
||||
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
|
||||
return ilist;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Build neighbor list on device
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
int *nspecial15, tagint **special15,
|
||||
bool &success) {
|
||||
success=true;
|
||||
resize_atom(inum,nall,success);
|
||||
resize_local(inum,host_inum,nbor->max_nbors(),success);
|
||||
if (!success)
|
||||
return;
|
||||
atom->cast_copy_x(host_x,host_type);
|
||||
|
||||
int mn;
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
|
||||
tag, nspecial, special, success, mn, ans->error_flag);
|
||||
|
||||
// add one-five neighbors
|
||||
|
||||
if (_maxspecial15>0) {
|
||||
UCL_H_Vec<int> view_nspecial15;
|
||||
UCL_H_Vec<tagint> view_special15;
|
||||
view_nspecial15.view(nspecial15,nall,*ucl_device);
|
||||
view_special15.view(special15[0],nall*_maxspecial15,*ucl_device);
|
||||
ucl_copy(dev_nspecial15,view_nspecial15,nall,false);
|
||||
ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false);
|
||||
nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall);
|
||||
|
||||
add_onefive_neighbors();
|
||||
}
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, double *host_q, const int nlocal,
|
||||
double *boxlo, double *prd, void **tep_ptr) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
|
||||
// ------------------- Resize _tep array ------------------------
|
||||
|
||||
if (nall>_max_tep_size) {
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_tep.resize(_max_tep_size*4);
|
||||
|
||||
dev_nspecial15.clear();
|
||||
dev_special15.clear();
|
||||
dev_special15_t.clear();
|
||||
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
|
||||
}
|
||||
|
||||
*tep_ptr=_tep.host.begin();
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
resize_atom(0,nall,success);
|
||||
zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
|
||||
if (!success)
|
||||
return;
|
||||
}
|
||||
|
||||
// packing host arrays into host_extra
|
||||
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
atom->cast_q_data(host_q);
|
||||
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
atom->add_q_data();
|
||||
atom->add_extra_data();
|
||||
|
||||
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
|
||||
boxlo, prd);
|
||||
|
||||
const int red_blocks=loop(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary and then compute forces, virials, energies
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
int *nspecial15, tagint **special15,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, double *host_q, double *boxlo,
|
||||
double *prd, void **tep_ptr) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
|
||||
// ------------------- Resize _tep array ------------------------
|
||||
|
||||
if (nall>_max_tep_size) {
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
_tep.resize(_max_tep_size*4);
|
||||
|
||||
dev_nspecial15.clear();
|
||||
dev_special15.clear();
|
||||
dev_special15_t.clear();
|
||||
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
|
||||
}
|
||||
*tep_ptr=_tep.host.begin();
|
||||
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
resize_atom(0,nall,success);
|
||||
zero_timers();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
hd_balancer.balance(cpu_time);
|
||||
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, nspecial15, special15,
|
||||
success);
|
||||
if (!success)
|
||||
return nullptr;
|
||||
atom->cast_q_data(host_q);
|
||||
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp);
|
||||
hd_balancer.start_timer();
|
||||
} else {
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
atom->cast_q_data(host_q);
|
||||
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
atom->add_q_data();
|
||||
atom->add_extra_data();
|
||||
|
||||
*ilist=nbor->host_ilist.begin();
|
||||
*jnum=nbor->host_acc.begin();
|
||||
|
||||
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
|
||||
boxlo, prd);
|
||||
|
||||
const int red_blocks=loop(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
// copy tep from device to host
|
||||
|
||||
_tep.update_host(_max_tep_size*4,false);
|
||||
/*
|
||||
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
|
||||
printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
|
||||
}
|
||||
*/
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double BaseAmoebaT::host_memory_usage_atomic() const {
|
||||
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
||||
4*sizeof(numtyp)+sizeof(BaseAmoeba<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
|
||||
double** uind, double** uinp) {
|
||||
int _nall=atom->nall();
|
||||
numtyp *pextra=reinterpret_cast<numtyp*>(&(atom->extra[0]));
|
||||
|
||||
int n = 0;
|
||||
int nstride = 4;
|
||||
for (int i = 0; i < _nall; i++) {
|
||||
int idx = n+i*nstride;
|
||||
pextra[idx] = rpole[i][0];
|
||||
pextra[idx+1] = rpole[i][1];
|
||||
pextra[idx+2] = rpole[i][2];
|
||||
pextra[idx+3] = rpole[i][3];
|
||||
}
|
||||
|
||||
n += nstride*_nall;
|
||||
for (int i = 0; i < _nall; i++) {
|
||||
int idx = n+i*nstride;
|
||||
pextra[idx] = rpole[i][4];
|
||||
pextra[idx+1] = rpole[i][5];
|
||||
pextra[idx+2] = rpole[i][6];
|
||||
pextra[idx+3] = rpole[i][8];
|
||||
}
|
||||
|
||||
n += nstride*_nall;
|
||||
for (int i = 0; i < _nall; i++) {
|
||||
int idx = n+i*nstride;
|
||||
pextra[idx] = rpole[i][9];
|
||||
pextra[idx+1] = rpole[i][12];
|
||||
pextra[idx+2] = (numtyp)amtype[i];
|
||||
pextra[idx+3] = (numtyp)amgroup[i];
|
||||
}
|
||||
|
||||
n += nstride*_nall;
|
||||
for (int i = 0; i < _nall; i++) {
|
||||
int idx = n+i*nstride;
|
||||
pextra[idx] = uind[i][0];
|
||||
pextra[idx+1] = uind[i][1];
|
||||
pextra[idx+2] = uind[i][2];
|
||||
}
|
||||
|
||||
n += nstride*_nall;
|
||||
for (int i = 0; i < _nall; i++) {
|
||||
int idx = n+i*nstride;
|
||||
pextra[idx] = uinp[i][0];
|
||||
pextra[idx+1] = uinp[i][1];
|
||||
pextra[idx+2] = uinp[i][2];
|
||||
}
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *kname) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
||||
if (pair_program) delete pair_program;
|
||||
pair_program=new UCL_Program(dev);
|
||||
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
||||
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
|
||||
k_polar.set_function(*pair_program,kname);
|
||||
k_special15.set_function(*pair_program,"k_special15");
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
q_tex.get_texture(*pair_program,"q_tex");
|
||||
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.has_subgroup_support()) {
|
||||
size_t mx_subgroup_sz = k_polar.max_subgroup_size(_block_size);
|
||||
if (_threads_per_atom > mx_subgroup_sz)
|
||||
_threads_per_atom = mx_subgroup_sz;
|
||||
device->set_simd_size(mx_subgroup_sz);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::add_onefive_neighbors() {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/
|
||||
(BX/_threads_per_atom)));
|
||||
|
||||
int _nall=atom->nall();
|
||||
int ainum=ans->inum();
|
||||
int nbor_pitch=nbor->nbor_pitch();
|
||||
|
||||
k_special15.set_size(GX,BX);
|
||||
k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(),
|
||||
&atom->dev_tag, &dev_nspecial15, &dev_special15,
|
||||
&ainum, &_nall, &nbor_pitch,
|
||||
&_threads_per_atom);
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
template class BaseAmoeba<PRECISION,ACC_PRECISION>;
|
||||
}
|
||||
225
lib/gpu/lal_base_amoeba.h
Normal file
225
lib/gpu/lal_base_amoeba.h
Normal file
@ -0,0 +1,225 @@
|
||||
/***************************************************************************
|
||||
base_amoeba.h
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Base class for pair styles needing per-particle data for position,
|
||||
charge, and type.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_BASE_AMOEBA_H
|
||||
#define LAL_BASE_AMOEBA_H
|
||||
|
||||
#include "lal_device.h"
|
||||
#include "lal_balance.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class BaseAmoeba {
|
||||
public:
|
||||
BaseAmoeba();
|
||||
virtual ~BaseAmoeba();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const void *pair_program, const char *k_name);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead(const int add_kernels=0);
|
||||
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
if (atom->resize(nall, success)) {
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
q_tex.bind_float(atom->q,1);
|
||||
}
|
||||
ans->resize(inum,success);
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
/** \param nlocal number of particles whose nbors must be stored on device
|
||||
* \param host_inum number of particles whose nbors need to copied to host
|
||||
* \param current maximum number of neighbors
|
||||
* \note olist_size=total number of local particles **/
|
||||
inline void resize_local(const int inum, const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,max_nbors,success);
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
/** \param nlocal number of particles whose nbors must be stored on device
|
||||
* \param host_inum number of particles whose nbors need to copied to host
|
||||
* \param current maximum number of neighbors
|
||||
* \note host_inum is 0 if the host is performing neighboring
|
||||
* \note nlocal+host_inum=total number local particles
|
||||
* \note olist_size=0 **/
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,host_inum,max_nbors,success);
|
||||
}
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear_atomic();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom_atomic(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage_atomic() const;
|
||||
|
||||
/// Accumulate timers
|
||||
inline void acc_timers() {
|
||||
if (device->time_device()) {
|
||||
nbor->acc_timers(screen);
|
||||
time_pair.add_to_total();
|
||||
atom->acc_timers();
|
||||
ans->acc_timers();
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero timers
|
||||
inline void zero_timers() {
|
||||
time_pair.zero();
|
||||
atom->zero_timers();
|
||||
ans->zero_timers();
|
||||
}
|
||||
|
||||
/// Copy neighbor list from host
|
||||
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
|
||||
int **firstneigh, bool &success);
|
||||
|
||||
/// Build neighbor list on device
|
||||
void build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, int *nspecial15, tagint **special15,
|
||||
bool &success);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole, double **host_uind,
|
||||
double **host_uinp, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success, double *charge,
|
||||
const int nlocal, double *boxlo, double *prd, void **tep_ptr);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int** compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole, double **host_uind,
|
||||
double **host_uinp, double *sublo, double *subhi,
|
||||
tagint *tag, int **nspecial, tagint **special,
|
||||
int *nspecial15, tagint **special15,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||
double *charge, double *boxlo, double *prd, void **tep_ptr);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
Device<numtyp,acctyp> *device;
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *ucl_device;
|
||||
|
||||
/// Device Timers
|
||||
UCL_Timer time_pair;
|
||||
|
||||
/// Host device load balancer
|
||||
Balance<numtyp,acctyp> hd_balancer;
|
||||
|
||||
/// LAMMPS pointer for screen output
|
||||
FILE *screen;
|
||||
|
||||
// --------------------------- ATOM DATA --------------------------
|
||||
|
||||
/// Atom Data
|
||||
Atom<numtyp,acctyp> *atom;
|
||||
|
||||
UCL_Vector<numtyp,numtyp> polar1, polar2, polar3, polar4, polar5;
|
||||
|
||||
/// cast host arrays into a single array for atom->extra
|
||||
void cast_extra_data(int* amtype, int* amgroup, double** rpole,
|
||||
double** uind, double** uinp);
|
||||
|
||||
/// Per-atom arrays
|
||||
UCL_Vector<numtyp,numtyp> _tep;
|
||||
int _max_tep_size;
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
|
||||
Answer<numtyp,acctyp> *ans;
|
||||
|
||||
// --------------------------- NBOR DATA ----------------------------
|
||||
|
||||
/// Neighbor data
|
||||
Neighbor *nbor;
|
||||
/// Device storage for 1-5 special neighbor counts
|
||||
UCL_D_Vec<int> dev_nspecial15;
|
||||
/// Device storage for special neighbors
|
||||
UCL_D_Vec<tagint> dev_special15, dev_special15_t;
|
||||
|
||||
int add_onefive_neighbors();
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_polar,k_special15;
|
||||
inline int block_size() { return _block_size; }
|
||||
inline void set_kernel(const int eflag, const int vflag) {}
|
||||
|
||||
// --------------------------- TEXTURES -----------------------------
|
||||
UCL_Texture pos_tex;
|
||||
UCL_Texture q_tex;
|
||||
|
||||
protected:
|
||||
bool _compiled;
|
||||
int _block_size, _block_bio_size, _threads_per_atom;
|
||||
int _extra_fields;
|
||||
double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
|
||||
|
||||
virtual int loop(const int eflag, const int vflag) = 0;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
bool charge = false;
|
||||
bool rot = false;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
|
||||
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
|
||||
bool charge = true;
|
||||
bool rot = false;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
|
||||
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
|
||||
bool charge = true;
|
||||
bool rot = true;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
|
||||
bool charge = false;
|
||||
bool rot = false;
|
||||
bool vel = true;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
||||
else
|
||||
_threads_per_atom=device->threads_per_three();
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
bool charge = false;
|
||||
bool rot = false;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -579,6 +579,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
time_nbor.stop();
|
||||
if (_time_device)
|
||||
time_nbor.add_to_total();
|
||||
|
||||
// on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial)
|
||||
// on the device, transpose the matrix (1-d array) for coalesced reads
|
||||
// dev_special[i][j] = the special i neighbor of atom j
|
||||
|
||||
time_transpose.start();
|
||||
const int b2x=_block_cell_2d;
|
||||
const int b2y=_block_cell_2d;
|
||||
@ -682,6 +687,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
if (_cutoff < _cell_size) vadjust*=1.46;
|
||||
mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
|
||||
if (mn<33) mn+=3;
|
||||
|
||||
resize_max_neighbors<numtyp,acctyp>(mn,success);
|
||||
set_nbor_block_size(mn/2);
|
||||
if (!success)
|
||||
@ -834,6 +840,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
time_nbor.stop();
|
||||
}
|
||||
|
||||
void Neighbor::transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
|
||||
const int columns_in, const int rows_in)
|
||||
{
|
||||
const int b2x=_block_cell_2d;
|
||||
const int b2y=_block_cell_2d;
|
||||
const int g2x=static_cast<int>(ceil(static_cast<double>(columns_in)/b2x));
|
||||
const int g2y=static_cast<int>(ceil(static_cast<double>(rows_in)/b2y));
|
||||
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
||||
_shared->k_transpose.run(&out, &in, &columns_in, &rows_in);
|
||||
}
|
||||
|
||||
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
|
||||
(double **x, const int inum, const int host_inum, const int nall,
|
||||
Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
|
||||
|
||||
@ -260,6 +260,10 @@ class Neighbor {
|
||||
return o.str();
|
||||
}
|
||||
|
||||
/// Helper function
|
||||
void transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
|
||||
const int columns_in, const int rows_in);
|
||||
|
||||
private:
|
||||
NeighborShared *_shared;
|
||||
UCL_Device *dev;
|
||||
|
||||
@ -44,6 +44,19 @@ _texture_2d( pos_tex,int4);
|
||||
#define LAL_USE_OLD_NEIGHBOR
|
||||
#endif
|
||||
|
||||
/*
|
||||
compute the id of the cell where the atoms belong to
|
||||
x: atom coordinates
|
||||
cell_id: cell ids
|
||||
particle_id:
|
||||
boxlo[0-2]: the lower left corner of the local box
|
||||
ncell[xyz]: the number of cells in xyz dims
|
||||
i_cell_size is the inverse cell size
|
||||
inum = the number of the local atoms that are ported to the device
|
||||
nall = the number of the local+ghost atoms that are ported to the device
|
||||
cells_in_cutoff = the number of cells that are within the cutoff
|
||||
*/
|
||||
|
||||
__kernel void calc_cell_id(const numtyp4 *restrict x_,
|
||||
unsigned *restrict cell_id,
|
||||
int *restrict particle_id,
|
||||
@ -86,6 +99,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
|
||||
}
|
||||
}
|
||||
|
||||
// compute the number of atoms in each cell
|
||||
|
||||
__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
|
||||
int *restrict cell_counts,
|
||||
int nall, int ncell) {
|
||||
|
||||
@ -357,7 +357,7 @@ class PairAmoeba : public Pair {
|
||||
|
||||
void polar();
|
||||
void polar_energy();
|
||||
void polar_real();
|
||||
virtual void polar_real();
|
||||
void polar_kspace();
|
||||
void damppole(double, int, double, double, double *, double *, double *);
|
||||
|
||||
|
||||
@ -41,6 +41,8 @@ action fix_npt_gpu.cpp
|
||||
action fix_nve_asphere_gpu.h fix_nve_asphere.h
|
||||
action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp
|
||||
action gpu_extra.h
|
||||
action pair_amoeba_gpu.cpp pair_amoeba.cpp
|
||||
action pair_amoeba_gpu.h pair_amoeba.h
|
||||
action pair_beck_gpu.cpp pair_beck.cpp
|
||||
action pair_beck_gpu.h pair_beck.h
|
||||
action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp
|
||||
|
||||
299
src/GPU/pair_amoeba_gpu.cpp
Normal file
299
src/GPU/pair_amoeba_gpu.cpp
Normal file
@ -0,0 +1,299 @@
|
||||
// clang-format off
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Trung Nguyen (Northwestern)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "pair_amoeba_gpu.h"
|
||||
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "domain.h"
|
||||
#include "error.h"
|
||||
#include "force.h"
|
||||
#include "gpu_extra.h"
|
||||
#include "neigh_list.h"
|
||||
#include "neigh_request.h"
|
||||
#include "neighbor.h"
|
||||
#include "suffix.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
// External functions from cuda library for atom decomposition
|
||||
|
||||
int amoeba_gpu_init(const int ntypes, const int max_amtype,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15,
|
||||
const double cell_size, int &gpu_mode, FILE *screen,
|
||||
const double aewald, const double felec,
|
||||
const double off2, const double polar_dscale,
|
||||
const double polar_uscale, int& tep_size);
|
||||
void amoeba_gpu_clear();
|
||||
|
||||
int ** amoeba_gpu_compute_n(const int ago, const int inum, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
|
||||
double **host_rpole, double **host_uind, double **host_uinp,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, int* nspecial15, tagint** special15,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, double *host_q, double *boxlo, double *prd,
|
||||
void **tep_ptr);
|
||||
void amoeba_gpu_compute(const int ago, const int inum,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *host_amtype, int *host_amgroup,
|
||||
double **host_rpole, double **host_uind, double **host_uinp,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start, const double cpu_time,
|
||||
bool &success, double *host_q, const int nlocal,
|
||||
double *boxlo, double *prd, void **tep_ptr);
|
||||
|
||||
double amoeba_gpu_bytes();
|
||||
|
||||
enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
|
||||
{
|
||||
respa_enable = 0;
|
||||
reinitflag = 0;
|
||||
cpu_time = 0.0;
|
||||
suffix_flag |= Suffix::GPU;
|
||||
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
free all arrays
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
PairAmoebaGPU::~PairAmoebaGPU()
|
||||
{
|
||||
amoeba_gpu_clear();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoebaGPU::polar_real()
|
||||
{
|
||||
int eflag=1, vflag=1;
|
||||
int nall = atom->nlocal + atom->nghost;
|
||||
int inum, host_start;
|
||||
|
||||
bool success = true;
|
||||
int *ilist, *numneigh, **firstneigh;
|
||||
if (gpu_mode != GPU_FORCE) {
|
||||
double sublo[3],subhi[3];
|
||||
if (domain->triclinic == 0) {
|
||||
sublo[0] = domain->sublo[0];
|
||||
sublo[1] = domain->sublo[1];
|
||||
sublo[2] = domain->sublo[2];
|
||||
subhi[0] = domain->subhi[0];
|
||||
subhi[1] = domain->subhi[1];
|
||||
subhi[2] = domain->subhi[2];
|
||||
} else {
|
||||
domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
|
||||
}
|
||||
inum = atom->nlocal;
|
||||
|
||||
firstneigh = amoeba_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
|
||||
atom->type, amtype, amgroup,
|
||||
rpole, uind, uinp, sublo, subhi,
|
||||
atom->tag, atom->nspecial, atom->special,
|
||||
atom->nspecial15, atom->special15,
|
||||
eflag, vflag, eflag_atom, vflag_atom,
|
||||
host_start, &ilist, &numneigh, cpu_time,
|
||||
success, atom->q, domain->boxlo,
|
||||
domain->prd, &tep_pinned);
|
||||
|
||||
} else {
|
||||
inum = list->inum;
|
||||
ilist = list->ilist;
|
||||
numneigh = list->numneigh;
|
||||
firstneigh = list->firstneigh;
|
||||
|
||||
amoeba_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
|
||||
amtype, amgroup, rpole, uind, uinp,
|
||||
ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
|
||||
vflag_atom, host_start, cpu_time, success, atom->q,
|
||||
atom->nlocal, domain->boxlo, domain->prd, &tep_pinned);
|
||||
}
|
||||
if (!success)
|
||||
error->one(FLERR,"Insufficient memory on accelerator");
|
||||
|
||||
// reference to the tep array from GPU lib
|
||||
|
||||
if (tep_single) {
|
||||
float *tep_ptr = (float *)tep_pinned;
|
||||
compute_force_from_tep<float>(tep_ptr);
|
||||
} else {
|
||||
double *tep_ptr = (double *)tep_pinned;
|
||||
compute_force_from_tep<double>(tep_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
init specific to this pair style
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr)
|
||||
{
|
||||
int i,ix,iy,iz;
|
||||
double ci,dix,diy,diz;
|
||||
double qixx,qixy,qixz;
|
||||
double qiyy,qiyz,qizz;
|
||||
double xix,yix,zix;
|
||||
double xiy,yiy,ziy;
|
||||
double xiz,yiz,ziz;
|
||||
double vxx,vyy,vzz;
|
||||
double vxy,vxz,vyz;
|
||||
double fix[3],fiy[3],fiz[3],tep[4];
|
||||
|
||||
double** x = atom->x;
|
||||
int nlocal = atom->nlocal;
|
||||
|
||||
for (i = 0; i < nlocal; i++) {
|
||||
dix = rpole[i][1];
|
||||
diy = rpole[i][2];
|
||||
diz = rpole[i][3];
|
||||
qixx = rpole[i][4];
|
||||
qixy = rpole[i][5];
|
||||
qixz = rpole[i][6];
|
||||
qiyy = rpole[i][8];
|
||||
qiyz = rpole[i][9];
|
||||
qizz = rpole[i][12];
|
||||
|
||||
tep[0] = tep_ptr[4*i];
|
||||
tep[1] = tep_ptr[4*i+1];
|
||||
tep[2] = tep_ptr[4*i+2];
|
||||
torque2force(i,tep,fix,fiy,fiz,fpolar);
|
||||
|
||||
iz = zaxis2local[i];
|
||||
ix = xaxis2local[i];
|
||||
iy = yaxis2local[i];
|
||||
|
||||
xiz = x[iz][0] - x[i][0];
|
||||
yiz = x[iz][1] - x[i][1];
|
||||
ziz = x[iz][2] - x[i][2];
|
||||
xix = x[ix][0] - x[i][0];
|
||||
yix = x[ix][1] - x[i][1];
|
||||
zix = x[ix][2] - x[i][2];
|
||||
xiy = x[iy][0] - x[i][0];
|
||||
yiy = x[iy][1] - x[i][1];
|
||||
ziy = x[iy][2] - x[i][2];
|
||||
|
||||
vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
|
||||
vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
|
||||
vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
|
||||
vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
|
||||
xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
|
||||
vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
|
||||
xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
|
||||
vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
|
||||
yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
|
||||
|
||||
virpolar[0] += vxx;
|
||||
virpolar[1] += vyy;
|
||||
virpolar[2] += vzz;
|
||||
virpolar[3] += vxy;
|
||||
virpolar[4] += vxz;
|
||||
virpolar[5] += vyz;
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
init specific to this pair style
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoebaGPU::init_style()
|
||||
{
|
||||
PairAmoeba::init_style();
|
||||
|
||||
if (gpu_mode == GPU_FORCE) {
|
||||
if (comm->me == 0)
|
||||
error->warning(FLERR,"Pair style amoeba/gpu does not support neigh no "
|
||||
"for now, automatically switching to neigh yes");
|
||||
gpu_mode = GPU_NEIGH;
|
||||
}
|
||||
|
||||
// Repeat cutsq calculation because done after call to init_style
|
||||
|
||||
double maxcut = -1.0;
|
||||
double cut;
|
||||
for (int i = 1; i <= atom->ntypes; i++) {
|
||||
for (int j = i; j <= atom->ntypes; j++) {
|
||||
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
|
||||
cut = init_one(i,j);
|
||||
cut *= cut;
|
||||
if (cut > maxcut)
|
||||
maxcut = cut;
|
||||
cutsq[i][j] = cutsq[j][i] = cut;
|
||||
} else
|
||||
cutsq[i][j] = cutsq[j][i] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
// select the cutoff (off2) for neighbor list builds (the polar term for now)
|
||||
|
||||
if (use_ewald) choose(POLAR_LONG);
|
||||
else choose(POLAR);
|
||||
|
||||
double cell_size = sqrt(off2) + neighbor->skin;
|
||||
|
||||
int maxspecial=0;
|
||||
int maxspecial15=0;
|
||||
if (atom->molecular != Atom::ATOMIC) {
|
||||
maxspecial=atom->maxspecial;
|
||||
maxspecial15=atom->maxspecial15;
|
||||
}
|
||||
|
||||
int tep_size;
|
||||
int mnf = 5e-2 * neighbor->oneatom;
|
||||
|
||||
// set the energy unit conversion factor for polar real-space calculation
|
||||
|
||||
double felec = 0.5 * electric / am_dielectric;
|
||||
|
||||
int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole,
|
||||
special_polar_wscale, special_polar_piscale,
|
||||
special_polar_pscale, atom->nlocal,
|
||||
atom->nlocal+atom->nghost, mnf, maxspecial,
|
||||
maxspecial15, cell_size, gpu_mode, screen,
|
||||
aewald, felec, off2, polar_dscale, polar_uscale,
|
||||
tep_size);
|
||||
GPU_EXTRA::check_flag(success,error,world);
|
||||
|
||||
if (tep_size == sizeof(double))
|
||||
tep_single = false;
|
||||
else
|
||||
tep_single = true;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
double PairAmoebaGPU::memory_usage()
|
||||
{
|
||||
double bytes = Pair::memory_usage();
|
||||
return bytes + amoeba_gpu_bytes();
|
||||
}
|
||||
63
src/GPU/pair_amoeba_gpu.h
Normal file
63
src/GPU/pair_amoeba_gpu.h
Normal file
@ -0,0 +1,63 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
// clang-format off
|
||||
PairStyle(amoeba/gpu,PairAmoebaGPU);
|
||||
// clang-format on
|
||||
#else
|
||||
|
||||
#ifndef LMP_PAIR_AMOEBA_GPU_H
|
||||
#define LMP_PAIR_AMOEBA_GPU_H
|
||||
|
||||
#include "pair_amoeba.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class PairAmoebaGPU : public PairAmoeba {
|
||||
public:
|
||||
PairAmoebaGPU(LAMMPS *lmp);
|
||||
~PairAmoebaGPU();
|
||||
void init_style();
|
||||
double memory_usage();
|
||||
|
||||
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
|
||||
|
||||
virtual void polar_real();
|
||||
|
||||
private:
|
||||
int gpu_mode;
|
||||
double cpu_time;
|
||||
void *tep_pinned;
|
||||
bool tep_single;
|
||||
|
||||
template<class numtyp>
|
||||
void compute_force_from_tep(const numtyp*);
|
||||
};
|
||||
|
||||
} // namespace LAMMPS_NS
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* ERROR/WARNING messages:
|
||||
|
||||
E: Insufficient memory on accelerator
|
||||
|
||||
There is insufficient memory on one of the devices specified for the gpu
|
||||
package
|
||||
|
||||
E: Pair style amoeba/gpu requires atom attribute q
|
||||
|
||||
The atom style defined does not have this attribute.
|
||||
|
||||
*/
|
||||
Reference in New Issue
Block a user