cuda forces + init var

This commit is contained in:
Eddy Barraud
2024-06-05 17:49:27 +02:00
parent 9ea57acf54
commit eb7f947a0c
5 changed files with 251 additions and 130 deletions

View File

@ -46,14 +46,15 @@ template <class numtyp, class acctyp>
int DPDChargedT::init(const int ntypes, int DPDChargedT::init(const int ntypes,
double **host_cutsq, double **host_a0, double **host_cutsq, double **host_a0,
double **host_gamma, double **host_sigma, double **host_gamma, double **host_sigma,
double **host_cut, double **host_cut_dpd, double **host_cut_dpdsq,
double **host_cut_dpd, double **host_cut_dpdsq, double **host_cut_slatersq, double **host_cut_slatersq, **host_scale,
double *host_special_lj, double *host_special_lj,
const bool tstat_only, const bool tstat_only,
const int nlocal, const int nall, const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const int max_nbors, const int maxspecial,
const double cell_size, const double cell_size,
const double gpu_split, FILE *_screen) { const double gpu_split, FILE *_screen, double *host_special_coul,
const double qqrd2e, const double g_ewald, double lamda) {
const int max_shared_types=this->device->max_shared_types(); const int max_shared_types=this->device->max_shared_types();
int onetype=0; int onetype=0;
@ -83,7 +84,28 @@ int DPDChargedT::init(const int ntypes,
lj_types=max_shared_types; lj_types=max_shared_types;
shared_types=true; shared_types=true;
} }
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write_coul(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write_coul[i]=0.0;
scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,scale,host_write_coul,host_scale);
sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write_coul[i]=host_special_coul[i];
}
ucl_copy(sp_cl,host_write_coul,4,false);
_lj_types=lj_types; _lj_types=lj_types;
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_lamda=lamda;
// Allocate a host write buffer for data initialization // Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
@ -103,7 +125,7 @@ int DPDChargedT::init(const int ntypes,
host_rsq[i]=0.0; host_rsq[i]=0.0;
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq,host_rsq,host_cutsq, this->atom->type_pack4(ntypes,lj_types,cutsq,host_rsq,host_cutsq,
host_cut_dpdsq, host_cut_dpd, host_cut_slatersq); host_cut_dpdsq, host_scale, host_cut_slatersq);
double special_sqrt[4]; double special_sqrt[4];
special_sqrt[0] = sqrt(host_special_lj[0]); special_sqrt[0] = sqrt(host_special_lj[0]);
@ -181,20 +203,23 @@ int DPDChargedT::loop(const int eflag, const int vflag) {
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_sel->set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, &sp_sqrt, this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, &sp_cl, &sp_sqrt,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq, &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq,
&this->_dtinvsqrt, &this->_seed, &this->_timestep, &this->_dtinvsqrt, &this->_seed, &this->_timestep,
&this->_tstat_only, &this->_threads_per_atom); &_qqrd2e, &_g_ewald, &_lamda,
&this->_tstat_only, &this->_threads_per_atom,);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &_lj_types, &sp_lj, &sp_sqrt, this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &_lj_types, &sp_lj, &sp_cl, &sp_sqrt,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt,
&this->_seed, &this->_timestep, &this->_tstat_only, &_qqrd2e, &_g_ewald, &_lamda,
&this->_threads_per_atom); &this->_seed, &this->_timestep,
&_qqrd2e, &_g_ewald, &_lamda,
&this->_tstat_only, &this->_threads_per_atom,);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX; return GX;

View File

@ -166,6 +166,7 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict coeff,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global numtyp *restrict sp_cl_in,
const __global numtyp *restrict sp_sqrt, const __global numtyp *restrict sp_sqrt,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
@ -176,19 +177,31 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict v_, const __global numtyp4 *restrict v_,
const __global numtyp4 *restrict cutsq, const __global numtyp4 *restrict cutsq,
const numtyp dtinvsqrt, const int seed, const numtyp dtinvsqrt, const int seed,
const int timestep, const int tstat_only, const int timestep, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp lamda,
const int tstat_only,
const int t_per_atom) { const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_cl[4];
int n_stride;
local_allocate_store_charge();
sp_cl[0]=sp_cl_in[0];
sp_cl[1]=sp_cl_in[1];
sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3];
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp3 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp e_coul, energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
energy=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0
for (int i=0; i<6; i++) virial[i]=(acctyp)0; for (int i=0; i<6; i++) virial[i]=(acctyp)0;
} }
@ -202,7 +215,8 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i]; numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i];
int itag=iv.w; int itag=iv.w;
const numtyp qtmp = extra[i].x; // q[i] numtyp qtmp = extra[i].x; // q[i]
numtyp lamdainv = ucl_recip(lamda);
numtyp factor_dpd, factor_sqrt; numtyp factor_dpd, factor_sqrt;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
@ -211,6 +225,8 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_dpd = sp_lj[sbmask(j)]; factor_dpd = sp_lj[sbmask(j)];
factor_sqrt = sp_sqrt[sbmask(j)]; factor_sqrt = sp_sqrt[sbmask(j)];
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -225,48 +241,88 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype; int mtype=itype*lj_types+jtype;
if (rsq<cutsq.x[mtype]) {
// cutsq[mtype].x -> global squared cutoff
if (rsq<cutsq[mtype].x) {
numtyp r=ucl_sqrt(rsq); numtyp r=ucl_sqrt(rsq);
if (r < EPSILON) continue; numtyp force_dpd = (numtyp)0.0;
numtyp force_coul = (numtyp)0.0;
numtyp rinv=ucl_recip(r); // apply DPD force if distance below DPD cutoff
numtyp delvx = iv.x - jv.x; // cutsq[mtype].y -> DPD squared cutoff
numtyp delvy = iv.y - jv.y; if (rsq < cutsq[mtype].y && r < EPSILON) {
numtyp delvz = iv.z - jv.z;
numtyp dot = delx*delvx + dely*delvy + delz*delvz;
numtyp wd = (numtyp)1.0 - r/coeff[mtype].w;
const numtyp qj = extra[j].x; numtyp rinv=ucl_recip(r);
numtyp delvx = iv.x - jv.x;
numtyp delvy = iv.y - jv.y;
numtyp delvz = iv.z - jv.z;
numtyp dot = delx*delvx + dely*delvy + delz*delvz;
numtyp wd = (numtyp)1.0 - r/coeff[mtype].w;
unsigned int tag1=itag, tag2=jtag; const numtyp qj = extra[j].x;
if (tag1 > tag2) {
tag1 = jtag; tag2 = itag; unsigned int tag1=itag, tag2=jtag;
if (tag1 > tag2) {
tag1 = jtag; tag2 = itag;
}
numtyp randnum = (numtyp)0.0;
saru(tag1, tag2, seed, timestep, randnum);
// conservative force = a0 * wd, or 0 if tstat only
// drag force = -gamma * wd^2 * (delx dot delv) / r
// random force = sigma * wd * rnd * dtinvsqrt;
if (!tstat_only) force_dpd = coeff[mtype].x*wd;
force_dpd -= coeff[mtype].y*wd*wd*dot*rinv;
force_dpd *= factor_dpd;
force_dpd += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
force_dpd *=rinv;
}
// apply Slater electrostatic force if distance below Slater cutoff
// and the two species have a slater coeff
// cutsq[mtype].w -> Coulombic squared cutoff
if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
numtyp r2inv=ucl_recip(rsq);
numtyp _erfc;
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
numtyp prefactor = extra[j].x;
prefactor *= qqrd2e * cutsq[mtype].z * qtmp/r;
numtyp rlamdainv = r * lamdainv;
numtyp exprlmdainv = ucl_exp((numtyp)-2.0*rlamdainv);
numtyp slater_term = exprlmdainv*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv)));
force_coul = prefactor*(_erfc + EWALD_F*grij*expm2-slater_term);
if (factor_coul > (numtyp)0) force_coul -= factor_coul*prefactor*((numtyp)1.0-slater_term);
force_coul *= r2inv;
} }
numtyp randnum = (numtyp)0.0; numtyp force = force_coul + force_dpd;
saru(tag1, tag2, seed, timestep, randnum);
// conservative force = a0 * wd, or 0 if tstat only
// drag force = -gamma * wd^2 * (delx dot delv) / r
// random force = sigma * wd * rnd * dtinvsqrt;
numtyp force = (numtyp)0.0;
if (!tstat_only) force = coeff[mtype].x*wd;
force -= coeff[mtype].y*wd*wd*dot*rinv;
force *= factor_dpd;
force += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
force*=rinv;
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (EVFLAG && eflag) { if (EVFLAG && eflag) {
// unshifted eng of conservative term:
// evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); if (rsq < cutsq[mtype].y && r < EPSILON) {
// eng shifted to 0.0 at cutoff // unshifted eng of conservative term:
numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
energy+=factor_dpd*e; // eng shifted to 0.0 at cutoff
numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd;
energy+=factor_dpd*e;
}
if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv;
numtyp e = prefactor*(_erfc-e_slater);
if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater);
e_coul += e;
}
} }
if (EVFLAG && vflag) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
@ -276,18 +332,22 @@ __kernel void k_dpd_charged(const __global numtyp4 *restrict x_,
virial[4] += delx*delz*force; virial[4] += delx*delz*force;
virial[5] += dely*delz*force; virial[5] += dely*delz*force;
} }
} }
} // for nbor } // for nbor
} // if ii } // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers_q(f,energy, e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} }
__kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_, __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict extra, const __global numtyp4 *restrict extra,
const __global numtyp4 *restrict coeff_in, const __global numtyp4 *restrict coeff_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global numtyp *restrict sp_cl_in,
const __global numtyp *restrict sp_sqrt_in, const __global numtyp *restrict sp_sqrt_in,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
@ -298,39 +358,41 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict v_, const __global numtyp4 *restrict v_,
const __global numtyp4 *restrict cutsq, const __global numtyp4 *restrict cutsq,
const numtyp dtinvsqrt, const int seed, const numtyp dtinvsqrt, const int seed,
const int timestep, const int tstat_only, const int timestep, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp lamda,
const int tstat_only,
const int t_per_atom) { const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
#ifndef ONETYPE
__local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
__local numtyp sp_sqrt[4]; __local numtyp sp_sqrt[4];
/// COUL Init
__local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_cl[4];
if (tid<4) { if (tid<4) {
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
sp_sqrt[tid]=sp_sqrt_in[tid]; sp_sqrt[tid]=sp_sqrt_in[tid];
sp_cl[tid]=sp_cl_in[tid];
} }
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff[tid]=coeff_in[tid]; coeff[tid]=coeff_in[tid];
scale[tid]=scale_in[tid];
} }
__syncthreads(); __syncthreads();
#else
const numtyp coeffx=coeff_in[ONETYPE].x;
const numtyp coeffy=coeff_in[ONETYPE].y;
const numtyp coeffz=coeff_in[ONETYPE].z;
const numtyp coeffw=coeff_in[ONETYPE].w;
const numtyp cutsq_p=cutsq[ONETYPE];
#endif
int n_stride; int n_stride;
local_allocate_store_pair(); local_allocate_store_pair();
acctyp3 f; acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6]; acctyp e_coul, energy, virial[6];
if (EVFLAG) { if (EVFLAG) {
energy=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0; for (int i=0; i<6; i++) virial[i]=(acctyp)0;
} }
@ -340,33 +402,26 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
#ifndef ONETYPE
int iw=ix.w; int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
#endif
numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i]; numtyp4 iv; fetch4(iv,i,vel_tex); //v_[i];
int itag=iv.w; int itag=iv.w;
const numtyp qi = extra[i].x; numtyp qtmp = extra[i].x; // q[i]
numtyp lamdainv = ucl_recip(lamda);
#ifndef ONETYPE
numtyp factor_dpd, factor_sqrt; numtyp factor_dpd, factor_sqrt;
#endif
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride); ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor]; int j=dev_packed[nbor];
#ifndef ONETYPE
factor_dpd = sp_lj[sbmask(j)]; factor_dpd = sp_lj[sbmask(j)];
factor_sqrt = sp_sqrt[sbmask(j)]; factor_sqrt = sp_sqrt[sbmask(j)];
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
#endif
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
#ifndef ONETYPE
int mtype=itype+jx.w;
const numtyp cutsq_p=cutsq[mtype];
#endif
numtyp4 jv; fetch4(jv,j,vel_tex); //v_[j]; numtyp4 jv; fetch4(jv,j,vel_tex); //v_[j];
int jtag=jv.w; int jtag=jv.w;
@ -376,64 +431,89 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_p) { int mtype=itype+jx.w;
// cutsq[mtype].x -> global squared cutoff
if (rsq<cutsq[mtype].x) {
numtyp r=ucl_sqrt(rsq); numtyp r=ucl_sqrt(rsq);
if (r < EPSILON) continue; numtyp force_dpd = (numtyp)0.0;
numtyp force_coul = (numtyp)0.0;
numtyp rinv=ucl_recip(r); // apply DPD force if distance below DPD cutoff
numtyp delvx = iv.x - jv.x; // cutsq[mtype].y -> DPD squared cutoff
numtyp delvy = iv.y - jv.y; if (rsq < cutsq[mtype].y && r < EPSILON) {
numtyp delvz = iv.z - jv.z;
numtyp dot = delx*delvx + dely*delvy + delz*delvz;
#ifndef ONETYPE
const numtyp coeffw=coeff[mtype].w;
#endif
numtyp wd = (numtyp)1.0 - r/coeffw;
const numtyp qj = extra[j].x; numtyp rinv=ucl_recip(r);
numtyp delvx = iv.x - jv.x;
unsigned int tag1=itag, tag2=jtag; numtyp delvy = iv.y - jv.y;
if (tag1 > tag2) { numtyp delvz = iv.z - jv.z;
tag1 = jtag; tag2 = itag; numtyp dot = delx*delvx + dely*delvy + delz*delvz;
numtyp wd = (numtyp)1.0 - r/coeff[mtype].w;
const numtyp qj = extra[j].x;
unsigned int tag1=itag, tag2=jtag;
if (tag1 > tag2) {
tag1 = jtag; tag2 = itag;
}
numtyp randnum = (numtyp)0.0;
saru(tag1, tag2, seed, timestep, randnum);
// conservative force = a0 * wd, or 0 if tstat only
// drag force = -gamma * wd^2 * (delx dot delv) / r
// random force = sigma * wd * rnd * dtinvsqrt;
if (!tstat_only) force_dpd = coeff[mtype].x*wd;
force_dpd -= coeff[mtype].y*wd*wd*dot*rinv;
force_dpd *= factor_dpd;
force_dpd += factor_sqrt*coeff[mtype].z*wd*randnum*dtinvsqrt;
force_dpd *=rinv;
}
// apply Slater electrostatic force if distance below Slater cutoff
// and the two species have a slater coeff
// cutsq[mtype].w -> Coulombic squared cutoff
if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
numtyp r2inv=ucl_recip(rsq);
numtyp _erfc;
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
numtyp prefactor = extra[j].x;
prefactor *= qqrd2e * cutsq[mtype].z * qtmp/r;
numtyp rlamdainv = r * lamdainv;
numtyp exprlmdainv = ucl_exp((numtyp)-2.0*rlamdainv);
numtyp slater_term = exprlmdainv*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv)));
force_coul = prefactor*(_erfc + EWALD_F*grij*expm2-slater_term);
if (factor_coul > (numtyp)0) force_coul -= factor_coul*prefactor*((numtyp)1.0-slater_term);
force_coul *= r2inv;
} }
numtyp randnum = (numtyp)0.0; numtyp force = force_coul + force_dpd;
saru(tag1, tag2, seed, timestep, randnum);
// conservative force = a0 * wd, or 0 if tstat only
// drag force = -gamma * wd^2 * (delx dot delv) / r
// random force = sigma * wd * rnd * dtinvsqrt;
#ifndef ONETYPE
const numtyp coeffx=coeff[mtype].x;
const numtyp coeffy=coeff[mtype].y;
const numtyp coeffz=coeff[mtype].z;
#endif
numtyp force = (numtyp)0.0;
if (!tstat_only) force = coeffx*wd;
force -= coeffy*wd*wd*dot*rinv;
#ifndef ONETYPE
force *= factor_dpd;
force += factor_sqrt*coeffz*wd*randnum*dtinvsqrt;
#else
force += coeffz*wd*randnum*dtinvsqrt;
#endif
force*=rinv;
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (EVFLAG && eflag) { if (EVFLAG && eflag) {
// unshifted eng of conservative term:
// evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); if (rsq < cutsq[mtype].y && r < EPSILON) {
// eng shifted to 0.0 at cutoff // unshifted eng of conservative term:
numtyp e = (numtyp)0.5*coeffx*coeffw * wd*wd; // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
#ifndef ONETYPE // eng shifted to 0.0 at cutoff
energy+=factor_dpd*e; numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd;
#else energy+=factor_dpd*e;
energy+=e; }
#endif if ( cutsq[mtype].w != 0.0 && rsq < cutsq[mtype].w){
numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv;
numtyp e = prefactor*(_erfc-e_slater);
if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater);
e_coul += e;
}
} }
if (EVFLAG && vflag) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
@ -443,6 +523,8 @@ __kernel void k_dpd_charged_fast(const __global numtyp4 *restrict x_,
virial[4] += delx*delz*force; virial[4] += delx*delz*force;
virial[5] += dely*delz*force; virial[5] += dely*delz*force;
} }
} }
} // for nbor } // for nbor

View File

@ -38,11 +38,12 @@ class DPDCharged : public BaseDPD<numtyp, acctyp> {
* - -4 if the GPU library was not compiled for GPU * - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, double **host_a0, int init(const int ntypes, double **host_cutsq, double **host_a0,
double **host_gamma, double **host_sigma, double **host_cut, double **host_gamma, double **host_sigma,
double **host_cut_dpd, double **host_cut_dpdsq, double **host_cut_slatersq, double **host_cut_dpd, double **host_cut_dpdsq, double **host_cut_slatersq,
double *host_special_lj, bool tstat_only, const int nlocal, double **host_scale, double *host_special_lj, bool tstat_only, const int nlocal,
const int nall, const int max_nbors, const int maxspecial, const int nall, const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *screen); const double cell_size, const double gpu_split, FILE *screen,
const double qqrd2e, const double g_ewald, const double lamda);
/// Clear all host and device data /// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/ /** \note This is called at the beginning of the init() routine **/
@ -63,12 +64,15 @@ class DPDCharged : public BaseDPD<numtyp, acctyp> {
/// coeff.x = a0, coeff.y = gamma, coeff.z = sigma, coeff.w = cut_dpd /// coeff.x = a0, coeff.y = gamma, coeff.z = sigma, coeff.w = cut_dpd
UCL_D_Vec<numtyp4> coeff; UCL_D_Vec<numtyp4> coeff;
/// cutsq.x = cutsq, cutsq.y = cut_dpd_sq, cutsq.z = cut_dpd, cutsq.w = cut_slatersq /// cutsq.x = cutsq, cutsq.y = cut_dpd_sq, cutsq.z = scale, cutsq.w = cut_slatersq
UCL_D_Vec<numtyp4> cutsq; UCL_D_Vec<numtyp4> cutsq;
/// Special LJ values /// Special LJ values
UCL_D_Vec<numtyp> sp_lj, sp_sqrt; UCL_D_Vec<numtyp> sp_lj, sp_sqrt;
/// Special Coul values [0-3]
UCL_D_Vec<numtyp> sp_cl;
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
@ -78,6 +82,9 @@ class DPDCharged : public BaseDPD<numtyp, acctyp> {
/// Only used for thermostat /// Only used for thermostat
int _tstat_only; int _tstat_only;
/// Coulombic terms
numtyp _cut_coulsq, _qqrd2e, _g_ewald, _lamda;
/// pointer to host data for atom charge /// pointer to host data for atom charge
double *q; double *q;

View File

@ -28,11 +28,12 @@ static DPDCharged<PRECISION,ACC_PRECISION> DPDCMF;
// Allocate memory on host and device and copy constants to device // Allocate memory on host and device and copy constants to device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0, int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0,
double **host_gamma, double **host_sigma, double **host_cut, double **host_gamma, double **host_sigma,
double **host_cut_dpd, double **host_cut_slater, double **host_cut_dpd, double **host_cut_dpd_sq, double **host_cut_slatersq,
double *special_lj, const int inum, double **host_scale, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial, const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) { const double cell_size, int &gpu_mode, FILE *screen, double *host_special_coul,
const double qqrd2e, const double g_ewald, const double lamda) {
DPDCMF.clear(); DPDCMF.clear();
gpu_mode=DPDCMF.device->gpu_mode(); gpu_mode=DPDCMF.device->gpu_mode();
double gpu_split=DPDCMF.device->particle_split(); double gpu_split=DPDCMF.device->particle_split();
@ -56,8 +57,10 @@ int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=DPDCMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, init_ok=DPDCMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma,
host_cut, host_cut_dpd, host_cut_dpdsq, host_cut_slatersq, special_lj, false, inum, nall, max_nbors, host_cut_dpd, host_cut_dpdsq, host_cut_slatersq,
maxspecial, cell_size, gpu_split, screen); host_scale, special_lj, false, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen,
force->special_coul,->qqrd2e, g_ewald, lamda);
DPDCMF.device->world_barrier(); DPDCMF.device->world_barrier();
if (message) if (message)
@ -74,8 +77,10 @@ int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0,
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=DPDCMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, init_ok=DPDCMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma,
host_cut, host_cut_dpd, host_cut_slater, special_lj, false, inum, nall, max_nbors, host_cut_dpd, host_cut_dpdsq, host_cut_slatersq,
maxspecial, cell_size, gpu_split, screen); host_scale, special_lj, false, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen,
force->special_coul,force->qqrd2e, g_ewald, lamda);
DPDCMF.device->serialize_init(); DPDCMF.device->serialize_init();
if (message) if (message)

View File

@ -39,10 +39,11 @@ using namespace EwaldConst;
// External functions from cuda library for atom decomposition // External functions from cuda library for atom decomposition
int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0, double **host_gamma, int dpd_charged_gpu_init(const int ntypes, double **cutsq, double **host_a0, double **host_gamma,
double **host_sigma, double **host_cut, double **host_cut_dpd, double **host_cut_slater, double *special_lj, const int inum, double **host_sigma, double **host_cut_dpd, double **host_cut_dpd_sq, double **host_cut_slatersq,
double **host_scale, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial, const double cell_size, const int nall, const int max_nbors, const int maxspecial, const double cell_size,
int &gpu_mode, FILE *screen, int &gpu_mode, FILE *screen, double *host_special_coul,
double *host_special_coul, const double qqrd2e, const double g_ewald, const double lamda); const double qqrd2e, const double g_ewald, const double lamda);
void dpd_charged_gpu_clear(); void dpd_charged_gpu_clear();
int **dpd_charged_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int **dpd_charged_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x,
int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial,
@ -313,8 +314,9 @@ void PairDPDChargedGPU::init_style()
if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
int mnf = 5e-2 * neighbor->oneatom; int mnf = 5e-2 * neighbor->oneatom;
int success = int success =
dpd_charged_gpu_init(atom->ntypes + 1, cutsq, a0, gamma, sigma, cut, dpd_charged_gpu_init(atom->ntypes + 1, cutsq, a0, gamma, sigma,
cut_dpd, cut_dpdsq, cut_slatersq, force->special_lj, atom->nlocal, cut_dpd, cut_dpdsq, cut_slatersq, scale,
force->special_lj, atom->nlocal,
atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen,
force->special_coul, force->qqrd2e, g_ewald, lamda); force->special_coul, force->qqrd2e, g_ewald, lamda);
GPU_EXTRA::check_flag(success, error, world); GPU_EXTRA::check_flag(success, error, world);