git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10669 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2013-08-23 14:46:25 +00:00
parent 22751a2aae
commit 205833eaad
5 changed files with 1573 additions and 0 deletions

225
lib/gpu/lal_beck.cu Normal file
View File

@ -0,0 +1,225 @@
// **************************************************************************
// beck.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the beck pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void k_beck(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict beck1,
const __global numtyp4 *restrict beck2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<beck2[mtype].z) {
numtyp r = ucl_sqrt(rsq);
numtyp r5 = rsq*rsq*r;
numtyp aaij = beck1[mtype].x;
numtyp alphaij = beck1[mtype].y;
numtyp betaij = beck1[mtype].z;
numtyp term1 = aaij*aaij + rsq;
numtyp term2 = pow(term1,(numtyp)-5.0);
numtyp term3 = (numtyp)21.672 + (numtyp)30.0*aaij*aaij + (numtyp)6.0*rsq;
numtyp term4 = alphaij + r5*betaij;
numtyp term5 = alphaij + (numtyp)6.0*r5*betaij;
numtyp rinv = ucl_recip(r);
numtyp force = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4)*term5;
force -= beck2[mtype].y*r*term2*term3;
force *= factor_lj*rinv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp term6 = pow(term1,(numtyp)-3);
numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_beck_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict beck1_in,
const __global numtyp4 *restrict beck2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
beck1[tid]=beck1_in[tid];
beck2[tid]=beck2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<beck2[mtype].z) {
numtyp r = ucl_sqrt(rsq);
numtyp r5 = rsq*rsq*r;
numtyp aaij = beck1[mtype].x;
numtyp alphaij = beck1[mtype].y;
numtyp betaij = beck1[mtype].z;
numtyp term1 = aaij*aaij + rsq;
numtyp term2 = pow(term1,(numtyp)-5.0);
numtyp term3 = (numtyp)21.672 + (numtyp)30.0*aaij*aaij + (numtyp)6.0*rsq;
numtyp term4 = alphaij + r5*betaij;
numtyp term5 = alphaij + (numtyp)6.0*r5*betaij;
numtyp rinv = ucl_recip(r);
numtyp force = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4)*term5;
force -= beck2[mtype].y*r*term2*term3;
force *= factor_lj*rinv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp term6 = pow(term1,(numtyp)-3);
numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

322
lib/gpu/lal_lj_coul_msm.cu Normal file
View File

@ -0,0 +1,322 @@
// **************************************************************************
// lj_coul_msm.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the lj/cut/coul/msm pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float> gcons_tex;
texture<float> dgcons_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int2> gcons_tex;
texture<int2> dgcons_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#define gcons_tex gcons
#define dgcons_tex dgcons
#endif
/* ----------------------------------------------------------------------
compute gamma for MSM and pair styles
see Eq 4 from Parallel Computing 35 (2009) 164–177
------------------------------------------------------------------------- */
ucl_inline numtyp gamma(const numtyp rho, const int order,
const __global numtyp *gcons) {
if (rho <= (numtyp)1.0) {
const int split_order = order/2;
const numtyp rho2 = rho*rho;
numtyp g; fetch(g,7*split_order+0,gcons_tex);
numtyp rho_n = rho2;
for (int n=1; n<=split_order; n++) {
numtyp tmp; fetch(tmp,7*split_order+n,gcons_tex);
g += tmp*rho_n;
rho_n *= rho2;
}
return g;
} else
return ((numtyp)1.0/rho);
}
/* ----------------------------------------------------------------------
compute the derivative of gamma for MSM and pair styles
see Eq 4 from Parallel Computing 35 (2009) 164-177
------------------------------------------------------------------------- */
ucl_inline numtyp dgamma(const numtyp rho, const int order,
const __global numtyp *dgcons) {
if (rho <= (numtyp)1.0) {
const int split_order = order/2;
const numtyp rho2 = rho*rho;
numtyp dg; fetch(dg,6*split_order+0,dgcons_tex);
dg *= rho;
numtyp rho_n = rho*rho2;
for (int n=1; n<split_order; n++) {
numtyp tmp; fetch(tmp,6*split_order+n,dgcons_tex);
dg += tmp*rho_n;
rho_n *= rho2;
}
return dg;
} else
return ((numtyp)-1.0/rho/rho);
}
__kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const __global numtyp *restrict gcons,
const __global numtyp *restrict dgcons,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const int order, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
numtyp cut_coul = ucl_sqrt(cut_coulsq);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, prefactor, egamma, fgamma;
if (rsq < lj1[mtype].w) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
numtyp rho = r/cut_coul;
egamma = (numtyp)1.0 - rho*gamma(rho, order, gcons);
fgamma = (numtyp)1.0 + (rsq/cut_coulsq)*dgamma(rho, order, dgcons);
forcecoul = prefactor * fgamma;
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(egamma-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict gcons,
const __global numtyp *restrict dgcons,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const int order, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp cut_coul = ucl_sqrt(cut_coulsq);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, prefactor, egamma, fgamma;
if (rsq < lj1[mtype].w) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
numtyp rho = r/cut_coul;
egamma = (numtyp)1.0 - rho*gamma(rho, order, gcons);
fgamma = (numtyp)1.0 + (rsq/cut_coulsq)*dgamma(rho, order, dgcons);
forcecoul = prefactor * fgamma;
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(egamma-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

203
lib/gpu/lal_mie.cu Normal file
View File

@ -0,0 +1,203 @@
// **************************************************************************
// mie.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the mie pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void k_mie(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict mie1,
const __global numtyp4 *restrict mie3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<mie3[mtype].w) {
numtyp r2inv = ucl_recip(rsq);
numtyp rgamA = pow(r2inv,(mie1[mtype].z/(numtyp)2.0));
numtyp rgamR = pow(r2inv,(mie1[mtype].w/(numtyp)2.0));
numtyp forcemie = (mie1[mtype].x*rgamR - mie1[mtype].y*rgamA);
numtyp force = factor_lj*forcemie*r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=(mie3[mtype].x*rgamR - mie3[mtype].y*rgamA) -
mie3[mtype].z;
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_mie_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict mie1_in,
const __global numtyp4 *restrict mie3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 mie1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 mie3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
mie1[tid]=mie1_in[tid];
mie3[tid]=mie3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<mie3[mtype].w) {
numtyp r2inv = ucl_recip(rsq);
numtyp rgamA = pow(r2inv,(mie1[mtype].z/(numtyp)2.0));
numtyp rgamR = pow(r2inv,(mie1[mtype].w/(numtyp)2.0));
numtyp forcemie = (mie1[mtype].x*rgamR - mie1[mtype].y*rgamA);
numtyp force = factor_lj*forcemie*r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=(mie3[mtype].x*rgamR - mie3[mtype].y*rgamA) -
mie3[mtype].z;
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

201
lib/gpu/lal_soft.cu Normal file
View File

@ -0,0 +1,201 @@
// **************************************************************************
// soft.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the soft pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
#define MY_PI (acctyp)3.14159265358979323846
__kernel void k_soft(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<coeff[mtype].z) {
numtyp force;
numtyp r = ucl_sqrt(rsq);
numtyp arg = MY_PI*r/coeff[mtype].y;
if (r > (numtyp)0.0) force = factor_lj * coeff[mtype].x *
sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r);
else force = (numtyp)0.0;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg));
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_soft_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff[tid]=coeff_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<coeff[mtype].z) {
numtyp force;
numtyp r = ucl_sqrt(rsq);
numtyp arg = MY_PI*r/coeff[mtype].y;
if (r > (numtyp)0.0) force = factor_lj * coeff[mtype].x *
sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r);
else force = (numtyp)0.0;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg));
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

622
lib/gpu/lal_sw.cu Normal file
View File

@ -0,0 +1,622 @@
// **************************************************************************
// sw.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the sw pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Tue March 26, 2013
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
#define THIRD (numtyp)0.66666667
#if (ARCH < 300)
#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[6][BLOCK_ELLIPSE]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=energy; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<4; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
energy=red_acc[3][tid]; \
if (vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
acctyp4 old=ans[ii]; \
old.x+=f.x; \
old.y+=f.y; \
old.z+=f.z; \
ans[ii]=old; \
}
#else
#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
energy += shfl_xor(energy, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
acctyp4 old=ans[ii]; \
old.x+=f.x; \
old.y+=f.y; \
old.z+=f.z; \
ans[ii]=old; \
}
#endif
__kernel void k_sw(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
const numtyp sw_cut, const numtyp sw_epsilon,
const numtyp sw_sigma, const numtyp sw_biga,
const numtyp sw_bigb, const numtyp sw_powerp,
const numtyp sw_powerq, const numtyp sw_cutsq) {
__local int n_stride;
__local numtyp pre_sw_c1, pre_sw_c2, pre_sw_c3, pre_sw_c4;
__local numtyp pre_sw_c5, pre_sw_c6;
pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
pow(sw_sigma,sw_powerp);
pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
pow(sw_sigma,sw_powerq);
pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp+(numtyp)1.0);
pre_sw_c4=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq+(numtyp)1.0);
pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp);
pre_sw_c6=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq);
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<sw_cutsq) {
numtyp r=ucl_sqrt(rsq);
numtyp rp=ucl_powr(r,-sw_powerp);
numtyp rq=ucl_powr(r,-sw_powerq);
numtyp rainv=ucl_recip(r-sw_cut);
numtyp expsrainv=ucl_exp(sw_sigma*rainv);
rainv*=rainv*r;
numtyp force = (pre_sw_c1*rp-pre_sw_c2*rq +
(pre_sw_c3*rp-pre_sw_c4*rq) * rainv)*
expsrainv*ucl_recip(rsq);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=(pre_sw_c5*rp - pre_sw_c6*rq) * expsrainv;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
#define threebody(delr1x, delr1y, delr1z, eflag, energy) \
{ \
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - sw_cut); \
numtyp gsrainv1 = sw_sigma_gamma * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rinvsq2 = ucl_recip(rsq2); \
numtyp rainv2 = ucl_recip(r2 - sw_cut); \
numtyp gsrainv2 = sw_sigma_gamma * rainv2; \
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - sw_costheta; \
numtyp delcssq = delcs*delcs; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = sw_lambda_epsilon * facexp*delcssq; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp frad2 = facrad*gsrainvsq2; \
numtyp facang = sw_lambda_epsilon2 * facexp*delcs; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
\
fjx = delr1x*(frad1+csfac1)-delr2x*facang12; \
fjy = delr1y*(frad1+csfac1)-delr2y*facang12; \
fjz = delr1z*(frad1+csfac1)-delr2z*facang12; \
\
numtyp csfac2 = rinvsq2*csfacang; \
\
fkx = delr2x*(frad2+csfac2)-delr1x*facang12; \
fky = delr2y*(frad2+csfac2)-delr1y*facang12; \
fkz = delr2z*(frad2+csfac2)-delr1z*facang12; \
\
if (eflag>0) \
energy+=facrad; \
if (vflag>0) { \
virial[0] += delr1x*fjx + delr2x*fkx; \
virial[1] += delr1y*fjy + delr2y*fky; \
virial[2] += delr1z*fjz + delr2z*fkz; \
virial[3] += delr1x*fjy + delr2x*fky; \
virial[4] += delr1x*fjz + delr2x*fkz; \
virial[5] += delr1y*fjz + delr2y*fkz; \
} \
}
#define threebody_half(delr1x, delr1y, delr1z) \
{ \
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - sw_cut); \
numtyp gsrainv1 = sw_sigma_gamma * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rainv2 = ucl_recip(r2 - sw_cut); \
numtyp gsrainv2 = sw_sigma_gamma * rainv2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - sw_costheta; \
numtyp delcssq = delcs*delcs; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = sw_lambda_epsilon * facexp*delcssq; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp facang = sw_lambda_epsilon2 * facexp*delcs; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
\
fjx = delr1x*(frad1+csfac1)-delr2x*facang12; \
fjy = delr1y*(frad1+csfac1)-delr2y*facang12; \
fjz = delr1z*(frad1+csfac1)-delr2z*facang12; \
}
__kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, const int evatom,
const numtyp sw_cut, const numtyp sw_epsilon,
const numtyp sw_sigma, const numtyp sw_lambda,
const numtyp sw_gamma, const numtyp sw_costheta,
const numtyp sw_cutsq) {
__local int tpa_sq, n_stride;
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
__local numtyp sw_lambda_epsilon2;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
sw_sigma_gamma=sw_sigma*sw_gamma;
sw_lambda_epsilon=sw_lambda*sw_epsilon;
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
int tid, ii, offset;
atom_info(tpa_sq,ii,tid,offset);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
const __global int *nbor_j, *list_end;
int i, numj;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,list_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
int j=*nbor_j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w;
// Compute r12
numtyp delr1x = jx.x-ix.x;
numtyp delr1y = jx.y-ix.y;
numtyp delr1z = jx.z-ix.z;
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
if (rsq1 > sw_cutsq) continue;
const __global int *nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
for ( ; nbor_k<list_end; nbor_k+=n_stride) {
int k=*nbor_k;
k &= NEIGHMASK;
numtyp4 kx; fetch4(kx,k,pos_tex);
numtyp delr2x = kx.x-ix.x;
numtyp delr2y = kx.y-ix.y;
numtyp delr2z = kx.z-ix.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw_cutsq) {
numtyp fjx, fjy, fjz, fkx, fky, fkz;
threebody(delr1x,delr1y,delr1z,eflag,energy);
f.x -= fjx + fkx;
f.y -= fjy + fky;
f.z -= fjz + fkz;
}
}
} // for nbor
numtyp pre;
if (evatom==1)
pre=THIRD;
else
pre=(numtyp)2.0;
energy*=pre;
for (int i=0; i<6; i++)
virial[i]*=pre;
store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset,
eflag,vflag,ans,engv);
} // if ii
}
__kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, const numtyp sw_cut,
const numtyp sw_epsilon, const numtyp sw_sigma,
const numtyp sw_lambda, const numtyp sw_gamma,
const numtyp sw_costheta, const numtyp sw_cutsq) {
__local int tpa_sq, n_stride;
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
__local numtyp sw_lambda_epsilon2;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
sw_sigma_gamma=sw_sigma*sw_gamma;
sw_lambda_epsilon=sw_lambda*sw_epsilon;
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
int tid, ii, offset;
atom_info(tpa_sq,ii,tid,offset);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
const __global int *nbor_j, *list_end, *k_end;
int i, numj;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,list_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
int j=*nbor_j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w;
// Compute r12
numtyp delr1x = ix.x-jx.x;
numtyp delr1y = ix.y-jx.y;
numtyp delr1z = ix.z-jx.z;
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
if (rsq1 > sw_cutsq) continue;
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
int numk=*nbor_k;
if (dev_nbor==dev_packed) {
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k+=nbor_pitch;
nbor_k=dev_packed+*nbor_k;
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=*nbor_k;
k &= NEIGHMASK;
if (k == i)
continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
numtyp delr2x = kx.x - jx.x;
numtyp delr2y = kx.y - jx.y;
numtyp delr2z = kx.z - jx.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw_cutsq) {
numtyp fjx, fjy, fjz;
//if (evatom==0) {
threebody_half(delr1x,delr1y,delr1z);
//} else {
// numtyp fkx, fky, fkz;
// threebody(delr1x,delr1y,delr1z,eflag,energy);
//}
f.x += fjx;
f.y += fjy;
f.z += fjz;
}
}
} // for nbor
#ifdef THREE_CONCURRENT
store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset,
eflag,vflag,ans,engv);
#else
store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset,
eflag,vflag,ans,engv);
#endif
} // if ii
}
__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, const numtyp sw_cut,
const numtyp sw_epsilon, const numtyp sw_sigma,
const numtyp sw_lambda, const numtyp sw_gamma,
const numtyp sw_costheta, const numtyp sw_cutsq) {
__local int tpa_sq, n_stride;
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
__local numtyp sw_lambda_epsilon2;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
sw_sigma_gamma=sw_sigma*sw_gamma;
sw_lambda_epsilon=sw_lambda*sw_epsilon;
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
int tid, ii, offset;
atom_info(tpa_sq,ii,tid,offset);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
const __global int *nbor_j, *list_end, *k_end;
int i, numj;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,list_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
int j=*nbor_j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w;
// Compute r12
numtyp delr1x = ix.x-jx.x;
numtyp delr1y = ix.y-jx.y;
numtyp delr1z = ix.z-jx.z;
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
if (rsq1 > sw_cutsq) continue;
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
int numk=*nbor_k;
if (dev_nbor==dev_packed) {
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k+=nbor_pitch;
nbor_k=dev_packed+*nbor_k;
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=*nbor_k;
k &= NEIGHMASK;
if (k == i)
continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
numtyp delr2x = kx.x - jx.x;
numtyp delr2y = kx.y - jx.y;
numtyp delr2z = kx.z - jx.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw_cutsq) {
numtyp fjx, fjy, fjz, fkx, fky, fkz;
threebody(delr1x,delr1y,delr1z,eflag,energy);
f.x += fjx;
f.y += fjy;
f.z += fjz;
}
}
} // for nbor
energy*=THIRD;
for (int i=0; i<6; i++)
virial[i]*=THIRD;
#ifdef THREE_CONCURRENT
store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset,
eflag,vflag,ans,engv);
#else
store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset,
eflag,vflag,ans,engv);
#endif
} // if ii
}