// ************************************************************************** // dipole_lj.cu // ------------------- // Trung Dac Nguyen (ORNL) // // Device code for acceleration of the dipole/cut pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : nguyentd@ornl.gov // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); _texture( q_tex,float); _texture( mu_tex,float4); #else _texture_2d( pos_tex,int4); _texture( q_tex,int2); _texture_2d( mu_tex,int4); #endif #else #define pos_tex x_ #define q_tex q_ #define mu_tex mu_ #endif #if (SHUFFLE_AVAIL == 0) #define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ t_per_atom, offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ tor.x, tor.y, tor.z); \ if (EVFLAG && (vflag==2 || eflag==2)) { \ if (eflag) { \ simdsync(); \ simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ if (vflag) { \ simdsync(); \ simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ if (offset==0 && ii1) { \ simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ if (vflag==2 || eflag==2) { \ if (eflag) \ simd_reduce_add2(t_per_atom,energy,e_coul); \ if (vflag) \ simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ if (offset==0 && ii 1; active_subgs /= vwidth) { \ if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ if (bnum < active_subgs) { \ if (eflag) { \ simd_reduce_add2(vwidth, energy, e_coul); \ if (voffset==0) { \ red_acc[6][bnum] = energy; \ red_acc[7][bnum] = e_coul; \ } \ } \ if (vflag) { \ simd_reduce_arr(6, vwidth, virial); \ if (voffset==0) \ for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ } \ } \ \ __syncthreads(); \ if (tid < active_subgs) { \ if (eflag) { \ energy = red_acc[6][tid]; \ e_coul = red_acc[7][tid]; \ } \ if (vflag) \ for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ } else { \ if (eflag) energy = e_coul = (acctyp)0; \ if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ } \ } \ \ if (bnum == 0) { \ int ei=BLOCK_ID_X; \ const int ev_stride=NUM_BLOCKS_X; \ if (eflag) { \ simd_reduce_add2(vwidth, energy, e_coul); \ if (tid==0) { \ engv[ei]=energy*(acctyp)0.5; \ ei+=ev_stride; \ engv[ei]=e_coul*(acctyp)0.5; \ ei+=ev_stride; \ } \ } \ if (vflag) { \ simd_reduce_arr(6, vwidth, virial); \ if (tid==0) { \ for (int r=0; r<6; r++) { \ engv[ei]=virial[r]*(acctyp)0.5; \ ei+=ev_stride; \ } \ } \ } \ } \ } else if (offset==0 && ii1) \ simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ if (offset==0 && ii (numtyp)0.0 && muj.w > (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr; pre2 = (numtyp)3.0*r5inv*pjdotr; pre3 = (numtyp)3.0*r5inv*pidotr; pre4 = (numtyp)(-1.0)*r3inv; forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); } // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; ticoul.x += pre2 * (mui.y*delz - mui.z*dely); ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; } } else { forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; ticoul.x = ticoul.y = ticoul.z = (acctyp)0; } numtyp fq = factor_coul*qqrd2e; force.x = fq*forcecoul.x + delx*force_lj; force.y = fq*forcecoul.y + dely*force_lj; force.z = fq*forcecoul.z + delz*force_lj; f.x+=force.x; f.y+=force.y; f.z+=force.z; tor.x+=fq*ticoul.x; tor.y+=fq*ticoul.y; tor.z+=fq*ticoul.z; if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; e *= fq; } else e = (acctyp)0.0; e_coul += e; if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } } if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; virial[3] += delx*force.y; virial[4] += delx*force.z; virial[5] += dely*force.z; } } } // for nbor } // if ii store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, eflag,vflag,ans,engv); } __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp3 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; int n_stride; local_allocate_store_charge(); if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid (numtyp)0.0 && muj.w > (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr; pre2 = (numtyp)3.0*r5inv*pjdotr; pre3 = (numtyp)3.0*r5inv*pidotr; pre4 = (numtyp)(-1.0)*r3inv; forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); } // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; ticoul.x += pre2 * (mui.y*delz - mui.z*dely); ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; } } else { forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; ticoul.x = ticoul.y = ticoul.z = (acctyp)0; } numtyp fq = factor_coul*qqrd2e; force.x = fq*forcecoul.x + delx*force_lj; force.y = fq*forcecoul.y + dely*force_lj; force.z = fq*forcecoul.z + delz*force_lj; f.x+=force.x; f.y+=force.y; f.z+=force.z; tor.x+=fq*ticoul.x; tor.y+=fq*ticoul.y; tor.z+=fq*ticoul.z; if (EVFLAG && eflag) { acctyp e = (acctyp)0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; e *= fq; } else e = (acctyp)0; e_coul += e; if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } } if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; virial[3] += delx*force.y; virial[4] += delx*force.z; virial[5] += dely*force.z; } } } // for nbor } // if ii store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, eflag,vflag,ans,engv); }