// ************************************************************************** // lj_tip4p_long.cu // ------------------- // V. Nikolskiy (HSE) // // Device code for acceleration of the lj/tip4p/long pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : thevsevak@gmail.com // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int #endif #ifdef LAMMPS_BIGBIG #ifdef USE_OPENCL #define tagint long #else #include "stdint.h" #define tagint int64_t #endif #endif #ifdef LAMMPS_SMALLSMALL #define tagint int #endif #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); _texture( q_tex,float); #else _texture_2d( pos_tex,int4); _texture( q_tex,int2); #endif #else #ifdef LAMMPS_SMALLBIG #define tagint int #endif #ifdef LAMMPS_BIGBIG #ifdef USE_OPENCL #define tagint long #else #include "stdint.h" #define tagint int64_t #endif #endif #ifdef LAMMPS_SMALLSMALL #define tagint int #endif #define pos_tex x_ #define q_tex q_ #endif /* ---------------------------------------------------------------------- GPU analogue of Atom::map inline method, but now limited to map_array mapping style. Map global ID to local index of atom. ---------------------------------------------------------------------- */ ucl_inline int atom_mapping(const __global int *map, tagint glob) { return map[glob]; } /* ---------------------------------------------------------------------- GPU version of Domain::closest_image(int, int) method. Return local index of atom J or any of its images that is closest to atom I if J is not a valid index like -1, just return it. ---------------------------------------------------------------------- */ ucl_inline int closest_image(int i, int j, const __global int* sametag, const __global numtyp4 *restrict x_) { if (j < 0) return j; numtyp4 xi; fetch4(xi,i,pos_tex); numtyp4 xj; fetch4(xj,j,pos_tex); int closest = j; numtyp delx = xi.x - xj.x; numtyp dely = xi.y - xj.y; numtyp delz = xi.z - xj.z; numtyp rsqmin = delx*delx + dely*dely + delz*delz; numtyp rsq; while (sametag[j] >= 0) { j = sametag[j]; fetch4(xj,j,pos_tex); delx = xi.x - xj.x; dely = xi.y - xj.y; delz = xi.z - xj.z; rsq = delx*delx + dely*dely + delz*delz; if (rsq < rsqmin) { rsqmin = rsq; closest = j; } } return closest; } /* ---------------------------------------------------------------------- For molecule that consists of atoms O, H1 and H2 compute position of virtual charge site xM (return parameter) ---------------------------------------------------------------------- */ ucl_inline void compute_newsite(int iO, int iH1, int iH2, __global numtyp4 *xM, numtyp q, numtyp alpha, const __global numtyp4 *restrict x_) { numtyp4 xO; fetch4(xO,iO,pos_tex); numtyp4 xH1; fetch4(xH1,iH1,pos_tex); numtyp4 xH2; fetch4(xH2,iH2,pos_tex); numtyp4 M; numtyp delx1 = xH1.x - xO.x; numtyp dely1 = xH1.y - xO.y; numtyp delz1 = xH1.z - xO.z; numtyp delx2 = xH2.x - xO.x; numtyp dely2 = xH2.y - xO.y; numtyp delz2 = xH2.z - xO.z; numtyp ap = alpha * (numtyp)0.5; M.x = xO.x + ap * (delx1 + delx2); M.y = xO.y + ap * (dely1 + dely2); M.z = xO.z + ap * (delz1 + delz2); M.w = q; *xM = M; } /* ---------------------------------------------------------------------- Compute resulting forces (ans), energies and virial (engv). An additional term is calculated based on the previously calculated values on the virlual sites (ansO), which should be distributed over the real atoms. For some hydrogens, the corresponding oxygens are not local atoms and the ansO value is not calculated. The required increase is calculated directly in the main function. ---------------------------------------------------------------------- */ __kernel void k_lj_tip4p_long_distrib( const __global numtyp4 *restrict x_, __global acctyp3 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const __global int *restrict hneigh, const __global numtyp4 *restrict m, const int typeO, const int typeH, const numtyp alpha, const __global numtyp *restrict q_, const __global acctyp4 *restrict ansO) { int i = BLOCK_ID_X*(BLOCK_SIZE_X)+THREAD_ID_X; acctyp3 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; if (i=0 && iH2>=0) { compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_); } else { m[iO] = ix; m[iO].w = qO; hneigh[i*4] = iO; hneigh[i*4+1] = iO; } } } } /* ---------------------------------------------------------------------- Compute initial value of force, energy and virial for each local particle. The values calculated on oxygens use the virtual charge position (m) and they are stored in a separate array (ansO) for further distribution in a separate kernel. For some hydrogens located on the boundary of the local region, oxygens are non-local and the contribution of oxygen is calculated separately in this kernel for them . ---------------------------------------------------------------------- */ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj3, const int lj_types, const __global numtyp *restrict sp_lj, const __global int * dev_nbor, const __global int * dev_packed, __global acctyp3 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, __global int *restrict hneigh, __global numtyp4 *restrict m, const int typeO, const int typeH, const numtyp alpha, const __global numtyp *restrict q_, const __global numtyp *restrict cutsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp cut_coulsq, const numtyp cut_coulsqplus, __global acctyp4 *restrict ansO) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); int n_stride; local_allocate_store_charge(); acctyp3 f; acctyp4 fO; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0; acctyp energy, e_coul, virial[6], vO[6]; if (EVFLAG) { energy = (acctyp)0; e_coul = (acctyp)0; for (int i=0; i<6; i++) { virial[i]=(acctyp)0; vO[i]=(acctyp)0; } } int i; if (ii= inum) { non_local_oxy = 1; } } for ( ; nbor1) { #if (SHUFFLE_AVAIL == 0) red_acc[0][tid]=fO.x; red_acc[1][tid]=fO.y; red_acc[2][tid]=fO.z; red_acc[3][tid]=fO.w; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { simdsync(); if (offset < s) { for (int r=0; r<4; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } fO.x=red_acc[0][tid]; fO.y=red_acc[1][tid]; fO.z=red_acc[2][tid]; fO.w=red_acc[3][tid]; if (EVFLAG && vflag) { simdsync(); for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { simdsync(); if (offset < s) { for (int r=0; r<6; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; } #else for (unsigned int s=t_per_atom/2; s>0; s>>=1) { fO.x += shfl_down(fO.x, s, t_per_atom); fO.y += shfl_down(fO.y, s, t_per_atom); fO.z += shfl_down(fO.z, s, t_per_atom); fO.w += shfl_down(fO.w, s, t_per_atom); } if (EVFLAG && vflag) { for (unsigned int s=t_per_atom/2; s>0; s>>=1) { for (int r=0; r<6; r++) vO[r] += shfl_down(vO[r], s, t_per_atom); } } #endif } if(offset == 0 && ii= inum) { non_local_oxy = 1; } } for ( ; nbor1) { #if (SHUFFLE_AVAIL == 0) red_acc[0][tid]=fO.x; red_acc[1][tid]=fO.y; red_acc[2][tid]=fO.z; red_acc[3][tid]=fO.w; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { simdsync(); if (offset < s) { for (int r=0; r<4; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } fO.x=red_acc[0][tid]; fO.y=red_acc[1][tid]; fO.z=red_acc[2][tid]; fO.w=red_acc[3][tid]; if (EVFLAG && vflag) { for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { simdsync(); if (offset < s) { for (int r=0; r<6; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; } #else for (unsigned int s=t_per_atom/2; s>0; s>>=1) { fO.x += shfl_down(fO.x, s, t_per_atom); fO.y += shfl_down(fO.y, s, t_per_atom); fO.z += shfl_down(fO.z, s, t_per_atom); fO.w += shfl_down(fO.w, s, t_per_atom); } if (EVFLAG && vflag) { for (unsigned int s=t_per_atom/2; s>0; s>>=1) { for (int r=0; r<6; r++) vO[r] += shfl_down(vO[r], s, t_per_atom); } } #endif } if(offset == 0) { ansO[i] = fO; if (EVFLAG && vflag) { ansO[inum + i].x = vO[0]; ansO[inum + i].y = vO[1]; ansO[inum + i].z = vO[2]; ansO[inum*2 + i].x = vO[3]; ansO[inum*2 + i].y = vO[4]; ansO[inum*2 + i].z = vO[5]; } } } // if ii store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, vflag,ans,engv); }