// ************************************************************************** // lj_tip4p_long.cu // ------------------- // V. Nikolskiy (HSE) // // Device code for acceleration of the lj/tip4p/long pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : thevsevak@gmail.com // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int #endif #ifdef LAMMPS_BIGBIG #ifdef USE_OPENCL #define tagint long #else #include "stdint.h" #define tagint int64_t #endif #endif #ifdef LAMMPS_SMALLSMALL #define tagint int #endif #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); _texture( q_tex,float); #else _texture_2d( pos_tex,int4); _texture( q_tex,int2); #endif #else #ifdef LAMMPS_SMALLBIG #define tagint int #endif #ifdef LAMMPS_BIGBIG #ifdef USE_OPENCL #define tagint long #else #include "stdint.h" #define tagint int64_t #endif #endif #ifdef LAMMPS_SMALLSMALL #define tagint int #endif #define pos_tex x_ #define q_tex q_ #endif ucl_inline int atom_mapping(const __global int *map, tagint glob) { return map[glob]; } ucl_inline int closest_image(int i, int j, const __global int* sametag, const __global numtyp4 *restrict x_) { if (j < 0) return j; numtyp4 xi; fetch4(xi,i,pos_tex); // = x[i]; numtyp4 xj; fetch4(xj,j,pos_tex); int closest = j; numtyp delx = xi.x - xj.x; numtyp dely = xi.y - xj.y; numtyp delz = xi.z - xj.z; numtyp rsqmin = delx*delx + dely*dely + delz*delz; numtyp rsq; while (sametag[j] >= 0) { j = sametag[j]; fetch4(xj,j,pos_tex); delx = xi.x - xj.x; dely = xi.y - xj.y; delz = xi.z - xj.z; rsq = delx*delx + dely*dely + delz*delz; if (rsq < rsqmin) { rsqmin = rsq; closest = j; } } return closest; } ucl_inline void compute_newsite(int iO, int iH1, int iH2, __global numtyp4 *xM, numtyp q, numtyp alpha, const __global numtyp4 *restrict x_) { numtyp4 xO; fetch4(xO,iO,pos_tex); numtyp4 xH1; fetch4(xH1,iH1,pos_tex); numtyp4 xH2; fetch4(xH2,iH2,pos_tex); numtyp4 M; numtyp delx1 = xH1.x - xO.x; numtyp dely1 = xH1.y - xO.y; numtyp delz1 = xH1.z - xO.z; numtyp delx2 = xH2.x - xO.x; numtyp dely2 = xH2.y - xO.y; numtyp delz2 = xH2.z - xO.z; numtyp ap = alpha * (numtyp)0.5; M.x = xO.x + ap * (delx1 + delx2); M.y = xO.y + ap * (dely1 + dely2); M.z = xO.z + ap * (delz1 + delz2); M.w = q; *xM = M; } __kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, __global int *restrict hneigh, __global numtyp4 *restrict m, const int typeO, const int typeH, const numtyp alpha, const __global numtyp *restrict q_, const __global acctyp4 *restrict ansO) { int i = BLOCK_ID_X*(BLOCK_SIZE_X)+THREAD_ID_X; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; if (i= inum) { non_local_oxy = 1; } } for ( ; nbor1) { #if (SHUFFLE_AVAIL == 0) red_acc[0][tid]=fO.x; red_acc[1][tid]=fO.y; red_acc[2][tid]=fO.z; red_acc[3][tid]=fO.w; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { simdsync(); if (offset < s) { for (int r=0; r<4; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } fO.x=red_acc[0][tid]; fO.y=red_acc[1][tid]; fO.z=red_acc[2][tid]; fO.w=red_acc[3][tid]; if (EVFLAG && vflag) { simdsync(); for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { simdsync(); if (offset < s) { for (int r=0; r<6; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; } #else for (unsigned int s=t_per_atom/2; s>0; s>>=1) { fO.x += shfl_down(fO.x, s, t_per_atom); fO.y += shfl_down(fO.y, s, t_per_atom); fO.z += shfl_down(fO.z, s, t_per_atom); fO.w += shfl_down(fO.w, s, t_per_atom); } if (EVFLAG && vflag) { for (unsigned int s=t_per_atom/2; s>0; s>>=1) { for (int r=0; r<6; r++) vO[r] += shfl_down(vO[r], s, t_per_atom); } } #endif } if(offset == 0 && ii= inum) { non_local_oxy = 1; } } for ( ; nbor1) { #if (SHUFFLE_AVAIL == 0) red_acc[0][tid]=fO.x; red_acc[1][tid]=fO.y; red_acc[2][tid]=fO.z; red_acc[3][tid]=fO.w; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { simdsync(); if (offset < s) { for (int r=0; r<4; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } fO.x=red_acc[0][tid]; fO.y=red_acc[1][tid]; fO.z=red_acc[2][tid]; fO.w=red_acc[3][tid]; if (EVFLAG && vflag) { for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { simdsync(); if (offset < s) { for (int r=0; r<6; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; } #else for (unsigned int s=t_per_atom/2; s>0; s>>=1) { fO.x += shfl_down(fO.x, s, t_per_atom); fO.y += shfl_down(fO.y, s, t_per_atom); fO.z += shfl_down(fO.z, s, t_per_atom); fO.w += shfl_down(fO.w, s, t_per_atom); } if (EVFLAG && vflag) { for (unsigned int s=t_per_atom/2; s>0; s>>=1) { for (int r=0; r<6; r++) vO[r] += shfl_down(vO[r], s, t_per_atom); } } #endif } if(offset == 0) { ansO[i] = fO; if (EVFLAG && vflag) { ansO[inum + i].x = vO[0]; ansO[inum + i].y = vO[1]; ansO[inum + i].z = vO[2]; ansO[inum*2 + i].x = vO[3]; ansO[inum*2 + i].y = vO[4]; ansO[inum*2 + i].z = vO[5]; } } } // if ii store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, vflag,ans,engv); }