// ************************************************************************** // vashishta.cu // ------------------- // Anders Hafreager (UiO) // // Device code for acceleration of the vashishta pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : Mon June 12, 2017 // email : andershaf@gmail.com // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); _texture( param1_tex,float4); _texture( param2_tex,float4); _texture( param3_tex,float4); _texture( param4_tex,float4); _texture( param5_tex,float4); #else _texture_2d( pos_tex,int4); _texture( param1_tex,int4); _texture( param2_tex,int4); _texture( param3_tex,int4); _texture( param4_tex,int4); _texture( param5_tex,int4); #endif #if (__CUDACC_VER_MAJOR__ >= 11) #define param1_tex param1 #define param2_tex param2 #define param3_tex param3 #define param4_tex param4 #define param5_tex param5 #endif #else #define pos_tex x_ #define param1_tex param1 #define param2_tex param2 #define param3_tex param3 #define param4_tex param4 #define param5_tex param5 #endif #define THIRD (numtyp)0.66666666666666666667 //#define THREE_CONCURRENT #if (SHUFFLE_AVAIL == 0) #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ if (EVFLAG && (vflag==2 || eflag==2)) { \ if (eflag) { \ simdsync(); \ simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ } \ if (vflag) { \ simdsync(); \ simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ if (offset==0 && ii1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (vflag==2 || eflag==2) { \ if (eflag) \ simd_reduce_add1(t_per_atom,energy); \ if (vflag) \ simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ if (offset==0 && ii 1; active_subgs /= vwidth) { \ if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ if (bnum < active_subgs) { \ if (eflag) { \ simd_reduce_add1(vwidth, energy); \ if (voffset==0) red_acc[6][bnum] = energy; \ } \ if (vflag) { \ simd_reduce_arr(6, vwidth, virial); \ if (voffset==0) \ for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ } \ } \ \ __syncthreads(); \ if (tid < active_subgs) { \ if (eflag) energy = red_acc[6][tid]; \ if (vflag) \ for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ } else { \ if (eflag) energy = (acctyp)0; \ if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ } \ } \ \ if (bnum == 0) { \ int ei=BLOCK_ID_X; \ if (eflag) { \ simd_reduce_add1(vwidth, energy); \ if (tid==0) { \ engv[ei]+=energy*(acctyp)0.5; \ ei+=ev_stride; \ } \ } \ if (vflag) { \ simd_reduce_arr(6, vwidth, virial); \ if (tid==0) { \ for (int r=0; r<6; r++) { \ engv[ei]+=virial[r]*(acctyp)0.5; \ ei+=ev_stride; \ } \ } \ } \ } \ } else if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && ii param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij=param4_ijparam.w; int nbor_k = nbor_j-offset_j+offset_k; if (nbor_k<=nbor_j) nbor_k += n_stride; for ( ; nbor_k param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij = param4_ijparam.w; int nbor_k; if (gpu_nbor) nbor_k=j+nbor_pitch; else nbor_k=dev_ilist[j]+nbor_pitch; const int numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; for ( ; nbor_k param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij=param4_ijparam.w; int nbor_k; if (gpu_nbor) nbor_k=j+nbor_pitch; else nbor_k=dev_ilist[j]+nbor_pitch; const int numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; for ( ; nbor_k