diff --git a/lib/cuda/pair_sw_cuda.cu b/lib/cuda/pair_sw_cuda.cu new file mode 100644 index 0000000000..f5b0807fcd --- /dev/null +++ b/lib/cuda/pair_sw_cuda.cu @@ -0,0 +1,140 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include + +#include "pair_sw_cuda_cu.h" +__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR]; + +#include "pair_sw_cuda_kernel_nc.cu" + +#include + + +void Cuda_PairSWCuda_Init(cuda_shared_data* sdata,ParamSW_Float* params_host,void* map_host, void* elem2param_host,int nelements_h) +{ + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + X_FLOAT box_size[3] = + { + sdata->domain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + + cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(periodicity) , 
sdata->domain.periodicity , sizeof(int)*3 ); + cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) ); + cudaMemcpyToSymbol("params_sw", params_host , sizeof(ParamSW_Float)*nelements_h*nelements_h*nelements_h ); + cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h ); + cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes ); + cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int)); +} + +void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + static int glob_ij_size=0; + static F_FLOAT4* glob_r_ij=NULL; + static int* glob_numneigh_red=NULL; + static int* glob_neighbors_red=NULL; + static int* glob_neightype_red=NULL; + + if(glob_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT)) + { + glob_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT); + cudaFree(glob_r_ij); + cudaFree(glob_numneigh_red); + cudaFree(glob_neighbors_red); + cudaFree(glob_neightype_red); + cudaMalloc(&glob_r_ij,glob_ij_size*4); + cudaMalloc(&glob_numneigh_red,sdata->atom.nall*sizeof(int)); + cudaMalloc(&glob_neighbors_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int)); + cudaMalloc(&glob_neightype_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int)); + cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) ); + cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) ); + cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) ); + cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) ); + } + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,64); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + + + dim3 grid2; + if(sdata->atom.nall<=256*64000){ + grid2.x = 
(sdata->atom.nall+255)/256; + grid2.y = 1; + } else { + grid2.x = (sdata->atom.nall+256*128-1)/(256*128); + grid2.y = 128; + } + grid2.z = 1; + dim3 threads2; + threads2.x = 256; + threads2.y = 1; + threads2.z = 1; + + timespec time1,time2; + + //pre-calculate all neighbordistances and zeta_ij + clock_gettime(CLOCK_REALTIME,&time1); + Pair_SW_Kernel_TpA_RIJ<<>>(); + cudaThreadSynchronize(); + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.test1+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + clock_gettime(CLOCK_REALTIME,&time1); + + //actual force calculation + unsigned int sharedsize=(sharedperproc*sizeof(ENERGY_FLOAT)+4*sizeof(F_FLOAT))*threads.x; //extra 4 floats per thread used to reduce register pressure + if(eflag) + { + if(vflag) + Pair_SW_Kernel_TpA<1,1><<>> + (eflag_atom,vflag_atom); + else + Pair_SW_Kernel_TpA<1,0><<>> + (eflag_atom,vflag_atom); + } + else + { + if(vflag) + Pair_SW_Kernel_TpA<0,1><<>> + (eflag_atom,vflag_atom); + else + Pair_SW_Kernel_TpA<0,0><<>> + (eflag_atom,vflag_atom); + } + cudaThreadSynchronize(); + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.test2+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + diff --git a/lib/cuda/pair_sw_cuda_cu.h b/lib/cuda/pair_sw_cuda_cu.h new file mode 100644 index 0000000000..24e92689ff --- /dev/null +++ b/lib/cuda/pair_sw_cuda_cu.h @@ -0,0 +1,39 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + + struct ParamSW_Float { + F_FLOAT epsilon,sigma; + F_FLOAT littlea,lambda,gamma,costheta; + F_FLOAT biga,bigb; + F_FLOAT powerp,powerq; + F_FLOAT tol; + F_FLOAT cut,cutsq; + F_FLOAT sigma_gamma,lambda_epsilon,lambda_epsilon2; + F_FLOAT c1,c2,c3,c4,c5,c6; + int ielement,jelement,kelement; + }; + +extern "C" void Cuda_PairSWCuda_Init(cuda_shared_data* sdata,ParamSW_Float* params_host,void* map_host, void* elem2param_host,int nelements_h); +extern "C" void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_sw_cuda_kernel_nc.cu b/lib/cuda/pair_sw_cuda_kernel_nc.cu new file mode 100644 index 0000000000..8072822559 --- /dev/null +++ b/lib/cuda/pair_sw_cuda_kernel_nc.cu @@ -0,0 +1,449 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ +#define Pi F_F(3.1415926535897932384626433832795) +#define PI Pi +#define PI2 F_F(0.5)*Pi +#define PI4 F_F(0.25)*Pi + + + +__device__ void twobody(int iparam, F_FLOAT rsq, F_FLOAT &fforce, + int eflag, ENERGY_FLOAT &eng) +{ + F_FLOAT r,rp,rq,rainv,expsrainv; + + r = sqrt(rsq); + rp = pow(r,-params_sw[iparam].powerp); + rq = pow(r,-params_sw[iparam].powerq); + rainv = 1.0 / (r - params_sw[iparam].cut); + expsrainv = exp(params_sw[iparam].sigma * rainv); + fforce = (params_sw[iparam].c1*rp - params_sw[iparam].c2*rq + + (params_sw[iparam].c3*rp -params_sw[iparam].c4*rq) * rainv*rainv*r) * expsrainv / rsq; + if (eflag) eng += (params_sw[iparam].c5*rp - params_sw[iparam].c6*rq) * expsrainv; +} + +__device__ void threebody(int paramij, int paramik, int paramijk, + F_FLOAT4& delr1, + F_FLOAT4& delr2, + F_FLOAT3& fj, F_FLOAT3& fk, int eflag,ENERGY_FLOAT &eng) +{ + F_FLOAT r1,rinvsq1,rainv1,gsrainv1,gsrainvsq1,expgsrainv1; + F_FLOAT r2,rinvsq2,rainv2,gsrainv2,gsrainvsq2,expgsrainv2; + F_FLOAT rinv12,cs,delcs,delcssq,facexp,facrad,frad1,frad2; + F_FLOAT facang,facang12,csfacang,csfac1,csfac2; + + r1 = sqrt(delr1.w); + rinvsq1 = F_F(1.0)/delr1.w; + rainv1 = F_F(1.0)/(r1 - params_sw[paramij].cut); + gsrainv1 = params_sw[paramij].sigma_gamma * rainv1; + gsrainvsq1 = gsrainv1*rainv1/r1; + expgsrainv1 = exp(gsrainv1); + + r2 = sqrt(delr2.w); + rinvsq2 = F_F(1.0)/delr2.w; + rainv2 = F_F(1.0)/(r2 - params_sw[paramik].cut); + gsrainv2 = 
params_sw[paramik].sigma_gamma * rainv2; + gsrainvsq2 = gsrainv2*rainv2/r2; + expgsrainv2 = exp(gsrainv2); + + rinv12 = F_F(1.0)/(r1*r2); + cs = (delr1.x*delr2.x + delr1.y*delr2.y + delr1.z*delr2.z) * rinv12; + delcs = cs - params_sw[paramijk].costheta; + delcssq = delcs*delcs; + + facexp = expgsrainv1*expgsrainv2; + + // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) * + // facexp*delcssq; + + facrad = params_sw[paramijk].lambda_epsilon * facexp*delcssq; + frad1 = facrad*gsrainvsq1; + frad2 = facrad*gsrainvsq2; + facang = params_sw[paramijk].lambda_epsilon2 * facexp*delcs; + facang12 = rinv12*facang; + csfacang = cs*facang; + csfac1 = rinvsq1*csfacang; + + fj.x = delr1.x*(frad1+csfac1)-delr2.x*facang12; + fj.y = delr1.y*(frad1+csfac1)-delr2.y*facang12; + fj.z = delr1.z*(frad1+csfac1)-delr2.z*facang12; + + csfac2 = rinvsq2*csfacang; + + fk.x = delr2.x*(frad2+csfac2)-delr1.x*facang12; + fk.y = delr2.y*(frad2+csfac2)-delr1.y*facang12; + fk.z = delr2.z*(frad2+csfac2)-delr1.z*facang12; + + if (eflag) eng+= F_F(2.0)*facrad; +} + +__device__ void threebody_fj(int paramij, int paramik, int paramijk, + F_FLOAT4& delr1, + F_FLOAT4& delr2, + F_FLOAT3& fj) +{ + F_FLOAT r1,rinvsq1,rainv1,gsrainv1,gsrainvsq1,expgsrainv1; + F_FLOAT r2,rainv2,gsrainv2,expgsrainv2; + F_FLOAT rinv12,cs,delcs,delcssq,facexp,facrad,frad1; + F_FLOAT facang,facang12,csfacang,csfac1; + + r1 = sqrt(delr1.w); + rinvsq1 = F_F(1.0)/delr1.w; + rainv1 = F_F(1.0)/(r1 - params_sw[paramij].cut); + gsrainv1 = params_sw[paramij].sigma_gamma * rainv1; + gsrainvsq1 = gsrainv1*rainv1/r1; + expgsrainv1 = exp(gsrainv1); + + r2 = sqrt(delr2.w); + rainv2 = F_F(1.0)/(r2 - params_sw[paramik].cut); + gsrainv2 = params_sw[paramik].sigma_gamma * rainv2; + expgsrainv2 = exp(gsrainv2); + + rinv12 = F_F(1.0)/(r1*r2); + cs = (delr1.x*delr2.x + delr1.y*delr2.y + delr1.z*delr2.z) * rinv12; + delcs = cs - params_sw[paramijk].costheta; + delcssq = delcs*delcs; + + facexp = expgsrainv1*expgsrainv2; + + // facrad = 
sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) * + // facexp*delcssq; + + facrad = params_sw[paramijk].lambda_epsilon * facexp*delcssq; + frad1 = facrad*gsrainvsq1; + facang = params_sw[paramijk].lambda_epsilon2 * facexp*delcs; + facang12 = rinv12*facang; + csfacang = cs*facang; + csfac1 = rinvsq1*csfacang; + + fj.x = delr1.x*(frad1+csfac1)-delr2.x*facang12; + fj.y = delr1.y*(frad1+csfac1)-delr2.y*facang12; + fj.z = delr1.z*(frad1+csfac1)-delr2.z*facang12; +} + + +__global__ void Pair_SW_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +{ + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if( ii >= _nall ) return; + + X_FLOAT4 myxtype; + F_FLOAT4 delij; + F_FLOAT xtmp,ytmp,ztmp; + int itype,jnum,i,j; + int* jlist; + int neigh_red = 0; + i = ii;//_ilist[ii]; + myxtype = fetchXType(i); + + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=map[(static_cast (myxtype.w))]; + + jnum = _numneigh[i]; + jlist = &_neighbors[i]; + + __syncthreads(); + for (int jj = 0; jj < jnum; jj++) + { + if(jj (myxtype.w))]; + int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; + delij.w = vec3_dot(delij,delij); + if (delij.w < params_sw[iparam_ij].cutsq) + { + _glob_neighbors_red[i+neigh_red*_nall]=j; + _glob_neightype_red[i+neigh_red*_nall]=jtype; + _glob_r_ij[i+neigh_red*_nall]=delij; + neigh_red++; + } + } + } + _glob_numneigh_red[i]=neigh_red; +} + + + template + __global__ void Pair_SW_Kernel_TpA(int eflag_atom,int vflag_atom)//,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x]; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem; + if((eflag||eflag_atom)&&(vflagm||vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7*blockDim.x]; + else + if(eflag) shared_F_F = 
(F_FLOAT*) &sharedmem[blockDim.x]; + else + if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6*blockDim.x]; + shared_F_F+=threadIdx.x; + + if(eflag_atom||eflag) + { + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + + if(vflagm||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int jnum_red=0; +#define fxtmp shared_F_F[0] +#define fytmp shared_F_F[blockDim.x] +#define fztmp shared_F_F[2*blockDim.x] +//#define jnum_red (static_cast (shared_F_F[3*blockDim.x])) + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + X_FLOAT4 myxtype_i,myxtype_j,myxtype_k; + F_FLOAT4 delij,delik,deljk; + F_FLOAT fpair; + + int itype,i,j; + int* jlist_red; + + if(ii < _inum) + { + i = _ilist[ii]; + + if(vflagm) + myxtype_i=fetchXType(i); + //itype=map[(static_cast (myxtype_i.w))]; + itype=map[_type[i]]; + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + + //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i]; + jnum_red = _glob_numneigh_red[i]; + jlist_red = &_glob_neighbors_red[i]; + } + __syncthreads(); +#pragma unroll 1 + for (int jj = 0; jj < jnum_red; jj++) + { + if(i < _nlocal) + { + fpair=F_F(0.0); + j = jlist_red[jj*_nall]; + j &= NEIGHMASK; + + if(vflagm) + myxtype_j = fetchXType(j); + + int jtype = _glob_neightype_red[i+jj*_nall]; + delij = _glob_r_ij[i+jj*_nall]; + + volatile int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; + volatile int iparam_ji = elem2param[(jtype*nelements+itype)*nelements+itype]; + + if (delij.w(); +#undef fxtmp +#undef fytmp +#undef fztmp +//#undef jnum_red +} diff --git a/lib/cuda/pair_tersoff_cuda.cu b/lib/cuda/pair_tersoff_cuda.cu new file mode 100644 index 0000000000..abbf39ecf0 --- /dev/null +++ b/lib/cuda/pair_tersoff_cuda.cu @@ -0,0 +1,155 @@ +/* 
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include + + +#include "pair_tersoff_cuda_cu.h" +__device__ __constant__ Param_Float params[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR]; +__device__ __constant__ F_FLOAT* _glob_zeta_ij; //zeta_ij +__device__ __constant__ F_FLOAT4* _glob_r_ij; //r_ij (x,y,z,r^2) for pairs within force cutoff +__device__ __constant__ bool _zbl; //is tersoff zbl? 
+ + +#include "pair_tersoff_cuda_kernel_nc.cu" + +#include + + +void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata,Param_Float* params_host,void* map_host, void* elem2param_host,int nelements_h,bool zbl) +{ + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + X_FLOAT box_size[3] = + { + sdata->domain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + + cudaMemcpyToSymbol(MY_CONST(box_size) , box_size , sizeof(X_FLOAT)*3); + cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) ,&cuda_ntypes , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(virial) ,&sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(eng_vdwl) ,&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*) ); + cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , sizeof(int)*3 ); + cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int) ); + cudaMemcpyToSymbol("params", params_host , sizeof(Param_Float)*nelements_h*nelements_h*nelements_h ); + cudaMemcpyToSymbol("elem2param",elem2param_host , sizeof(int)*nelements_h*nelements_h*nelements_h ); + cudaMemcpyToSymbol("map",map_host , sizeof(int)*cuda_ntypes ); + cudaMemcpyToSymbol("nelements",&nelements_h, sizeof(int)); + cudaMemcpyToSymbol("_zbl",&zbl,sizeof(bool)); + +} + +void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom) +{ + static F_FLOAT* glob_zeta_ij=NULL; + static int glob_zeta_ij_size=0; + static F_FLOAT4* glob_r_ij=NULL; + static int* glob_numneigh_red=NULL; + static int* glob_neighbors_red=NULL; + static int* glob_neightype_red=NULL; + + if(glob_zeta_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT)) + { + glob_zeta_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT); + cudaFree(glob_zeta_ij); + cudaFree(glob_r_ij); + cudaFree(glob_numneigh_red); + 
cudaFree(glob_neighbors_red); + cudaFree(glob_neightype_red); + cudaMalloc(&glob_zeta_ij,glob_zeta_ij_size); + cudaMalloc(&glob_r_ij,glob_zeta_ij_size*4); + cudaMalloc(&glob_numneigh_red,sdata->atom.nall*sizeof(int)); + cudaMalloc(&glob_neighbors_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int)); + cudaMalloc(&glob_neightype_red,sdata->atom.nall*sneighlist->maxneighbors*sizeof(int)); + cudaMemcpyToSymbol("_glob_numneigh_red", &glob_numneigh_red , sizeof(int*) ); + cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red , sizeof(int*) ); + cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red , sizeof(int*) ); + cudaMemcpyToSymbol("_glob_r_ij", &glob_r_ij , sizeof(F_FLOAT4*) ); + cudaMemcpyToSymbol("_glob_zeta_ij", &glob_zeta_ij , sizeof(F_FLOAT*) ); + } + dim3 grid,threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,false,64); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + + + dim3 grid2; + if(sdata->atom.nall<=256*64000){ + grid2.x = (sdata->atom.nall+255)/256; + grid2.y = 1; + } else { + grid2.x = (sdata->atom.nall+256*128-1)/(256*128); + grid2.y = 128; + } + grid2.z = 1; + dim3 threads2; + threads2.x = 256; + threads2.y = 1; + threads2.z = 1; + + timespec time1,time2; + + //pre-calculate all neighbordistances and zeta_ij + clock_gettime(CLOCK_REALTIME,&time1); + Pair_Tersoff_Kernel_TpA_RIJ<<>> + (); + cudaThreadSynchronize(); + Pair_Tersoff_Kernel_TpA_ZetaIJ<<>> + (); + cudaThreadSynchronize(); + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.test1+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + clock_gettime(CLOCK_REALTIME,&time1); + + //actual force calculation + unsigned int sharedsize=(sharedperproc*sizeof(ENERGY_FLOAT)+4*sizeof(F_FLOAT))*threads.x; //extra 4 floats per thread used to reduce register pressure + if(eflag) + { + if(vflag) + Pair_Tersoff_Kernel_TpA<1,1><<>> + 
(eflag_atom,vflag_atom); + else + Pair_Tersoff_Kernel_TpA<1,0><<>> + (eflag_atom,vflag_atom); + } + else + { + if(vflag) + Pair_Tersoff_Kernel_TpA<0,1><<>> + (eflag_atom,vflag_atom); + else + Pair_Tersoff_Kernel_TpA<0,0><<>> + (eflag_atom,vflag_atom); + } + cudaThreadSynchronize(); + clock_gettime(CLOCK_REALTIME,&time2); + sdata->cuda_timings.test2+= + time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + diff --git a/lib/cuda/pair_tersoff_cuda_cu.h b/lib/cuda/pair_tersoff_cuda_cu.h new file mode 100644 index 0000000000..5276cd1c35 --- /dev/null +++ b/lib/cuda/pair_tersoff_cuda_cu.h @@ -0,0 +1,42 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + + struct Param_Float { + F_FLOAT lam1,lam2,lam3; + F_FLOAT c,d,h; + F_FLOAT gamma,powerm; + F_FLOAT powern,beta; + F_FLOAT biga,bigb,bigd,bigr; + F_FLOAT cut,cutsq; + F_FLOAT c1,c2,c3,c4; + int ielement,jelement,kelement; + int powermint; + //F_FLOAT Z_i,Z_j; + F_FLOAT ZBLcut,ZBLexpscale; + F_FLOAT a_ij,premult; + }; + +extern "C" void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata,Param_Float* params_host,void* map_host, void* elem2param_host,int nelements_h,bool zbl); +extern "C" void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom); diff --git a/lib/cuda/pair_tersoff_cuda_kernel_nc.cu b/lib/cuda/pair_tersoff_cuda_kernel_nc.cu new file mode 100644 index 0000000000..f94f35a587 --- /dev/null +++ b/lib/cuda/pair_tersoff_cuda_kernel_nc.cu @@ -0,0 +1,1054 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +#define Pi F_F(3.1415926535897932384626433832795) +#define PI Pi +#define PI2 F_F(0.5)*Pi +#define PI4 F_F(0.25)*Pi +template +static inline __device__ void PairVirialCompute_A_Kernel_Template() +{ + __syncthreads(); + ENERGY_FLOAT* shared=sharedmem; + + if(eflag) + { + reduceBlock(shared); + shared+=blockDim.x; + } + if(vflag) + { + reduceBlock(shared + 0 * blockDim.x); + reduceBlock(shared + 1 * blockDim.x); + reduceBlock(shared + 2 * blockDim.x); + reduceBlock(shared + 3 * blockDim.x); + reduceBlock(shared + 4 * blockDim.x); + reduceBlock(shared + 5 * blockDim.x); + } + if(threadIdx.x == 0) + { + shared=sharedmem; + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0]; + shared+=blockDim.x; buffer+=gridDim.x * gridDim.y; + } + if(vflag) + { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[0 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[1 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[2 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[3 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[4 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[5 * blockDim.x]; + } + } + __syncthreads(); +} + +__global__ void virial_fdotr_compute_kernel(int eflag) +{ + int i = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + ENERGY_FLOAT* sharedE = (ENERGY_FLOAT*) &sharedmem[0]; + ENERGY_FLOAT* sharedVirial = (ENERGY_FLOAT*) &sharedE[blockDim.x]; + sharedE+=threadIdx.x; + sharedVirial+=threadIdx.x; + if(i<_nlocal) + { + + F_FLOAT x = _x[i]; + F_FLOAT y = 
_x[i+_nmax]; + F_FLOAT z = _x[i+2*_nmax]; + F_FLOAT fx = _f[i]; + F_FLOAT fy = _f[i+_nmax]; + F_FLOAT fz = _f[i+2*_nmax]; + //if(fz*z*fz*z>1e-5) printf("V %i %i %e %e %e %e %e %e\n",i,_tag[i],x,y,z,fx,fy,fz); + sharedVirial[0] = fx*x; + sharedVirial[1*blockDim.x] = fy*y; + sharedVirial[2*blockDim.x] = fz*z; + sharedVirial[3*blockDim.x] = fy*x; + sharedVirial[4*blockDim.x] = fz*x; + sharedVirial[5*blockDim.x] = fz*y; + } else { + sharedVirial[0] = 0; + sharedVirial[1*blockDim.x] = 0; + sharedVirial[2*blockDim.x] = 0; + sharedVirial[3*blockDim.x] = 0; + sharedVirial[4*blockDim.x] = 0; + sharedVirial[5*blockDim.x] = 0; + } + sharedVirial = (ENERGY_FLOAT*) &sharedmem[0]; + sharedVirial += blockDim.x; + reduceBlockP2(sharedVirial); + reduceBlockP2(&sharedVirial[1*blockDim.x]); + reduceBlockP2(&sharedVirial[2*blockDim.x]); + reduceBlockP2(&sharedVirial[3*blockDim.x]); + reduceBlockP2(&sharedVirial[4*blockDim.x]); + reduceBlockP2(&sharedVirial[5*blockDim.x]); + if(threadIdx.x<6) + { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + if(eflag) buffer = &buffer[gridDim.x*gridDim.y]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + threadIdx.x * gridDim.x * gridDim.y]= sharedVirial[threadIdx.x*blockDim.x]; + } +} + +/*#define vec3_scale(K,X,Y) Y.x = K*X.x; Y.y = K*X.y; Y.z = K*X.z; +#define vec3_scaleadd(K,X,Y,Z) Z.x = K*X.x+Y.x; Z.y = K*X.y+Y.y; Z.z = K*X.z+Y.z; +#define vec3_add(X,Y,Z) Z.x = X.x+Y.x; Z.y = X.y+Y.y; Z.z = X.z+Y.z; +#define vec3_dot(X,Y) (X.x*Y.x + X.y*Y.y + X.z*Y.z)*/ + +__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT3& x, F_FLOAT3& y) { + y.x = k*x.x; y.y = k*x.y; y.z = k*x.z; +} + +__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4& x, F_FLOAT3& y) { + y.x = k*x.x; y.y = k*x.y; y.z = k*x.z; +} + +__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4& x, F_FLOAT4& y) { + y.x = k*x.x; y.y = k*x.y; y.z = k*x.z; +} + +__device__ inline void vec3_scaleadd(F_FLOAT k, F_FLOAT3& x, F_FLOAT3& y, F_FLOAT3& z) { + z.x = k*x.x+y.x; z.y = k*x.y+y.y; z.z = 
k*x.z+y.z; +} + +__device__ inline void vec3_add(F_FLOAT3& x, F_FLOAT3& y, F_FLOAT3& z) { + z.x = x.x+y.x; z.y = x.y+y.y; z.z = x.z+y.z; +} + +__device__ inline F_FLOAT vec3_dot(F_FLOAT3 x, F_FLOAT3 y) { + return x.x*y.x + x.y*y.y + x.z*y.z; +} + +__device__ inline F_FLOAT vec3_dot(F_FLOAT4 x, F_FLOAT4 y) { + return x.x*y.x + x.y*y.y + x.z*y.z; +} + +/* ---------------------------------------------------------------------- + Fermi-like smoothing function +------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT F_fermi(F_FLOAT &r, int &iparam) +{ + return F_F(1.0) / (F_F(1.0) + exp(-params[iparam].ZBLexpscale*(r-params[iparam].ZBLcut))); +} + +/* ---------------------------------------------------------------------- + Fermi-like smoothing function derivative with respect to r +------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT F_fermi_d(F_FLOAT &r, int &iparam) +{ + volatile const F_FLOAT tmp = exp(-params[iparam].ZBLexpscale*(r-params[iparam].ZBLcut)); + return params[iparam].ZBLexpscale*tmp / + ((F_F(1.0) +tmp)*(F_F(1.0) +tmp)); +} + +__device__ inline F_FLOAT ters_fc(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) +{ + return (r < ters_R-ters_D)?F_F(1.0):((r > ters_R+ters_D)? + F_F(0.0):F_F(0.5)*(F_F(1.0) - sin(PI2*(r - ters_R)/ters_D))); +} + +__device__ inline F_FLOAT ters_fc_d(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) +{ + return ((r < ters_R-ters_D)||(r > ters_R+ters_D))? 
// ---------------------------------------------------------------------------
// NOTE(review): despite the diff header naming this file pair_sw_cuda.cu, the
// content below is the Tersoff-potential device code of the LAMMPS USER-CUDA
// package (ters_* helper functions plus the Pair_Tersoff_Kernel_* kernels).
// The chunk is a whitespace-mangled diff hunk: many physical source lines are
// fused onto single lines and still carry their leading '+' diff markers, and
// several spans that contained '<'...'>' text (template arguments, '<'
// comparisons, array reads) were eaten by the extraction. The annotations
// below are review comments only; every code token is left byte-for-byte
// intact. Garbled spans are flagged so they can be restored from the upstream
// pair_tersoff_cuda_kernel_nc.cu rather than guessed at.
// ---------------------------------------------------------------------------
// Next fused line: tail of ters_fc_d() (its start lies above this chunk);
// then the angular-term helpers ters_gijk() and ters_gijk2() — their bodies
// are numerically identical (one squares c/d via the locals, the other via
// params[iparam] directly; only the 'inline' qualifier differs), a candidate
// for de-duplication — then the derivative ters_gijk_d(), and the start of
// zeta(); zeta's powermint==3 ternary continues on the following fused line.
// NOTE(review): pow(x, F_F(2.0)) is used for plain squaring here, which is
// typically slower on the device than x*x — confirm against profiling before
// changing.
 + F_F(0.0):-(PI4/ters_D) * cos(PI2*(r - ters_R)/ters_D); +} + + +__device__ inline F_FLOAT ters_gijk(F_FLOAT& cos_theta, int iparam) +{ + F_FLOAT ters_c = params[iparam].c; + F_FLOAT ters_d = params[iparam].d; + + return params[iparam].gamma*(F_F(1.0) + pow(params[iparam].c/params[iparam].d,F_F(2.0)) - + pow(ters_c,F_F(2.0)) / (pow(ters_d,F_F(2.0)) + pow(params[iparam].h - cos_theta,F_F(2.0)))); +} + +__device__ F_FLOAT ters_gijk2(F_FLOAT& cos_theta, int iparam) +{ + F_FLOAT ters_c = params[iparam].c; + F_FLOAT ters_d = params[iparam].d; + + return params[iparam].gamma*(F_F(1.0) + pow(ters_c/ters_d,F_F(2.0)) - + pow(ters_c,F_F(2.0)) / (pow(ters_d,F_F(2.0)) + pow(params[iparam].h - cos_theta,F_F(2.0)))); +} + +__device__ inline F_FLOAT ters_gijk_d(F_FLOAT costheta, int iparam) +{ + F_FLOAT numerator = -F_F(2.0) * pow(params[iparam].c,F_F(2.0)) * (params[iparam].h - costheta); + F_FLOAT denominator = pow(pow(params[iparam].d,F_F(2.0)) + + pow(params[iparam].h - costheta,F_F(2.0)),F_F(2.0)); + return params[iparam].gamma*numerator/denominator; +} + +__device__ inline F_FLOAT zeta(int iparam, const F_FLOAT rsqij, const F_FLOAT rsqik, + F_FLOAT3& delij, F_FLOAT3& delik) +{ + F_FLOAT rij,rik,costheta,arg,ex_delr; + + rij = sqrt(rsqij); + rik = sqrt(rsqik); + costheta = vec3_dot(delij,delik) / (rij*rik); + + arg = (params[iparam].powermint == 3)?
// Next fused line: zeta() continues — exp(lam3^m (rij-rik)^m) with the
// overflow clamp at +/-69.0776 (returns 1e30 / 0 outside it), multiplied by
// fc(rik) and an inlined g(costheta). Then repulsive() begins: fc, fc',
// exp(-lam1*r); the plain-Tersoff branch when !_zbl, otherwise the
// ZBL-corrected branch (screening function phi and its derivative dphi from
// the four exponential terms); the combined fforce expression continues on
// the fused line after this one.
 + (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)) : params[iparam].lam3 * (rij-rik); + + if (arg > F_F(69.0776)) ex_delr = F_F(1.e30); + else if (arg < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(arg); + + return ters_fc(rik,params[iparam].bigr,params[iparam].bigd) * ex_delr * params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c/(params[iparam].d*params[iparam].d)) - + (params[iparam].c*params[iparam].c) / ((params[iparam].d*params[iparam].d) + (params[iparam].h - costheta)*(params[iparam].h - costheta))); +} + +__device__ void repulsive(int iparam, F_FLOAT rsq, F_FLOAT &fforce, + int eflag, ENERGY_FLOAT &eng) +{ + F_FLOAT r,tmp_fc,tmp_fc_d,tmp_exp; + + F_FLOAT ters_R = params[iparam].bigr; + F_FLOAT ters_D = params[iparam].bigd; + r = sqrt(rsq); + tmp_fc = ters_fc(r,ters_R,ters_D); + tmp_fc_d = ters_fc_d(r,ters_R,ters_D); + tmp_exp = exp(-params[iparam].lam1 * r); + if(!_zbl) + { + fforce = -params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc*params[iparam].lam1) / r; + if (eflag) eng += tmp_fc * params[iparam].biga * tmp_exp; + } + else + { + F_FLOAT const fforce_ters = params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc*params[iparam].lam1); + ENERGY_FLOAT eng_ters = tmp_fc * params[iparam].biga * tmp_exp; + + F_FLOAT r_ov_a = r/params[iparam].a_ij; + F_FLOAT phi = F_F(0.1818)*exp(-F_F(3.2)*r_ov_a) + F_F(0.5099)*exp(-F_F(0.9423)*r_ov_a) + + F_F(0.2802)*exp(-F_F(0.4029)*r_ov_a) + F_F(0.02817)*exp(-F_F(0.2016)*r_ov_a); + F_FLOAT dphi = (F_F(1.0)/params[iparam].a_ij) * (-F_F(3.2)*F_F(0.1818)*exp(-F_F(3.2)*r_ov_a) - + F_F(0.9423)*F_F(0.5099)*exp(-F_F(0.9423)*r_ov_a) - + F_F(0.4029)*F_F(0.2802)*exp(-F_F(0.4029)*r_ov_a) - + F_F(0.2016)*F_F(0.02817)*exp(-F_F(0.2016)*r_ov_a)); + F_FLOAT fforce_ZBL = params[iparam].premult/(-r*r)* phi + params[iparam].premult/r*dphi; + ENERGY_FLOAT eng_ZBL = params[iparam].premult*(F_F(1.0)/r)*phi; + + fforce = -(-F_fermi_d(r,iparam) * (eng_ZBL - eng_ters) +
// Next fused line: repulsive()'s ZBL force/energy blend finishes (Fermi-type
// switching via F_fermi / F_fermi_d). Then the attractive-pair pieces:
// ters_fa() = -bigb*exp(-lam2*r)*fc (optionally * the Fermi switch when
// _zbl), its derivative ters_fa_d(), and ters_bij() — b(zeta) with the
// c1..c4 piecewise large/small-argument approximations; ters_bij_d() begins
// and its 'tmp' initializer continues on the following fused line.
fforce_ZBL + F_fermi(r,iparam)*(fforce_ters-fforce_ZBL))/r; + if(eflag) + eng += eng_ZBL + F_fermi(r,iparam)*(eng_ters-eng_ZBL); + } + + +} + +/* ---------------------------------------------------------------------- */ + +__device__ inline F_FLOAT ters_fa(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) +{ + if (r > ters_R + ters_D) return F_F(0.0); + if(_zbl) + return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r,ters_R,ters_D) * F_fermi(r,iparam); + else + return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r,ters_R,ters_D); +} + +/* ---------------------------------------------------------------------- */ + +__device__ inline F_FLOAT ters_fa_d(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) +{ + if (r > ters_R + ters_D) return F_F(0.0); + if(_zbl) + return params[iparam].bigb * exp(-params[iparam].lam2 * r) * + ((params[iparam].lam2 * ters_fc(r,ters_R,ters_D) - ters_fc_d(r,ters_R,ters_D))*F_fermi(r,iparam) + -ters_fc(r,ters_R,ters_D)*F_fermi_d(r,iparam)); + else + return params[iparam].bigb * exp(-params[iparam].lam2 * r) * + (params[iparam].lam2 * ters_fc(r,ters_R,ters_D) - ters_fc_d(r,ters_R,ters_D)); +} + +/* ---------------------------------------------------------------------- */ + +__device__ inline F_FLOAT ters_bij(F_FLOAT zeta, int iparam) +{ + F_FLOAT tmp = params[iparam].beta * zeta; + if (tmp > params[iparam].c1) return F_F(1.0)/sqrt(tmp); + if (tmp > params[iparam].c2) + return (F_F(1.0) - pow(tmp,-params[iparam].powern) / (F_F(2.0)*params[iparam].powern))/sqrt(tmp); + if (tmp < params[iparam].c4) return F_F(1.0); + if (tmp < params[iparam].c3) + return F_F(1.0) - pow(tmp,params[iparam].powern)/(F_F(2.0)*params[iparam].powern); + return pow(F_F(1.0) + pow(tmp,params[iparam].powern), -F_F(1.0)/(F_F(2.0)*params[iparam].powern)); +} + +/* ---------------------------------------------------------------------- */ + +__device__ inline F_FLOAT ters_bij_d(F_FLOAT zeta, int iparam) +{ + F_FLOAT tmp =
// Next fused line: ters_bij_d() finishes. NOTE(review): the fall-through
// branch divides by 'zeta'; it appears unreachable for zeta == 0 because
// tmp < c4 returns first for small beta*zeta (assuming beta > 0) — confirm
// against the CPU pair_tersoff implementation. Then force_zeta() and its two
// prefactor-only variants, which duplicate its body minus the energy and/or
// force outputs (consolidation candidate), followed by the start of
// costheta_d() whose parameter list continues on the next fused line.
params[iparam].beta * zeta; + if (tmp > params[iparam].c1) return params[iparam].beta * -F_F(0.5)*pow(tmp,-F_F(1.5)); + if (tmp > params[iparam].c2) + return params[iparam].beta * (-F_F(0.5)*pow(tmp,-F_F(1.5)) * + (F_F(1.0) - F_F(0.5)*(F_F(1.0) + F_F(1.0)/(F_F(2.0)*params[iparam].powern)) * + pow(tmp,-params[iparam].powern))); + if (tmp < params[iparam].c4) return F_F(0.0); + if (tmp < params[iparam].c3) + return -F_F(0.5)*params[iparam].beta * pow(tmp,params[iparam].powern-F_F(1.0)); + + F_FLOAT tmp_n = pow(tmp,params[iparam].powern); + return -F_F(0.5) * pow(F_F(1.0)+tmp_n, -F_F(1.0)-(F_F(1.0)/(F_F(2.0)*params[iparam].powern)))*tmp_n / zeta; +} + +__device__ void force_zeta(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, + F_FLOAT &fforce, F_FLOAT &prefactor, + int eflag, F_FLOAT &eng) +{ + F_FLOAT r,fa,fa_d,bij; + F_FLOAT ters_R = params[iparam].bigr; + F_FLOAT ters_D = params[iparam].bigd; + r = sqrt(rsq); + fa = ters_fa(r,iparam,ters_R,ters_D); + fa_d = ters_fa_d(r,iparam,ters_R,ters_D); + bij = ters_bij(zeta_ij,iparam); + fforce = F_F(0.5)*bij*fa_d / r; + prefactor = -F_F(0.5)*fa * ters_bij_d(zeta_ij,iparam); + if (eflag) eng += bij*fa; +} + +__device__ void force_zeta_prefactor_force(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, + F_FLOAT &fforce, F_FLOAT &prefactor) +{ + F_FLOAT r,fa,fa_d,bij; + F_FLOAT ters_R = params[iparam].bigr; + F_FLOAT ters_D = params[iparam].bigd; + r = sqrt(rsq); + fa = ters_fa(r,iparam,ters_R,ters_D); + fa_d = ters_fa_d(r,iparam,ters_R,ters_D); + bij = ters_bij(zeta_ij,iparam); + fforce = F_F(0.5)*bij*fa_d / r; + prefactor = -F_F(0.5)*fa * ters_bij_d(zeta_ij,iparam); +} + +__device__ void force_zeta_prefactor(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, + F_FLOAT &prefactor) +{ + F_FLOAT r,fa; + r = sqrt(rsq); + fa = ters_fa(r,iparam,params[iparam].bigr,params[iparam].bigd); + prefactor = -F_F(0.5)*fa*ters_bij_d(zeta_ij,iparam); +} + + +__device__ void costheta_d(F_FLOAT3& rij_hat, F_FLOAT& rij, + F_FLOAT3& rik_hat, F_FLOAT& rik, + F_FLOAT3&
// Next fused line: costheta_d() — derivatives of cos(theta) wrt Ri, Rj, Rk.
// Then ters_zetaterm_d() begins: ex_delr and its derivative (same clamp as
// zeta()), cos_theta, and inlined g(theta)/g'(theta). NOTE(review): on this
// fused physical line a '//' appears mid-line ("// compute the derivative
// wrt Ri"), so everything after it on the same physical line is currently
// commented out — an artifact of the line fusion, not of the original code.
dri, F_FLOAT3& drj, F_FLOAT3& drk) +{ + // first element is derivative wrt Ri, second wrt Rj, third wrt Rk + + F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); + + vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj); + vec3_scale(F_F(1.0)/rij,drj,drj); + vec3_scaleadd(-cos_theta,rik_hat,rij_hat,drk); + vec3_scale(F_F(1.0)/rik,drk,drk); + vec3_add(drj,drk,dri); + vec3_scale(-F_F(1.0),dri,dri); +} + +__device__ void ters_zetaterm_d(F_FLOAT prefactor, + F_FLOAT3& rij_hat, F_FLOAT rij, + F_FLOAT3& rik_hat, F_FLOAT rik, + F_FLOAT3& dri, F_FLOAT3& drj, F_FLOAT3& drk, + int iparam) +{ + F_FLOAT ex_delr,ex_delr_d,tmp; + F_FLOAT3 dcosdri,dcosdrj,dcosdrk; + + if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); + else tmp = params[iparam].lam3 * (rij-rik); + + if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); + else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(tmp); + + if (params[iparam].powermint == 3) + ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; + else ex_delr_d = params[iparam].lam3 * ex_delr; + + + const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); + costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + + const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - + (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); + const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); + const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); + const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri + // dri = -dfc*gijk*ex_delr*rik_hat; + // dri += fc*gijk_d*ex_delr*dcosdri;
// Next fused line: ters_zetaterm_d() finishes — dri/drj/drk assembled from
// fc, dfc, gijk, gijk_d and ex_delr(_d), each scaled by prefactor. Then
// ters_zetaterm_d_fi() (force-on-i-only variant) begins with the same
// ex_delr preamble that is copy-pasted across all four zetaterm variants
// (consolidation candidate). NOTE(review): _fi reuses 'dri' as scratch for
// the dcos terms before overwriting it with the final result.
 + // dri += fc*gijk*ex_delr_d*(rik_hat - rij_hat); + const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); + const F_FLOAT dfc = ters_fc_d(rik,params[iparam].bigr,params[iparam].bigd); + + + vec3_scale(-dfc*gijk*ex_delr,rik_hat,dri); + vec3_scaleadd(fc*gijk_d*ex_delr,dcosdri,dri,dri); + vec3_scaleadd(fc*gijk*ex_delr_d,rik_hat,dri,dri); + vec3_scaleadd(-fc*gijk*ex_delr_d,rij_hat,dri,dri); + vec3_scale(prefactor,dri,dri); + // compute the derivative wrt Rj + // drj = fc*gijk_d*ex_delr*dcosdrj; + // drj += fc*gijk*ex_delr_d*rij_hat; + + vec3_scale(fc*gijk_d*ex_delr,dcosdrj,drj); + vec3_scaleadd(fc*gijk*ex_delr_d,rij_hat,drj,drj); + vec3_scale(prefactor,drj,drj); + + // compute the derivative wrt Rk + // drk = dfc*gijk*ex_delr*rik_hat; + // drk += fc*gijk_d*ex_delr*dcosdrk; + // drk += -fc*gijk*ex_delr_d*rik_hat; + + vec3_scale(dfc*gijk*ex_delr,rik_hat,drk); + vec3_scaleadd(fc*gijk_d*ex_delr,dcosdrk,drk,drk); + vec3_scaleadd(-fc*gijk*ex_delr_d,rik_hat,drk,drk); + vec3_scale(prefactor,drk,drk); +} + +__device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor, + F_FLOAT3& rij_hat, F_FLOAT &rij, + F_FLOAT3& rik_hat, F_FLOAT &rik, + F_FLOAT3& dri, int &iparam) +{ + F_FLOAT ex_delr,ex_delr_d,tmp; + + if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); + else tmp = params[iparam].lam3 * (rij-rik); + + if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); + else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(tmp); + + if (params[iparam].powermint == 3) + ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; + else ex_delr_d = params[iparam].lam3 * ex_delr; + + const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); + //costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + + + F_FLOAT3 dcosdri; + vec3_scaleadd(-cos_theta,rij_hat,rik_hat,dri); + vec3_scale(F_F(1.0)/rij,dri,dri); +
// Next fused line: ters_zetaterm_d_fi() finishes (dcosdri assembled, then
// dri = prefactor * [dfc/gijk/ex_delr combination]); ters_zetaterm_d_fj()
// begins with the same preamble; its cos_theta projection continues on the
// next fused line.
 vec3_scaleadd(-cos_theta,rik_hat,rij_hat,dcosdri); + vec3_scale(F_F(1.0)/rik,dcosdri,dcosdri); + vec3_add(dri,dcosdri,dcosdri); + vec3_scale(-F_F(1.0),dcosdri,dcosdri); + + const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - + (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); + const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); + const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); + const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri +// + const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); + const F_FLOAT dfc = ters_fc_d(rik,params[iparam].bigr,params[iparam].bigd); + + vec3_scale(-dfc*gijk*ex_delr,rik_hat,dri); + vec3_scaleadd(fc*gijk_d*ex_delr,dcosdri,dri,dri); + vec3_scaleadd(fc*gijk*ex_delr_d,rik_hat,dri,dri); + vec3_scaleadd(-fc*gijk*ex_delr_d,rij_hat,dri,dri); + vec3_scale(prefactor,dri,dri); + +} + +__device__ void ters_zetaterm_d_fj(F_FLOAT &prefactor, + F_FLOAT3& rij_hat, F_FLOAT &rij, + F_FLOAT3& rik_hat, F_FLOAT &rik, + F_FLOAT3& drj, int &iparam) +{ + F_FLOAT ex_delr,ex_delr_d,tmp; + + if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); + else tmp = params[iparam].lam3 * (rij-rik); + + if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); + else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(tmp); + + if (params[iparam].powermint == 3) + ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; + else ex_delr_d = params[iparam].lam3 * ex_delr; + + const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); +
// Next fused line: _fj finishes — only dcosdrj is needed so it is built
// directly in 'drj' (note: no dfc term, matching the drj algebra in
// ters_zetaterm_d above). Then ters_zetaterm_d_fk() begins, same preamble;
// its gijk expression continues on the next fused line.
 vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj); + vec3_scale(F_F(1.0)/rij,drj,drj); + + const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - + (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); + const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); + const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); + const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri + + const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); + + vec3_scale(fc*gijk_d*ex_delr,drj,drj); + vec3_scaleadd(fc*gijk*ex_delr_d,rij_hat,drj,drj); + vec3_scale(prefactor,drj,drj); +} + +__device__ void ters_zetaterm_d_fk(F_FLOAT &prefactor, + F_FLOAT3& rij_hat, F_FLOAT &rij, + F_FLOAT3& rik_hat, F_FLOAT &rik, + F_FLOAT3& drk, int &iparam) +{ + F_FLOAT ex_delr,ex_delr_d,tmp; + + if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); + else tmp = params[iparam].lam3 * (rij-rik); + + if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); + else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(tmp); + + if (params[iparam].powermint == 3) + ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; + else ex_delr_d = params[iparam].lam3 * ex_delr; + + const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); + vec3_scaleadd(-cos_theta,rik_hat,rij_hat,drk); + vec3_scale(F_F(1.0)/rik,drk,drk); + + const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - + (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d +
// Next fused line: _fk finishes (dcosdrk built in 'drk'; includes the dfc
// term absent from _fj). Then the attractive()/attractive_fi()/
// attractive_fj() wrappers: each normalizes delij/delik — the squared pair
// distance is carried in the .w component of the F_FLOAT4 — and dispatches
// to the matching zetaterm variant; attractive_fj's final call continues on
// the next fused line.
(params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); + const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); + const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); + const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri + + const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); + const F_FLOAT dfc = ters_fc_d(rik,params[iparam].bigr,params[iparam].bigd); + + vec3_scale(fc*gijk_d*ex_delr,drk,drk); + vec3_scaleadd(dfc*gijk*ex_delr,rik_hat,drk,drk); + vec3_scaleadd(-fc*gijk*ex_delr_d,rik_hat,drk,drk); + vec3_scale(prefactor,drk,drk); +} + +__device__ void attractive(int iparam, F_FLOAT prefactor, + F_FLOAT4& delij, + F_FLOAT4& delik, + F_FLOAT3& fi, F_FLOAT3& fj, F_FLOAT3& fk) +{ + F_FLOAT3 rij_hat,rik_hat; + F_FLOAT rij,rijinv,rik,rikinv; + + rij = sqrt(delij.w); + rijinv = F_F(1.0)/rij; + vec3_scale(rijinv,delij,rij_hat); + + rik = sqrt(delik.w); + rikinv = F_F(1.0)/rik; + vec3_scale(rikinv,delik,rik_hat); + + ters_zetaterm_d(prefactor,rij_hat,rij,rik_hat,rik,fi,fj,fk,iparam); +} + +__device__ void attractive_fi(int& iparam, F_FLOAT& prefactor, + F_FLOAT4& delij, + F_FLOAT4& delik, + F_FLOAT3& f) +{ + F_FLOAT3 rij_hat,rik_hat; + F_FLOAT rij,rijinv,rik,rikinv; + + rij = sqrt(delij.w); + rijinv = F_F(1.0)/rij; + vec3_scale(rijinv,delij,rij_hat); + + rik = sqrt(delik.w); + rikinv = F_F(1.0)/rik; + vec3_scale(rikinv,delik,rik_hat); + + ters_zetaterm_d_fi(prefactor,rij_hat,rij,rik_hat,rik,f,iparam); +} + +__device__ void attractive_fj(int iparam, F_FLOAT prefactor, + F_FLOAT4& delij, + F_FLOAT4& delik, + F_FLOAT3& f) +{ + F_FLOAT3 rij_hat,rik_hat; + F_FLOAT rij,rijinv,rik,rikinv; + + rij = sqrt(delij.w); + rijinv = F_F(1.0)/rij; + vec3_scale(rijinv,delij,rij_hat); + + rik = sqrt(delik.w); + rikinv = F_F(1.0)/rik; + vec3_scale(rikinv,delik,rik_hat); + +
// Next fused line: attractive_fj()/attractive_fk() finish, then the kernel
// Pair_Tersoff_Kernel_TpA_RIJ builds the cutoff-reduced neighbor list and
// per-pair r_ij vectors (one thread per atom over _nall; results written to
// the _glob_* arrays with _nall stride). NOTE(review): this region is
// visibly garbled by the extraction — 'static_cast (myxtype.w)' has lost its
// template argument (presumably static_cast<int>), and 'if(jj (myxtype.w))]'
// has swallowed the text between the inner-loop guard and the jtype lookup
// (the j fetch, NEIGHMASK, and delij computation) — restore from the
// upstream file rather than guessing. Pair_Tersoff_Kernel_TpA_ZetaIJ then
// begins; its for-loop header continues on the next fused line.
 ters_zetaterm_d_fj(prefactor,rij_hat,rij,rik_hat,rik,f,iparam); +} + +__device__ void attractive_fk(int iparam, F_FLOAT prefactor, + F_FLOAT4& delij, + F_FLOAT4& delik, + F_FLOAT3& f) +{ + F_FLOAT3 rij_hat,rik_hat; + F_FLOAT rij,rijinv,rik,rikinv; + + rij = sqrt(delij.w); + rijinv = F_F(1.0)/rij; + vec3_scale(rijinv,delij,rij_hat); + + rik = sqrt(delik.w); + rikinv = F_F(1.0)/rik; + vec3_scale(rikinv,delik,rik_hat); + + ters_zetaterm_d_fk(prefactor,rij_hat,rij,rik_hat,rik,f,iparam); +} + +__global__ void Pair_Tersoff_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +{ + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if( ii >= _nall ) return; + + X_FLOAT4 myxtype; + F_FLOAT4 delij; + F_FLOAT xtmp,ytmp,ztmp; + int itype,jnum,i,j; + int* jlist; + int neigh_red = 0; + i = ii;//_ilist[ii]; + myxtype = fetchXType(i); + + xtmp=myxtype.x; + ytmp=myxtype.y; + ztmp=myxtype.z; + itype=map[(static_cast (myxtype.w))]; + + jnum = _numneigh[i]; + jlist = &_neighbors[i]; + + __syncthreads(); + for (int jj = 0; jj < jnum; jj++) + { + if(jj (myxtype.w))]; + int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; + delij.w = vec3_dot(delij,delij); + if (delij.w < params[iparam_ij].cutsq) + { + _glob_neighbors_red[i+neigh_red*_nall]=j; + _glob_neightype_red[i+neigh_red*_nall]=jtype; + _glob_r_ij[i+neigh_red*_nall]=delij; + neigh_red++; + } + } + } + _glob_numneigh_red[i]=neigh_red; +} + + + __global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) + { + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + if( ii >= _nall ) return; + + + F_FLOAT4 delij; + F_FLOAT4 delik; + + int itype,jnum,i,j; + int* jlist; + i = ii; + itype=map[(static_cast (_type[i]))]; + + jnum = _glob_numneigh_red[i]; + jlist = &_glob_neighbors_red[i]; + + __syncthreads(); + for (int jj =
// Next fused line: NOTE(review): also garbled — 'if(jj + __global__ void
// Pair_Tersoff_Kernel_TpA' splices the ZetaIJ loop body directly into the
// next kernel's declaration; the zeta_ij accumulation over k neighbors and
// the closing of the ZetaIJ kernel are missing here. Pair_Tersoff_Kernel_TpA
// (the main force kernel) then begins: per-thread energy/virial accumulators
// in dynamic shared memory ('sharedmem', declared elsewhere), the
// fxtmp/fytmp/fztmp force accumulators #define'd into shared-memory slots,
// and traversal of the reduced neighbor list built by the RIJ kernel. Also
// note the '//' after the kernel's parameter list comments out the remainder
// of this fused physical line — a fusion artifact.
0; jj < jnum; jj++) + { + if(jj + __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom,int vflag_atom)//,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x]; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem; + if((eflag||eflag_atom)&&(vflagm||vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7*blockDim.x]; + else + if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x]; + else + if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6*blockDim.x]; + shared_F_F+=threadIdx.x; + + if(eflag_atom||eflag) + { + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + + if(vflagm||vflag_atom) + { + sharedV[0*blockDim.x] = ENERGY_F(0.0); + sharedV[1*blockDim.x] = ENERGY_F(0.0); + sharedV[2*blockDim.x] = ENERGY_F(0.0); + sharedV[3*blockDim.x] = ENERGY_F(0.0); + sharedV[4*blockDim.x] = ENERGY_F(0.0); + sharedV[5*blockDim.x] = ENERGY_F(0.0); + } + + int jnum_red=0; +#define fxtmp shared_F_F[0] +#define fytmp shared_F_F[blockDim.x] +#define fztmp shared_F_F[2*blockDim.x] +//#define jnum_red (static_cast (shared_F_F[3*blockDim.x])) + + int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; + + X_FLOAT4 myxtype_i,myxtype_j,myxtype_k; + F_FLOAT4 delij,delik,deljk; + F_FLOAT fpair; + F_FLOAT prefactor_ij,prefactor_ji; + + int itype,i,j; + int* jlist_red; + + if(ii < _inum) + { + i = _ilist[ii]; + + if(vflagm) + myxtype_i=fetchXType(i); + //itype=map[(static_cast (myxtype_i.w))]; + itype=map[_type[i]]; + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + + //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i]; + jnum_red = _glob_numneigh_red[i]; + jlist_red = &_glob_neighbors_red[i]; + } + __syncthreads(); + +#pragma unroll 1 + for (int jj = 0; jj < jnum_red; jj++) + { + if(i < _nlocal) + { + fpair=F_F(0.0); + j = jlist_red[jj*_nall]; + j &= NEIGHMASK; + +
// Next fused line: the force kernel's pair loop continues (reduced-list
// jtype/delij loads, elem2param lookups for ij and ji). NOTE(review): this
// tail is heavily truncated — 'if (delij.w(); +#undef' has lost everything
// between the cutoff test and the trailing #undef lines: presumably the
// repulsive()/force_zeta() calls, the three-body attractive accumulation,
// and the final energy/virial reduction call whose '<...>()' template
// invocation was eaten by the extraction. Recover this span from the
// upstream pair_tersoff_cuda_kernel_nc.cu; do not reconstruct by guesswork.
if(vflagm) + myxtype_j = fetchXType(j); + + int jtype = _glob_neightype_red[i+jj*_nall]; + delij = _glob_r_ij[i+jj*_nall]; + + volatile int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; + volatile int iparam_ji = elem2param[(jtype*nelements+itype)*nelements+itype]; + + if (delij.w(); +#undef fxtmp +#undef fytmp +#undef fztmp +//#undef jnum_red +}