/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov See the README file in the top-level LAMMPS directory. ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de Theoretical Physics II, University of Technology Ilmenau, Germany See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ #define Pi F_F(3.1415926535897932384626433832795) #define PI Pi #define PI2 F_F(0.5)*Pi #define PI4 F_F(0.25)*Pi template static inline __device__ void PairVirialCompute_A_Kernel_Template() { __syncthreads(); ENERGY_FLOAT* shared=sharedmem; if(eflag) { reduceBlock(shared); shared+=blockDim.x; } if(vflag) { reduceBlock(shared + 0 * blockDim.x); reduceBlock(shared + 1 * blockDim.x); reduceBlock(shared + 2 * blockDim.x); reduceBlock(shared + 3 * blockDim.x); reduceBlock(shared + 4 * blockDim.x); reduceBlock(shared + 5 * blockDim.x); } if(threadIdx.x == 0) { shared=sharedmem; ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; if(eflag) { buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0]; shared+=blockDim.x; buffer+=gridDim.x * gridDim.y; } if(vflag) { buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[0 * blockDim.x]; buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[1 * blockDim.x]; buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[2 * blockDim.x]; buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[3 * blockDim.x]; buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[4 * blockDim.x]; buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[5 * blockDim.x]; } } __syncthreads(); } __global__ void virial_fdotr_compute_kernel(int eflag) { int i = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; ENERGY_FLOAT* sharedE = (ENERGY_FLOAT*) &sharedmem[0]; ENERGY_FLOAT* sharedVirial = (ENERGY_FLOAT*) &sharedE[blockDim.x]; sharedE+=threadIdx.x; sharedVirial+=threadIdx.x; if(i<_nlocal) { F_FLOAT x = _x[i]; F_FLOAT y = _x[i+_nmax]; F_FLOAT z = _x[i+2*_nmax]; F_FLOAT fx = _f[i]; F_FLOAT fy = _f[i+_nmax]; F_FLOAT fz = _f[i+2*_nmax]; //if(fz*z*fz*z>1e-5) printf("V %i %i %e %e %e %e %e %e\n",i,_tag[i],x,y,z,fx,fy,fz); sharedVirial[0] = fx*x; sharedVirial[1*blockDim.x] = fy*y; sharedVirial[2*blockDim.x] = fz*z; sharedVirial[3*blockDim.x] = fy*x; sharedVirial[4*blockDim.x] = fz*x; sharedVirial[5*blockDim.x] = fz*y; } else { sharedVirial[0] = 0; sharedVirial[1*blockDim.x] = 0; sharedVirial[2*blockDim.x] = 0; sharedVirial[3*blockDim.x] = 0; sharedVirial[4*blockDim.x] = 0; sharedVirial[5*blockDim.x] = 0; } sharedVirial = (ENERGY_FLOAT*) &sharedmem[0]; sharedVirial += blockDim.x; reduceBlockP2(sharedVirial); reduceBlockP2(&sharedVirial[1*blockDim.x]); reduceBlockP2(&sharedVirial[2*blockDim.x]); reduceBlockP2(&sharedVirial[3*blockDim.x]); reduceBlockP2(&sharedVirial[4*blockDim.x]); reduceBlockP2(&sharedVirial[5*blockDim.x]); if(threadIdx.x<6) { ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; if(eflag) buffer = &buffer[gridDim.x*gridDim.y]; buffer[blockIdx.x * gridDim.y + blockIdx.y + threadIdx.x * gridDim.x * gridDim.y]= sharedVirial[threadIdx.x*blockDim.x]; } } /*#define vec3_scale(K,X,Y) Y.x = K*X.x; Y.y = K*X.y; Y.z = K*X.z; #define vec3_scaleadd(K,X,Y,Z) Z.x = K*X.x+Y.x; Z.y = K*X.y+Y.y; Z.z = K*X.z+Y.z; #define vec3_add(X,Y,Z) Z.x = X.x+Y.x; Z.y = X.y+Y.y; Z.z = X.z+Y.z; #define vec3_dot(X,Y) (X.x*Y.x + X.y*Y.y + X.z*Y.z)*/ __device__ inline void vec3_scale(F_FLOAT k, F_FLOAT3& x, F_FLOAT3& y) { y.x = k*x.x; y.y = k*x.y; y.z = k*x.z; } __device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4& x, F_FLOAT3& y) { y.x = k*x.x; y.y = k*x.y; y.z = k*x.z; } __device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4& x, F_FLOAT4& y) { y.x = k*x.x; y.y = k*x.y; y.z = k*x.z; } __device__ inline void vec3_scaleadd(F_FLOAT k, F_FLOAT3& x, F_FLOAT3& y, F_FLOAT3& z) { z.x = k*x.x+y.x; z.y = k*x.y+y.y; z.z = k*x.z+y.z; } __device__ inline void vec3_add(F_FLOAT3& x, F_FLOAT3& y, F_FLOAT3& z) { z.x = x.x+y.x; z.y = x.y+y.y; z.z = x.z+y.z; } __device__ inline F_FLOAT vec3_dot(F_FLOAT3 x, F_FLOAT3 y) { return x.x*y.x + x.y*y.y + x.z*y.z; } __device__ inline F_FLOAT vec3_dot(F_FLOAT4 x, F_FLOAT4 y) { return x.x*y.x + x.y*y.y + x.z*y.z; } /* ---------------------------------------------------------------------- Fermi-like smoothing function ------------------------------------------------------------------------- */ __device__ inline F_FLOAT F_fermi(F_FLOAT &r, int &iparam) { return F_F(1.0) / (F_F(1.0) + exp(-params[iparam].ZBLexpscale*(r-params[iparam].ZBLcut))); } /* ---------------------------------------------------------------------- Fermi-like smoothing function derivative with respect to r ------------------------------------------------------------------------- */ __device__ inline F_FLOAT F_fermi_d(F_FLOAT &r, int &iparam) { volatile const F_FLOAT tmp = exp(-params[iparam].ZBLexpscale*(r-params[iparam].ZBLcut)); return params[iparam].ZBLexpscale*tmp / ((F_F(1.0) +tmp)*(F_F(1.0) +tmp)); } __device__ inline F_FLOAT ters_fc(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) { return (r < ters_R-ters_D)?F_F(1.0):((r > ters_R+ters_D)? F_F(0.0):F_F(0.5)*(F_F(1.0) - sin(PI2*(r - ters_R)/ters_D))); } __device__ inline F_FLOAT ters_fc_d(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) { return ((r < ters_R-ters_D)||(r > ters_R+ters_D))? F_F(0.0):-(PI4/ters_D) * cos(PI2*(r - ters_R)/ters_D); } __device__ inline F_FLOAT ters_gijk(F_FLOAT& cos_theta, int iparam) { F_FLOAT ters_c = params[iparam].c; F_FLOAT ters_d = params[iparam].d; return params[iparam].gamma*(F_F(1.0) + pow(params[iparam].c/params[iparam].d,F_F(2.0)) - pow(ters_c,F_F(2.0)) / (pow(ters_d,F_F(2.0)) + pow(params[iparam].h - cos_theta,F_F(2.0)))); } __device__ F_FLOAT ters_gijk2(F_FLOAT& cos_theta, int iparam) { F_FLOAT ters_c = params[iparam].c; F_FLOAT ters_d = params[iparam].d; return params[iparam].gamma*(F_F(1.0) + pow(ters_c/ters_d,F_F(2.0)) - pow(ters_c,F_F(2.0)) / (pow(ters_d,F_F(2.0)) + pow(params[iparam].h - cos_theta,F_F(2.0)))); } __device__ inline F_FLOAT ters_gijk_d(F_FLOAT costheta, int iparam) { F_FLOAT numerator = -F_F(2.0) * pow(params[iparam].c,F_F(2.0)) * (params[iparam].h - costheta); F_FLOAT denominator = pow(pow(params[iparam].d,F_F(2.0)) + pow(params[iparam].h - costheta,F_F(2.0)),F_F(2.0)); return params[iparam].gamma*numerator/denominator; } __device__ inline F_FLOAT zeta(int iparam, const F_FLOAT rsqij, const F_FLOAT rsqik, F_FLOAT3& delij, F_FLOAT3& delik) { F_FLOAT rij,rik,costheta,arg,ex_delr; rij = sqrt(rsqij); rik = sqrt(rsqik); costheta = vec3_dot(delij,delik) / (rij*rik); arg = (params[iparam].powermint == 3)? (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)) : params[iparam].lam3 * (rij-rik); if (arg > F_F(69.0776)) ex_delr = F_F(1.e30); else if (arg < -F_F(69.0776)) ex_delr = F_F(0.0); else ex_delr = exp(arg); return ters_fc(rik,params[iparam].bigr,params[iparam].bigd) * ex_delr * params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c/(params[iparam].d*params[iparam].d)) - (params[iparam].c*params[iparam].c) / ((params[iparam].d*params[iparam].d) + (params[iparam].h - costheta)*(params[iparam].h - costheta))); } __device__ void repulsive(int iparam, F_FLOAT rsq, F_FLOAT &fforce, int eflag, ENERGY_FLOAT &eng) { F_FLOAT r,tmp_fc,tmp_fc_d,tmp_exp; F_FLOAT ters_R = params[iparam].bigr; F_FLOAT ters_D = params[iparam].bigd; r = sqrt(rsq); tmp_fc = ters_fc(r,ters_R,ters_D); tmp_fc_d = ters_fc_d(r,ters_R,ters_D); tmp_exp = exp(-params[iparam].lam1 * r); if(!_zbl) { fforce = -params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc*params[iparam].lam1) / r; if (eflag) eng += tmp_fc * params[iparam].biga * tmp_exp; } else { F_FLOAT const fforce_ters = params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc*params[iparam].lam1); ENERGY_FLOAT eng_ters = tmp_fc * params[iparam].biga * tmp_exp; F_FLOAT r_ov_a = r/params[iparam].a_ij; F_FLOAT phi = F_F(0.1818)*exp(-F_F(3.2)*r_ov_a) + F_F(0.5099)*exp(-F_F(0.9423)*r_ov_a) + F_F(0.2802)*exp(-F_F(0.4029)*r_ov_a) + F_F(0.02817)*exp(-F_F(0.2016)*r_ov_a); F_FLOAT dphi = (F_F(1.0)/params[iparam].a_ij) * (-F_F(3.2)*F_F(0.1818)*exp(-F_F(3.2)*r_ov_a) - F_F(0.9423)*F_F(0.5099)*exp(-F_F(0.9423)*r_ov_a) - F_F(0.4029)*F_F(0.2802)*exp(-F_F(0.4029)*r_ov_a) - F_F(0.2016)*F_F(0.02817)*exp(-F_F(0.2016)*r_ov_a)); F_FLOAT fforce_ZBL = params[iparam].premult/(-r*r)* phi + params[iparam].premult/r*dphi; ENERGY_FLOAT eng_ZBL = params[iparam].premult*(F_F(1.0)/r)*phi; fforce = -(-F_fermi_d(r,iparam) * (eng_ZBL - eng_ters) + fforce_ZBL + F_fermi(r,iparam)*(fforce_ters-fforce_ZBL))/r; if(eflag) eng += eng_ZBL + F_fermi(r,iparam)*(eng_ters-eng_ZBL); } } /* ---------------------------------------------------------------------- */ __device__ inline F_FLOAT ters_fa(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) { if (r > ters_R + ters_D) return F_F(0.0); if(_zbl) return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r,ters_R,ters_D) * F_fermi(r,iparam); else return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r,ters_R,ters_D); } /* ---------------------------------------------------------------------- */ __device__ inline F_FLOAT ters_fa_d(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) { if (r > ters_R + ters_D) return F_F(0.0); if(_zbl) return params[iparam].bigb * exp(-params[iparam].lam2 * r) * ((params[iparam].lam2 * ters_fc(r,ters_R,ters_D) - ters_fc_d(r,ters_R,ters_D))*F_fermi(r,iparam) -ters_fc(r,ters_R,ters_D)*F_fermi_d(r,iparam)); else return params[iparam].bigb * exp(-params[iparam].lam2 * r) * (params[iparam].lam2 * ters_fc(r,ters_R,ters_D) - ters_fc_d(r,ters_R,ters_D)); } /* ---------------------------------------------------------------------- */ __device__ inline F_FLOAT ters_bij(F_FLOAT zeta, int iparam) { F_FLOAT tmp = params[iparam].beta * zeta; if (tmp > params[iparam].c1) return F_F(1.0)/sqrt(tmp); if (tmp > params[iparam].c2) return (F_F(1.0) - pow(tmp,-params[iparam].powern) / (F_F(2.0)*params[iparam].powern))/sqrt(tmp); if (tmp < params[iparam].c4) return F_F(1.0); if (tmp < params[iparam].c3) return F_F(1.0) - pow(tmp,params[iparam].powern)/(F_F(2.0)*params[iparam].powern); return pow(F_F(1.0) + pow(tmp,params[iparam].powern), -F_F(1.0)/(F_F(2.0)*params[iparam].powern)); } /* ---------------------------------------------------------------------- */ __device__ inline F_FLOAT ters_bij_d(F_FLOAT zeta, int iparam) { F_FLOAT tmp = params[iparam].beta * zeta; if (tmp > params[iparam].c1) return params[iparam].beta * -F_F(0.5)*pow(tmp,-F_F(1.5)); if (tmp > params[iparam].c2) return params[iparam].beta * (-F_F(0.5)*pow(tmp,-F_F(1.5)) * (F_F(1.0) - F_F(0.5)*(F_F(1.0) + F_F(1.0)/(F_F(2.0)*params[iparam].powern)) * pow(tmp,-params[iparam].powern))); if (tmp < params[iparam].c4) return F_F(0.0); if (tmp < params[iparam].c3) return -F_F(0.5)*params[iparam].beta * pow(tmp,params[iparam].powern-F_F(1.0)); F_FLOAT tmp_n = pow(tmp,params[iparam].powern); return -F_F(0.5) * pow(F_F(1.0)+tmp_n, -F_F(1.0)-(F_F(1.0)/(F_F(2.0)*params[iparam].powern)))*tmp_n / zeta; } __device__ void force_zeta(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, F_FLOAT &fforce, F_FLOAT &prefactor, int eflag, F_FLOAT &eng) { F_FLOAT r,fa,fa_d,bij; F_FLOAT ters_R = params[iparam].bigr; F_FLOAT ters_D = params[iparam].bigd; r = sqrt(rsq); fa = ters_fa(r,iparam,ters_R,ters_D); fa_d = ters_fa_d(r,iparam,ters_R,ters_D); bij = ters_bij(zeta_ij,iparam); fforce = F_F(0.5)*bij*fa_d / r; prefactor = -F_F(0.5)*fa * ters_bij_d(zeta_ij,iparam); if (eflag) eng += bij*fa; } __device__ void force_zeta_prefactor_force(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, F_FLOAT &fforce, F_FLOAT &prefactor) { F_FLOAT r,fa,fa_d,bij; F_FLOAT ters_R = params[iparam].bigr; F_FLOAT ters_D = params[iparam].bigd; r = sqrt(rsq); fa = ters_fa(r,iparam,ters_R,ters_D); fa_d = ters_fa_d(r,iparam,ters_R,ters_D); bij = ters_bij(zeta_ij,iparam); fforce = F_F(0.5)*bij*fa_d / r; prefactor = -F_F(0.5)*fa * ters_bij_d(zeta_ij,iparam); } __device__ void force_zeta_prefactor(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, F_FLOAT &prefactor) { F_FLOAT r,fa; r = sqrt(rsq); fa = ters_fa(r,iparam,params[iparam].bigr,params[iparam].bigd); prefactor = -F_F(0.5)*fa*ters_bij_d(zeta_ij,iparam); } __device__ void costheta_d(F_FLOAT3& rij_hat, F_FLOAT& rij, F_FLOAT3& rik_hat, F_FLOAT& rik, F_FLOAT3& dri, F_FLOAT3& drj, F_FLOAT3& drk) { // first element is derivative wrt Ri, second wrt Rj, third wrt Rk F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj); vec3_scale(F_F(1.0)/rij,drj,drj); vec3_scaleadd(-cos_theta,rik_hat,rij_hat,drk); vec3_scale(F_F(1.0)/rik,drk,drk); vec3_add(drj,drk,dri); vec3_scale(-F_F(1.0),dri,dri); } __device__ void ters_zetaterm_d(F_FLOAT prefactor, F_FLOAT3& rij_hat, F_FLOAT rij, F_FLOAT3& rik_hat, F_FLOAT rik, F_FLOAT3& dri, F_FLOAT3& drj, F_FLOAT3& drk, int iparam) { F_FLOAT ex_delr,ex_delr_d,tmp; F_FLOAT3 dcosdri,dcosdrj,dcosdrk; if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); else tmp = params[iparam].lam3 * (rij-rik); if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); else ex_delr = exp(tmp); if (params[iparam].powermint == 3) ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; else ex_delr_d = params[iparam].lam3 * ex_delr; const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri // dri = -dfc*gijk*ex_delr*rik_hat; // dri += fc*gijk_d*ex_delr*dcosdri; // dri += fc*gijk*ex_delr_d*(rik_hat - rij_hat); const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); const F_FLOAT dfc = ters_fc_d(rik,params[iparam].bigr,params[iparam].bigd); vec3_scale(-dfc*gijk*ex_delr,rik_hat,dri); vec3_scaleadd(fc*gijk_d*ex_delr,dcosdri,dri,dri); vec3_scaleadd(fc*gijk*ex_delr_d,rik_hat,dri,dri); vec3_scaleadd(-fc*gijk*ex_delr_d,rij_hat,dri,dri); vec3_scale(prefactor,dri,dri); // compute the derivative wrt Rj // drj = fc*gijk_d*ex_delr*dcosdrj; // drj += fc*gijk*ex_delr_d*rij_hat; vec3_scale(fc*gijk_d*ex_delr,dcosdrj,drj); vec3_scaleadd(fc*gijk*ex_delr_d,rij_hat,drj,drj); vec3_scale(prefactor,drj,drj); // compute the derivative wrt Rk // drk = dfc*gijk*ex_delr*rik_hat; // drk += fc*gijk_d*ex_delr*dcosdrk; // drk += -fc*gijk*ex_delr_d*rik_hat; vec3_scale(dfc*gijk*ex_delr,rik_hat,drk); vec3_scaleadd(fc*gijk_d*ex_delr,dcosdrk,drk,drk); vec3_scaleadd(-fc*gijk*ex_delr_d,rik_hat,drk,drk); vec3_scale(prefactor,drk,drk); } __device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor, F_FLOAT3& rij_hat, F_FLOAT &rij, F_FLOAT3& rik_hat, F_FLOAT &rik, F_FLOAT3& dri, int &iparam) { F_FLOAT ex_delr,ex_delr_d,tmp; if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); else tmp = params[iparam].lam3 * (rij-rik); if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); else ex_delr = exp(tmp); if (params[iparam].powermint == 3) ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; else ex_delr_d = params[iparam].lam3 * ex_delr; const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); //costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); F_FLOAT3 dcosdri; vec3_scaleadd(-cos_theta,rij_hat,rik_hat,dri); vec3_scale(F_F(1.0)/rij,dri,dri); vec3_scaleadd(-cos_theta,rik_hat,rij_hat,dcosdri); vec3_scale(F_F(1.0)/rik,dcosdri,dcosdri); vec3_add(dri,dcosdri,dcosdri); vec3_scale(-F_F(1.0),dcosdri,dcosdri); const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri // const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); const F_FLOAT dfc = ters_fc_d(rik,params[iparam].bigr,params[iparam].bigd); vec3_scale(-dfc*gijk*ex_delr,rik_hat,dri); vec3_scaleadd(fc*gijk_d*ex_delr,dcosdri,dri,dri); vec3_scaleadd(fc*gijk*ex_delr_d,rik_hat,dri,dri); vec3_scaleadd(-fc*gijk*ex_delr_d,rij_hat,dri,dri); vec3_scale(prefactor,dri,dri); } __device__ void ters_zetaterm_d_fj(F_FLOAT &prefactor, F_FLOAT3& rij_hat, F_FLOAT &rij, F_FLOAT3& rik_hat, F_FLOAT &rik, F_FLOAT3& drj, int &iparam) { F_FLOAT ex_delr,ex_delr_d,tmp; if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); else tmp = params[iparam].lam3 * (rij-rik); if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); else ex_delr = exp(tmp); if (params[iparam].powermint == 3) ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; else ex_delr_d = params[iparam].lam3 * ex_delr; const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj); vec3_scale(F_F(1.0)/rij,drj,drj); const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); vec3_scale(fc*gijk_d*ex_delr,drj,drj); vec3_scaleadd(fc*gijk*ex_delr_d,rij_hat,drj,drj); vec3_scale(prefactor,drj,drj); } __device__ void ters_zetaterm_d_fk(F_FLOAT &prefactor, F_FLOAT3& rij_hat, F_FLOAT &rij, F_FLOAT3& rik_hat, F_FLOAT &rik, F_FLOAT3& drk, int &iparam) { F_FLOAT ex_delr,ex_delr_d,tmp; if (params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)*params[iparam].lam3 * (rij-rik)); else tmp = params[iparam].lam3 * (rij-rik); if (tmp > F_F(69.0776)) ex_delr = F_F(1.e30); else if (tmp < -F_F(69.0776)) ex_delr = F_F(0.0); else ex_delr = exp(tmp); if (params[iparam].powermint == 3) ex_delr_d = F_F(3.0)*(params[iparam].lam3*params[iparam].lam3*params[iparam].lam3) * (rij-rik)*(rij-rik)*ex_delr; else ex_delr_d = params[iparam].lam3 * ex_delr; const F_FLOAT cos_theta = vec3_dot(rij_hat,rik_hat); vec3_scaleadd(-cos_theta,rik_hat,rij_hat,drk); vec3_scale(F_F(1.0)/rik,drk,drk); const F_FLOAT gijk = params[iparam].gamma*(F_F(1.0) + (params[iparam].c*params[iparam].c)/(params[iparam].d*params[iparam].d) - (params[iparam].c*params[iparam].c) / (params[iparam].d*params[iparam].d + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta))); const F_FLOAT numerator = -F_F(2.0) * params[iparam].c*params[iparam].c * (params[iparam].h - cos_theta); const F_FLOAT denominator = (params[iparam].d*params[iparam].d) + (params[iparam].h - cos_theta)*(params[iparam].h - cos_theta); const F_FLOAT gijk_d = params[iparam].gamma*numerator/(denominator*denominator); // compute the derivative wrt Ri const F_FLOAT fc = ters_fc(rik,params[iparam].bigr,params[iparam].bigd); const F_FLOAT dfc = ters_fc_d(rik,params[iparam].bigr,params[iparam].bigd); vec3_scale(fc*gijk_d*ex_delr,drk,drk); vec3_scaleadd(dfc*gijk*ex_delr,rik_hat,drk,drk); vec3_scaleadd(-fc*gijk*ex_delr_d,rik_hat,drk,drk); vec3_scale(prefactor,drk,drk); } __device__ void attractive(int iparam, F_FLOAT prefactor, F_FLOAT4& delij, F_FLOAT4& delik, F_FLOAT3& fi, F_FLOAT3& fj, F_FLOAT3& fk) { F_FLOAT3 rij_hat,rik_hat; F_FLOAT rij,rijinv,rik,rikinv; rij = sqrt(delij.w); rijinv = F_F(1.0)/rij; vec3_scale(rijinv,delij,rij_hat); rik = sqrt(delik.w); rikinv = F_F(1.0)/rik; vec3_scale(rikinv,delik,rik_hat); ters_zetaterm_d(prefactor,rij_hat,rij,rik_hat,rik,fi,fj,fk,iparam); } __device__ void attractive_fi(int& iparam, F_FLOAT& prefactor, F_FLOAT4& delij, F_FLOAT4& delik, F_FLOAT3& f) { F_FLOAT3 rij_hat,rik_hat; F_FLOAT rij,rijinv,rik,rikinv; rij = sqrt(delij.w); rijinv = F_F(1.0)/rij; vec3_scale(rijinv,delij,rij_hat); rik = sqrt(delik.w); rikinv = F_F(1.0)/rik; vec3_scale(rikinv,delik,rik_hat); ters_zetaterm_d_fi(prefactor,rij_hat,rij,rik_hat,rik,f,iparam); } __device__ void attractive_fj(int iparam, F_FLOAT prefactor, F_FLOAT4& delij, F_FLOAT4& delik, F_FLOAT3& f) { F_FLOAT3 rij_hat,rik_hat; F_FLOAT rij,rijinv,rik,rikinv; rij = sqrt(delij.w); rijinv = F_F(1.0)/rij; vec3_scale(rijinv,delij,rij_hat); rik = sqrt(delik.w); rikinv = F_F(1.0)/rik; vec3_scale(rikinv,delik,rik_hat); ters_zetaterm_d_fj(prefactor,rij_hat,rij,rik_hat,rik,f,iparam); } __device__ void attractive_fk(int iparam, F_FLOAT prefactor, F_FLOAT4& delij, F_FLOAT4& delik, F_FLOAT3& f) { F_FLOAT3 rij_hat,rik_hat; F_FLOAT rij,rijinv,rik,rikinv; rij = sqrt(delij.w); rijinv = F_F(1.0)/rij; vec3_scale(rijinv,delij,rij_hat); rik = sqrt(delik.w); rikinv = F_F(1.0)/rik; vec3_scale(rikinv,delik,rik_hat); ters_zetaterm_d_fk(prefactor,rij_hat,rij,rik_hat,rik,f,iparam); } __global__ void Pair_Tersoff_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) { int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; if( ii >= _nall ) return; X_FLOAT4 myxtype; F_FLOAT4 delij; F_FLOAT xtmp,ytmp,ztmp; int itype,jnum,i,j; int* jlist; int neigh_red = 0; i = ii;//_ilist[ii]; myxtype = fetchXType(i); xtmp=myxtype.x; ytmp=myxtype.y; ztmp=myxtype.z; itype=map[(static_cast (myxtype.w))]; jnum = _numneigh[i]; jlist = &_neighbors[i]; __syncthreads(); for (int jj = 0; jj < jnum; jj++) { if(jj (myxtype.w))]; int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; delij.w = vec3_dot(delij,delij); if (delij.w < params[iparam_ij].cutsq) { _glob_neighbors_red[i+neigh_red*_nall]=j; _glob_neightype_red[i+neigh_red*_nall]=jtype; _glob_r_ij[i+neigh_red*_nall]=delij; neigh_red++; } } } _glob_numneigh_red[i]=neigh_red; } __global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) { int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; if( ii >= _nall ) return; F_FLOAT4 delij; F_FLOAT4 delik; int itype,jnum,i,j; int* jlist; i = ii; itype=map[(static_cast (_type[i]))]; jnum = _glob_numneigh_red[i]; jlist = &_glob_neighbors_red[i]; __syncthreads(); for (int jj = 0; jj < jnum; jj++) { if(jj __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom,int vflag_atom)//,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) { ENERGY_FLOAT evdwl = ENERGY_F(0.0); ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x]; ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem; if((eflag||eflag_atom)&&(vflagm||vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7*blockDim.x]; else if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x]; else if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6*blockDim.x]; shared_F_F+=threadIdx.x; if(eflag_atom||eflag) { sharedE[0] = ENERGY_F(0.0); sharedV += blockDim.x; } if(vflagm||vflag_atom) { sharedV[0*blockDim.x] = ENERGY_F(0.0); sharedV[1*blockDim.x] = ENERGY_F(0.0); sharedV[2*blockDim.x] = ENERGY_F(0.0); sharedV[3*blockDim.x] = ENERGY_F(0.0); sharedV[4*blockDim.x] = ENERGY_F(0.0); sharedV[5*blockDim.x] = ENERGY_F(0.0); } int jnum_red=0; #define fxtmp shared_F_F[0] #define fytmp shared_F_F[blockDim.x] #define fztmp shared_F_F[2*blockDim.x] //#define jnum_red (static_cast (shared_F_F[3*blockDim.x])) int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; X_FLOAT4 myxtype_i,myxtype_j,myxtype_k; F_FLOAT4 delij,delik,deljk; F_FLOAT fpair; F_FLOAT prefactor_ij,prefactor_ji; int itype,i,j; int* jlist_red; if(ii < _inum) { i = _ilist[ii]; if(vflagm) myxtype_i=fetchXType(i); //itype=map[(static_cast (myxtype_i.w))]; itype=map[_type[i]]; fxtmp = F_F(0.0); fytmp = F_F(0.0); fztmp = F_F(0.0); //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i]; jnum_red = _glob_numneigh_red[i]; jlist_red = &_glob_neighbors_red[i]; } __syncthreads(); #pragma unroll 1 for (int jj = 0; jj < jnum_red; jj++) { if(i < _nlocal) { fpair=F_F(0.0); j = jlist_red[jj*_nall]; j &= NEIGHMASK; if(vflagm) myxtype_j = fetchXType(j); int jtype = _glob_neightype_red[i+jj*_nall]; delij = _glob_r_ij[i+jj*_nall]; volatile int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype]; volatile int iparam_ji = elem2param[(jtype*nelements+itype)*nelements+itype]; if (delij.w(); else if(eflag) PairVirialCompute_A_Kernel_Template<1,0>(); else if(vflagm) PairVirialCompute_A_Kernel_Template<0,1>(); #undef fxtmp #undef fytmp #undef fztmp //#undef jnum_red }