/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov See the README file in the top-level LAMMPS directory. ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de Theoretical Physics II, University of Technology Ilmenau, Germany See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ static __device__ inline F_FLOAT4 fetchRhor(int i) { #ifdef CUDA_USE_TEXTURE #if F_PRECISION == 1 return tex1Dfetch(_rhor_spline_tex,i); #else return tex1Dfetch_double_f(_rhor_spline_tex,i); #endif #else return _rhor_spline[i]; #endif } static __device__ inline F_FLOAT4 fetchZ2r(int i) { #ifdef CUDA_USE_TEXTURE #if F_PRECISION == 1 return tex1Dfetch(_z2r_spline_tex,i); #else return tex1Dfetch_double_f(_z2r_spline_tex,i); #endif #else return _z2r_spline[i]; #endif } __global__ void PairEAMCuda_Kernel1(int eflag, int vflag,int eflag_atom,int vflag_atom) { ENERGY_FLOAT* sharedE; ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; if(eflag||eflag_atom) { sharedE = &sharedmem[threadIdx.x]; sharedE[0] = ENERGY_F(0.0); sharedV += blockDim.x; } if(vflag||vflag_atom) { sharedV[0*blockDim.x] = ENERGY_F(0.0); sharedV[1*blockDim.x] = ENERGY_F(0.0); sharedV[2*blockDim.x] = ENERGY_F(0.0); sharedV[3*blockDim.x] = ENERGY_F(0.0); sharedV[4*blockDim.x] = ENERGY_F(0.0); sharedV[5*blockDim.x] = ENERGY_F(0.0); } int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; X_FLOAT xtmp,ytmp,ztmp; X_FLOAT4 myxtype; F_FLOAT delx,dely,delz; int itype; int i=_nlocal; int jnum=0; int* jlist; if(ii < _inum) { i = _ilist[ii]; myxtype=fetchXType(i); xtmp=myxtype.x; ytmp=myxtype.y; ztmp=myxtype.z; itype=static_cast (myxtype.w); jnum = _numneigh[i]; jlist = &_neighbors[i]; if(i<_nlocal) _rho[i]=F_F(0.0); } __syncthreads(); for (int jj = 0; jj < jnum; jj++) { if(ii < _inum) if(jj (myxtype.w); const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; if (rsq < _cutsq_global) { F_FLOAT p = sqrt(rsq)*_rdr + F_F(1.0); int m = static_cast (p); m = MIN(m,_nr-1); p -= m; p = MIN(p,F_F(1.0)); int k=(static_cast (_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2; F_FLOAT4 c=fetchRhor(k+1); _rho[i] += ((c.w*p+c.x)*p+c.y)*p+c.z; } } } if(ii < _inum) { F_FLOAT p = _rho[i]*_rdrho + F_F(1.0); int m = static_cast (p); m = MAX(1,MIN(m,_nrho-1)); p -= m; p = MIN(p,F_F(1.0)); F_FLOAT* coeff = &_frho_spline[(static_cast (_type2frho[itype])*(_nrho+1)+m)*EAM_COEFF_LENGTH]; _fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2]; if (eflag||eflag_atom) { sharedmem[threadIdx.x] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; } } __syncthreads(); if(eflag||eflag_atom) { if(i<_nlocal&&eflag_atom) _eatom[i]+=sharedmem[threadIdx.x]; reduceBlock(sharedmem); ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0)*sharedmem[0]; } } __global__ void PairEAMCuda_Kernel2(int eflag, int vflag,int eflag_atom,int vflag_atom) { ENERGY_FLOAT evdwl = ENERGY_F(0.0); ENERGY_FLOAT* sharedE; ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; if(eflag||eflag_atom) { sharedE = &sharedmem[threadIdx.x]; sharedE[0] = ENERGY_F(0.0); sharedV += blockDim.x; } if(vflag||vflag_atom) { sharedV[0*blockDim.x] = ENERGY_F(0.0); sharedV[1*blockDim.x] = ENERGY_F(0.0); sharedV[2*blockDim.x] = ENERGY_F(0.0); sharedV[3*blockDim.x] = ENERGY_F(0.0); sharedV[4*blockDim.x] = ENERGY_F(0.0); sharedV[5*blockDim.x] = ENERGY_F(0.0); } int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; X_FLOAT xtmp,ytmp,ztmp; X_FLOAT4 myxtype; F_FLOAT fxtmp,fytmp,fztmp,fpair; F_FLOAT delx,dely,delz; int itype,i; int jnum=0; int* jlist; if(ii < _inum) { i = _ilist[ii]; myxtype=fetchXType(i); xtmp=myxtype.x; ytmp=myxtype.y; ztmp=myxtype.z; itype=static_cast (myxtype.w); fxtmp = F_F(0.0); fytmp = F_F(0.0); fztmp = F_F(0.0); jnum = _numneigh[i]; jlist = &_neighbors[i]; if(i<_nlocal) _rho[i]=F_F(0.0); } if(ii (myxtype.w); const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; if (rsq < _cutsq_global) { F_FLOAT r = _SQRT_(rsq); F_FLOAT p = r*_rdr + F_F(1.0); int m = static_cast (p); m = MIN(m,_nr-1); p -= m; p = MIN(p,F_F(1.0)); int k=(static_cast (_type2rhor[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2; F_FLOAT4 c=fetchRhor(k); F_FLOAT rhoip = (c.x*p + c.y)*p + c.z; k=(static_cast (_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2; c=fetchRhor(k); F_FLOAT rhojp = (c.x*p + c.y)*p + c.z; k=(static_cast (_type2z2r[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2; c=fetchZ2r(k); F_FLOAT z2p = (c.x*p + c.y)*p + c.z; c=fetchZ2r(k+1); F_FLOAT z2 = ((c.w*p + c.x)*p + c.y)*p+c.z; F_FLOAT recip = F_F(1.0)/r; F_FLOAT phi = z2*recip; F_FLOAT phip = z2p*recip - phi*recip; F_FLOAT psip = _fp[i]*rhojp + _fp[j]*rhoip + phip; fpair = -psip*recip; F_FLOAT dxfp,dyfp,dzfp; fxtmp += dxfp = delx*fpair; fytmp += dyfp = dely*fpair; fztmp += dzfp = delz*fpair; evdwl+=phi; if(vflag||vflag_atom) { sharedV[0 * blockDim.x]+= delx*dxfp; sharedV[1 * blockDim.x]+= dely*dyfp; sharedV[2 * blockDim.x]+= delz*dzfp; sharedV[3 * blockDim.x]+= delx*dyfp; sharedV[4 * blockDim.x]+= delx*dzfp; sharedV[5 * blockDim.x]+= dely*dzfp; } } } } __syncthreads(); if(ii < _inum) { F_FLOAT* my_f; if(_collect_forces_later) { ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; if(eflag) { buffer=&buffer[1 * gridDim.x * gridDim.y]; } if(vflag) { buffer=&buffer[6 * gridDim.x * gridDim.y]; } my_f = (F_FLOAT*) buffer; my_f += i; *my_f = fxtmp; my_f += _nmax; *my_f = fytmp; my_f += _nmax; *my_f = fztmp; } else { my_f = _f + i; *my_f += fxtmp; my_f += _nmax; *my_f += fytmp; my_f += _nmax; *my_f += fztmp; } } __syncthreads(); if(eflag) { sharedE[0] = evdwl; } if(eflag_atom && i<_nlocal) { _eatom[i] += evdwl; } if(vflag_atom && i<_nlocal) { _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; _vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; } if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,0); } __global__ void PairEAMCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,F_FLOAT* buffer) { int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; int* list=sendlist+iswap*maxlistlength; if(i