/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov See the README file in the top-level LAMMPS directory. ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de Theoretical Physics II, University of Technology Ilmenau, Germany See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, int vflag_atom, int** firstneight, int* binned_id , F_FLOAT kn, F_FLOAT gamman, F_FLOAT gammat, F_FLOAT xmu) { ENERGY_FLOAT evdwl = ENERGY_F(0.0); ENERGY_FLOAT* sharedE; ENERGY_FLOAT* sharedV; if(eflag || eflag_atom) { sharedE = &sharedmem[threadIdx.x]; sharedV = &sharedmem[0]; sharedE[0] = ENERGY_F(0.0); sharedV += blockDim.x; } if(vflag || vflag_atom) { sharedV += threadIdx.x; sharedV[0 * blockDim.x] = ENERGY_F(0.0); sharedV[1 * blockDim.x] = ENERGY_F(0.0); sharedV[2 * blockDim.x] = ENERGY_F(0.0); sharedV[3 * blockDim.x] = ENERGY_F(0.0); sharedV[4 * blockDim.x] = ENERGY_F(0.0); sharedV[5 * blockDim.x] = ENERGY_F(0.0); } int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; MYEMUDBG(if(ii == 0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n");) X_FLOAT xtmp, ytmp, ztmp; X_FLOAT4 myxtype; V_FLOAT4 myvradius, ovradius; F_FLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp; F_FLOAT delx, dely, delz; F_FLOAT radi, radj, radsum, r, rsqinv; F_FLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3; F_FLOAT wr1, wr2, wr3; F_FLOAT vtr1, vtr2, vtr3, vrel; F_FLOAT meff, damp, ccel, tor1, tor2, tor3; F_FLOAT fn, fs, ft, fs1, fs2, fs3; int jnum = 0; int i, j; int* jlist; if(ii < _inum) { i = _ilist[ii]; myxtype = fetchXType(i); myvradius = fetchVRadius(i); xtmp = myxtype.x; ytmp = myxtype.y; ztmp = myxtype.z; radi = myvradius.w; fxtmp = F_F(0.0); fytmp = F_F(0.0); fztmp = F_F(0.0); torquextmp = F_F(0.0); torqueytmp = F_F(0.0); torqueztmp = F_F(0.0); jnum = _numneigh[i]; jlist = &_neighbors[i]; } __syncthreads(); for(int jj = 0; jj < jnum; jj++) { if(ii < _inum) if(jj < jnum) { j = jlist[jj * _nlocal]; myxtype = fetchXType(j); ovradius = fetchVRadius(j); delx = xtmp - myxtype.x; dely = ytmp - myxtype.y; delz = ztmp - myxtype.z; radj = ovradius.w; radsum = radi + radj; const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq < radsum * radsum) { const F_FLOAT rinv = _RSQRT_(rsq); r = F_F(1.0) / rinv; rsqinv = F_F(1.0) / rsq; // relative translational velocity vr1 = myvradius.x - ovradius.x; vr2 = myvradius.y - ovradius.y; vr3 = myvradius.z - ovradius.z; // normal component vnnr = vr1 * delx + vr2 * dely + vr3 * delz; vn1 = delx * vnnr * rsqinv; vn2 = dely * vnnr * rsqinv; vn3 = delz * vnnr * rsqinv; // tangential component vt1 = vr1 - vn1; vt2 = vr2 - vn2; vt3 = vr3 - vn3; // relative rotational velocity V_FLOAT4 omegarmass_i = fetchOmegaRmass(i); V_FLOAT4 omegarmass_j = fetchOmegaRmass(j); wr1 = (radi * omegarmass_i.x + radj * omegarmass_j.x) * rinv; wr2 = (radi * omegarmass_i.y + radj * omegarmass_j.y) * rinv; wr3 = (radi * omegarmass_i.z + radj * omegarmass_j.z) * rinv; meff = omegarmass_i.w * omegarmass_j.w / (omegarmass_i.w + omegarmass_j.w); if(_mask[i] & _freeze_group_bit) meff = omegarmass_j.w; if(_mask[j] & _freeze_group_bit) meff = omegarmass_i.w; damp = meff * gamman * vnnr * rsqinv; ccel = kn * (radsum - r) * rinv - damp; vtr1 = vt1 - (delz * wr2 - dely * wr3); vtr2 = vt2 - (delx * wr3 - delz * wr1); vtr3 = vt3 - (dely * wr1 - delx * wr2); vrel = vtr1 * vtr1 + vtr2 * vtr2 + vtr3 * vtr3; vrel = _SQRT_(vrel); fn = xmu * fabs(ccel * r); fs = meff * gammat * vrel; ft = (vrel != F_F(0.0)) ? MIN(fn, fs) / vrel : F_F(0.0); fs1 = -ft * vtr1; fs2 = -ft * vtr2; fs3 = -ft * vtr3; F_FLOAT dxfp, dyfp, dzfp; fxtmp += dxfp = delx * ccel + fs1; fytmp += dyfp = dely * ccel + fs2; fztmp += dzfp = delz * ccel + fs3; tor1 = rinv * (dely * fs3 - delz * fs2); tor2 = rinv * (delz * fs1 - delx * fs3); tor3 = rinv * (delx * fs2 - dely * fs1); torquextmp -= radi * tor1; torqueytmp -= radi * tor2; torqueztmp -= radi * tor3; if(vflag) { sharedV[0 * blockDim.x] += delx * dxfp; sharedV[1 * blockDim.x] += dely * dyfp; sharedV[2 * blockDim.x] += delz * dzfp; sharedV[3 * blockDim.x] += delx * dyfp; sharedV[4 * blockDim.x] += delx * dzfp; sharedV[5 * blockDim.x] += dely * dzfp; } } } } __syncthreads(); if(ii < _inum) { F_FLOAT* my_f = _f + i; *my_f += fxtmp; my_f += _nmax; *my_f += fytmp; my_f += _nmax; *my_f += fztmp; F_FLOAT* my_torque = _torque + i; *my_torque += torquextmp; my_torque += _nmax; *my_torque += torqueytmp; my_torque += _nmax; *my_torque += torqueztmp; } __syncthreads(); if(eflag) sharedE[0] = evdwl; if(eflag_atom && i < _nlocal) _eatom[i] += evdwl; if(vflag_atom && i < _nlocal) { _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; _vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; } if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0); }