/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL

/* ----------------------------------------------------------------------
   Copy the first `count` atoms stored at cell_list/cell_type[cell_base ...]
   into the block's shared tile.

   posSh is laid out as three consecutive runs of blockSize floats
   (x | y | z); typeSh holds the matching atom types.  Each thread starts
   at threadIdx.x and advances by blockSize.  No barrier is issued here:
   the caller must __syncthreads() before any thread reads the tile.
------------------------------------------------------------------------- */
template<int blockSize>
__device__ __forceinline__ void lj_cell_load_tile(const float3 *cell_list,
                                                  const int *cell_type,
                                                  const int cell_base,
                                                  const int count,
                                                  float *posSh, int *typeSh)
{
  for (int j = threadIdx.x; j < count; j += blockSize) {
    const int pid = cell_base + j;
    const float3 pos = cell_list[pid];
    posSh[j              ] = pos.x;
    posSh[j +   blockSize] = pos.y;
    posSh[j + 2*blockSize] = pos.z;
    typeSh[j] = cell_type[pid];
  }
}

/* ----------------------------------------------------------------------
   Accumulate one pair interaction into the per-thread totals.

   (delx,dely,delz) is the i-j separation and mtype the index of the type
   pair in the shared coefficient tables.  Nothing is added when the
   squared distance is not below cutsqSh[mtype].  lj3Sh/lj4Sh/offsetSh are
   dereferenced only when eflag is set; v0/v1 are touched only when vflag
   is set.
------------------------------------------------------------------------- */
template<bool eflag, bool vflag>
__device__ __forceinline__ void lj_cell_pair(const float delx,
                                             const float dely,
                                             const float delz,
                                             const int mtype,
                                             const float *cutsqSh,
                                             const float *lj1Sh,
                                             const float *lj2Sh,
                                             const float *lj3Sh,
                                             const float *lj4Sh,
                                             const float *offsetSh,
                                             float3 &f, float &ener,
                                             float3 &v0, float3 &v1)
{
  const float rsq = delx*delx + dely*dely + delz*delz;
  if (rsq < cutsqSh[mtype]) {
    const float r2inv = 1.0f/rsq;
    const float r6inv = r2inv * r2inv * r2inv;
    const float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);

    f.x += delx * force;
    f.y += dely * force;
    f.z += delz * force;

    if (eflag) {
      const float e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
      ener += (e - offsetSh[mtype]);
    }

    if (vflag) {
      // v0 collects the diagonal terms (xx, yy, zz),
      // v1 the off-diagonal terms (xy, xz, yz)
      v0.x += delx*delx*force;
      v0.y += dely*dely*force;
      v0.z += delz*delz*force;
      v1.x += delx*dely*force;
      v1.y += delx*delz*force;
      v1.z += dely*delz*force;
    }
  }
}

/* ----------------------------------------------------------------------
   Cell list version of LJ kernel

   Template parameters
     eflag      accumulate and store per-atom energy
     vflag      accumulate and store per-atom virial
     blockSize  compile-time tile size; sizes the shared position/type
                tile and is the stride of every cooperative loop
   NOTE(review): the template parameter list was missing from the text
   this was restored from; the order <eflag, vflag, blockSize> should be
   confirmed against the launch sites.

   Launch layout
     One thread block per cell.  blockIdx.x is the cell x index;
     blockIdx.y encodes y and z as (y = blockIdx.y % ncelly,
     z = blockIdx.y / ncelly).  Only threadIdx.x is used.  The strided
     loops advance by blockSize, so the launch presumably uses
     blockDim.x == blockSize -- confirm at the launch site.

   Shared memory
     Static: the position/type tile plus cutsq, lj1 and lj2 tables of
     MAX_SHARED_TYPES*MAX_SHARED_TYPES floats each.
     Dynamic: when eflag or vflag is set the kernel additionally indexes
     3*MAX_SHARED_TYPES*MAX_SHARED_TYPES floats of dynamic shared memory
     (lj3, lj4, offset), which the launch must provide.

   Arguments
     force3     out: force on atom answer_pos
     energy     out: energy of atom answer_pos (written only if eflag)
     virial     out: two float3 per atom, at [2*answer_pos] (xx,yy,zz)
                and [2*answer_pos+1] (xy,xz,yz) (written only if vflag)
     cell_list  positions; cell c occupies slots starting at c*blockSize
     cell_idx   output slot (answer_pos) for each cell slot; slots whose
                entry is >= inum are skipped
     cell_type  atom type for each cell slot
     cell_atom  number of atoms stored in each cell
     inum       upper bound (exclusive) on answer_pos values processed
     nall,ncell not referenced by the kernel body
     ncellx/y/z cell grid dimensions
------------------------------------------------------------------------- */
template<bool eflag, bool vflag, int blockSize>
__global__ void kernel_lj_cell(float3 *force3,
                               float *energy, float3 *virial,
                               float3 *cell_list, unsigned int *cell_idx,
                               int *cell_type, int *cell_atom,
                               const int inum, const int nall, const int ncell,
                               const int ncellx, const int ncelly,
                               const int ncellz)
{
  // calculate 3D block idx from 2d block
  const int bx = blockIdx.x;
  const int by = blockIdx.y % ncelly;
  const int bz = blockIdx.y / ncelly;

  const int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  const int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));

  __shared__ int   typeSh[blockSize];
  __shared__ float posSh[blockSize*3];
  __shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];

  extern __shared__ float smem[];

  // Per-thread views into the dynamic shared buffer.  They stay null, and
  // are never dereferenced, when neither energy nor virial is requested.
  float *lj3Sh = 0;
  float *lj4Sh = 0;
  float *offsetSh = 0;

  // load force parameters into shared memory
  for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
    const int itype = i/MAX_SHARED_TYPES;
    const int jtype = i%MAX_SHARED_TYPES;
    cutsqSh[i] = _cutsq_(itype,jtype);
    lj1Sh[i]   = _lj1_(itype,jtype).x;
    lj2Sh[i]   = _lj1_(itype,jtype).y;
  }

  // Only use the dynamic shared memory when needed,
  // this reduces the shared memory limitation on occupancy
  if (eflag || vflag) {
    lj3Sh    = smem;
    lj4Sh    = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
    offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
    for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
      const int itype = i/MAX_SHARED_TYPES;
      const int jtype = i%MAX_SHARED_TYPES;
      // energy coefficients are copied unmodified
      lj3Sh[i]    = _lj3_(itype,jtype).x;
      lj4Sh[i]    = _lj3_(itype,jtype).y;
      offsetSh[i] = _offset_(itype,jtype);
    }
  }

  __syncthreads();

  // neighbor cell range, clamped to the grid (no periodic wrap here)
  const int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1),
            nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1),
            nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1);

  // cell_atom is only read by this kernel, so the count is loaded once
  const int natom  = cell_atom[cid];
  const int npass  = (natom + blockSize - 1)/blockSize;
  const int cell_base = cid*blockSize;

  for (int ii = 0; ii < npass; ii++) {
    float3 f = {0.0f, 0.0f, 0.0f};
    float ener = 0.0f;
    float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
    int itype = 0;
    float ix = 0.0f, iy = 0.0f, iz = 0.0f;

    const int i = tid + ii*blockSize;
    const unsigned int answer_pos = cell_idx[cell_base + i];
    const bool active = (answer_pos < inum);

    // load current cell atom position and type into sMem
    lj_cell_load_tile<blockSize>(cell_list, cell_type, cell_base, natom,
                                 posSh, typeSh);
    __syncthreads();

    if (active) {
      itype = typeSh[i];
      ix = posSh[i              ];
      iy = posSh[i +   blockSize];
      iz = posSh[i + 2*blockSize];

      // compute force from current cell
      for (int j = 0; j < natom; j++) {
        if (j == i) continue;   // no self interaction
        const float delx = ix - posSh[j              ];
        const float dely = iy - posSh[j +   blockSize];
        const float delz = iz - posSh[j + 2*blockSize];
        const int mtype = itype + typeSh[j]*MAX_SHARED_TYPES;
        lj_cell_pair<eflag,vflag>(delx, dely, delz, mtype,
                                  cutsqSh, lj1Sh, lj2Sh,
                                  lj3Sh, lj4Sh, offsetSh,
                                  f, ener, v0, v1);
      }
    }
    // every thread must be done with the tile before it is overwritten
    __syncthreads();

    // compute force from neighboring cells
    for (int nborz = nborz0; nborz <= nborz1; nborz++) {
      for (int nbory = nbory0; nbory <= nbory1; nbory++) {
        for (int nborx = nborx0; nborx <= nborx1; nborx++) {
          if (nborz == bz && nbory == by && nborx == bx) continue;

          // compute cell id
          const int cid_nbor = nborx + INT_MUL(nbory,ncellx) +
            INT_MUL(nborz,INT_MUL(ncellx,ncelly));
          const int natom_nbor = cell_atom[cid_nbor];

          // load neighbor cell position and type into smem
          lj_cell_load_tile<blockSize>(cell_list, cell_type,
                                       INT_MUL(cid_nbor,blockSize),
                                       natom_nbor, posSh, typeSh);
          __syncthreads();

          // compute force
          if (active) {
            for (int j = 0; j < natom_nbor; j++) {
              const float delx = ix - posSh[j              ];
              const float dely = iy - posSh[j +   blockSize];
              const float delz = iz - posSh[j + 2*blockSize];
              const int mtype = itype + typeSh[j]*MAX_SHARED_TYPES;
              lj_cell_pair<eflag,vflag>(delx, dely, delz, mtype,
                                        cutsqSh, lj1Sh, lj2Sh,
                                        lj3Sh, lj4Sh, offsetSh,
                                        f, ener, v0, v1);
            }
          }
          // barrier sits outside the divergent branch so all threads reach it
          __syncthreads();
        }
      }
    }

    if (active) {
      force3[answer_pos] = f;
      if (eflag)
        energy[answer_pos] = ener;
      if (vflag) {
        virial[2*answer_pos]   = v0;
        virial[2*answer_pos+1] = v1;
      }
    }
  }
}

#endif