/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "lj_gpu_memory.h"
#include "pair_gpu_cell.h"

static __constant__ float d_boxlo[3];
static __constant__ float d_boxhi[3];
static __constant__ float d_cell_size[1];
static __constant__ float d_skin[1];

void init_cell_list_const(double cell_size, double skin,
                          double *boxlo, double *boxhi)
{
  float cell_size1 = cell_size;
  float skin1 = skin;
  float boxlo1[3], boxhi1[3];
  for (int i = 0; i < 3; i++) {
    boxlo1[i] = boxlo[i];
    boxhi1[i] = boxhi[i];
  }

  cudaMemcpyToSymbol(d_cell_size, &cell_size1, sizeof(float), 0,
                     cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_boxlo, boxlo1, 3*sizeof(float), 0,
                     cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_boxhi, boxhi1, 3*sizeof(float), 0,
                     cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_skin, &skin1, sizeof(float), 0,
                     cudaMemcpyHostToDevice);
}

// mark every slot of the cell list as empty
__global__ void kernel_set_cell_list(unsigned int *cell_idx)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  cell_idx[gid] = BIG_NUMBER;
}

// build the cell list
__global__ void kernel_build_cell_list(float3 *cell_list,
                                       unsigned int *cell_idx,
                                       int *cell_type,
                                       int *cell_atom,
                                       float3 *pos,
                                       int *type,
                                       const int inum,
                                       const int nall,
                                       const int cell_size)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  float cSize = d_cell_size[0];
  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);

  if (gid < nall) {
    float3 p = pos[gid];

    // clamp ghost atoms into the padded box (one cell layer on each side)
    p.x = fmaxf(p.x, d_boxlo[0]-cSize);
    p.x = fminf(p.x, d_boxhi[0]+cSize);
    p.y = fmaxf(p.y, d_boxlo[1]-cSize);
    p.y = fminf(p.y, d_boxhi[1]+cSize);
    p.z = fmaxf(p.z, d_boxlo[2]-cSize);
    p.z = fminf(p.z, d_boxhi[2]+cSize);

    // coordinates are measured from boxlo and offset by one ghost-cell
    // layer, matching the cell bounds used in kernel_test_rebuild below
    int cell_id = (int)((p.x - d_boxlo[0])/cSize + 1.0f)
      + (int)((p.y - d_boxlo[1])/cSize + 1.0f) * ncellx
      + (int)((p.z - d_boxlo[2])/cSize + 1.0f) * ncellx * ncelly;

    // claim the next free slot in this cell
    int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
    int pid = cell_id*cell_size + atom_pos;

    cell_list[pid] = pos[gid];
    cell_type[pid] = type[gid];
    cell_idx [pid] = gid;
  }
}

// set *rebuild if any atom has drifted outside its cell plus half the skin
__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom,
                                    int *rebuild)
{
  float cSize = d_cell_size[0];
  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);

  // calculate 3D block idx from 2D block
  int bx = blockIdx.x;
  int by = blockIdx.y % ncelly;
  int bz = blockIdx.y / ncelly;
  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx, ncelly));
  int pbase = INT_MUL(cid, blockDim.x); // atom position id in cell list

  // bounds of this cell, expanded by half the neighbor skin
  float skin = d_skin[0];
  float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5f*skin;
  float hix  = lowx + cSize + skin;
  float lowy = d_boxlo[1] + (by-1)*cSize - 0.5f*skin;
  float hiy  = lowy + cSize + skin;
  float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5f*skin;
  float hiz  = lowz + cSize + skin;

  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
    int pid = pbase + i;
    float3 p = cell_list[pid];
    p.x = fmaxf(p.x, d_boxlo[0]-cSize);
    p.x = fminf(p.x, d_boxhi[0]+cSize);
    p.y = fmaxf(p.y, d_boxlo[1]-cSize);
    p.y = fminf(p.y, d_boxhi[1]+cSize);
    p.z = fmaxf(p.z, d_boxlo[2]-cSize);
    p.z = fminf(p.z, d_boxhi[2]+cSize);

    if (p.x < lowx || p.x > hix ||
        p.y < lowy || p.y > hiy ||
        p.z < lowz || p.z > hiz) {
      *rebuild = 1;
    }
  }
}
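
/* ----------------------------------------------------------------------
   For reference only (an addition, not part of the original library):
   a host-side sketch of the cell mapping used by kernel_build_cell_list
   above.  The domain is padded with one layer of ghost cells per side,
   so a clamped coordinate maps to cell floor((x - boxlo)/cell_size) + 1
   in each dimension, flattened row-major.  The name host_cell_id is
   illustrative, not an API of this file.
------------------------------------------------------------------------- */
static inline int host_cell_id(float x, float y, float z,
                               const float boxlo[3], const float boxhi[3],
                               float cell_size)
{
  int ncellx = (int) ceil(((boxhi[0]-boxlo[0]) + 2.0f*cell_size)/cell_size);
  int ncelly = (int) ceil(((boxhi[1]-boxlo[1]) + 2.0f*cell_size)/cell_size);
  int ix = (int)((x - boxlo[0])/cell_size + 1.0f);  // +1 skips ghost layer
  int iy = (int)((y - boxlo[1])/cell_size + 1.0f);
  int iz = (int)((z - boxlo[2])/cell_size + 1.0f);
  return ix + iy*ncellx + iz*ncellx*ncelly;         // row-major 3D -> 1D
}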

// set *overflow if any cell holds more atoms than it has slots
__global__ void kernel_test_overflow(int *cell_atom, int *overflow,
                                     const int ncell)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  if (gid < ncell) {
    if (cell_atom[gid] > blockDim.x)
      *overflow = 1;
  }
}

// refresh the stored positions in the cell list without rebuilding it
__global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx,
                                 int *cell_atom, float3 *pos)
{
  float cSize = d_cell_size[0];
  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);

  // calculate 3D block idx from 2D block
  int bx = blockIdx.x;
  int by = blockIdx.y % ncelly;
  int bz = blockIdx.y / ncelly;
  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx, ncelly));
  int pbase = INT_MUL(cid, blockDim.x); // atom position id in cell list

  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
    int pid = pbase + i;
    cell_list[pid] = pos[cell_idx[pid]];
  }
}

__global__ void radixSortBlocks(unsigned int *keys, float3 *values1,
                                int *values2, unsigned int nbits,
                                unsigned int startbit);

#ifdef __DEVICE_EMULATION__
#define __SYNC __syncthreads();
#else
#define __SYNC
#endif

#define WARP_SIZE 32

// Kogge-Stone scan within a warp; maxlevel bounds the number of
// doubling steps (maxlevel 4 covers a full 32-thread warp)
template <class T, int maxlevel>
__device__ T scanwarp(T val, T *sData)
{
  // The following is the same as 2 * WARP_SIZE * warpId + threadInWarp =
  // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE - 1))
  int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
  sData[idx] = 0;
  idx += WARP_SIZE;
  sData[idx] = val;
  __SYNC

#ifdef __DEVICE_EMULATION__
  T t = sData[idx -  1]; __SYNC
  sData[idx] += t;       __SYNC
  t = sData[idx -  2];   __SYNC
  sData[idx] += t;       __SYNC
  t = sData[idx -  4];   __SYNC
  sData[idx] += t;       __SYNC
  t = sData[idx -  8];   __SYNC
  sData[idx] += t;       __SYNC
  t = sData[idx - 16];   __SYNC
  sData[idx] += t;       __SYNC
#else
  if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } __SYNC
  if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } __SYNC
  if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } __SYNC
  if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } __SYNC
  if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } __SYNC
#endif

  return sData[idx] - val;  // convert inclusive -> exclusive
}

// exclusive prefix sum across the whole thread block
__device__ unsigned int scan(unsigned int idata)
{
  extern __shared__ unsigned int ptr[];

  unsigned int idx = threadIdx.x;
  unsigned int val = idata;

  // scan within each warp
  val = scanwarp<unsigned int, 4>(val, ptr);
  __syncthreads();

  // the last thread of each warp publishes the warp total
  if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) {
    ptr[idx >> 5] = val + idata;
  }
  __syncthreads();

  // scan the per-warp totals with the first warp; maxlevel 2 covers the
  // at most 8 warps of a 256-thread block
#ifndef __DEVICE_EMULATION__
  if (idx < WARP_SIZE)
#endif
  {
    ptr[idx] = scanwarp<unsigned int, 2>(ptr[idx], ptr);
  }
  __syncthreads();

  // add each warp's offset back in
  val += ptr[idx >> 5];
  return val;
}
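
/* ----------------------------------------------------------------------
   For reference only (an addition, not in the original source): a serial
   host equivalent of rank() below.  A 1-bit "split" packs all elements
   whose predicate is true to the front while preserving relative order,
   which is what makes the per-block radix sort stable.  rank_serial is
   an illustrative name.
------------------------------------------------------------------------- */
static void rank_serial(const unsigned int *preds, unsigned int *ranks, int n)
{
  // exclusive prefix sum of the predicates gives each "true" its slot
  unsigned int numtrue = 0;
  for (int i = 0; i < n; i++) {
    if (preds[i]) ranks[i] = numtrue++;
  }
  // "false" elements fill the remaining slots in their original order
  for (int i = 0, addr = 0; i < n; i++) {
    if (preds[i]) addr++;                      // trues seen so far
    else ranks[i] = numtrue + i - addr;        // same formula as rank()
  }
}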

// compute the destination slot of each element for the current 1-bit split
__device__ unsigned int rank(unsigned int preds)
{
  unsigned int address = scan(preds);

  __shared__ unsigned int numtrue;
  if (threadIdx.x == blockDim.x - 1) {
    numtrue = address + preds;
  }
  __syncthreads();

  unsigned int rank;
  unsigned int idx = threadIdx.x;
  rank = (preds) ? address : numtrue + idx - address;
  return rank;
}

// sort one block's keys (and payloads) by nbits bits starting at startbit,
// one element per thread, via successive stable 1-bit splits
template <int blockSize>
__device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2,
                               unsigned int nbits, unsigned int startbit)
{
  extern __shared__ unsigned int sMem1[];
  __shared__ float sMem2[blockSize];
  __shared__ int   sMem3[blockSize];

  int tid = threadIdx.x;

  for (unsigned int shift = startbit; shift < (startbit + nbits); ++shift) {
    unsigned int lsb;
    lsb = !(((*key) >> shift) & 0x1);

    unsigned int r;
    r = rank(lsb);

    // scatter the key to its new slot, then read back in thread order
    sMem1[r] = *key;
    __syncthreads();
    *key = sMem1[tid];
    __syncthreads();

    // move the payloads (position and type) the same way, one field at a time
    sMem2[r] = (*value1).x;   __syncthreads();
    (*value1).x = sMem2[tid]; __syncthreads();

    sMem2[r] = (*value1).y;   __syncthreads();
    (*value1).y = sMem2[tid]; __syncthreads();

    sMem2[r] = (*value1).z;   __syncthreads();
    (*value1).z = sMem2[tid]; __syncthreads();

    sMem3[r] = *value2;       __syncthreads();
    *value2 = sMem3[tid];     __syncthreads();
  }
}

__global__ void radixSortBlocks(unsigned int *keys, float3 *values1,
                                int *values2, unsigned int nbits,
                                unsigned int startbit)
{
  int gid = threadIdx.x + blockIdx.x * blockDim.x;

  unsigned int key;
  float3 value1;
  int value2;
  key    = keys[gid];
  value1 = values1[gid];
  value2 = values2[gid];
  __syncthreads();

  if (blockDim.x == 64)
    radixSortBlock<64>(&key, &value1, &value2, nbits, startbit);
  else if (blockDim.x == 128)
    radixSortBlock<128>(&key, &value1, &value2, nbits, startbit);
  else if (blockDim.x == 256)
    radixSortBlock<256>(&key, &value1, &value2, nbits, startbit);

  keys[gid]    = key;
  values1[gid] = value1;
  values2[gid] = value2;
}

// sort the keys (and payloads) within each block of cell_size elements;
// here cell_size is the per-cell slot count, not the geometric cell size
void sortBlocks(unsigned int *keys, float3 *values1, int *values2,
                const int size, int cell_size)
{
  int i = 0;
  const unsigned int bitSize = sizeof(unsigned int)*8;
  const unsigned int bitStep = 4;
  const int gSize = size/cell_size;

  while (bitSize > i*bitStep) {
    // dynamic shared memory: scanwarp() indexes up to 2*blockDim.x entries
    radixSortBlocks<<<gSize, cell_size, 2*cell_size*sizeof(unsigned int)>>>
      (keys, values1, values2, bitStep, i*bitStep);
    i++;
  }
}

static float3 *d_pos, *pos_temp;
static int *d_type;
static int *d_overflow, *d_rebuild;

void init_cell_list(cell_list &cell_list_gpu, const int nall,
                    const int ncell, const int buffer)
{
  cudaMalloc((void**)&(cell_list_gpu.pos),   ncell*buffer*sizeof(float3));
  cudaMalloc((void**)&(cell_list_gpu.idx),   ncell*buffer*sizeof(unsigned int));
  cudaMalloc((void**)&(cell_list_gpu.type),  ncell*buffer*sizeof(int));
  cudaMalloc((void**)&(cell_list_gpu.natom), ncell*sizeof(int));

  cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
  cudaMalloc((void**)&d_pos,  nall*sizeof(float3));
  cudaMalloc((void**)&d_type, nall*sizeof(int));
  cudaMalloc((void**)&d_overflow, sizeof(int));
  cudaMalloc((void**)&d_rebuild,  sizeof(int));

  cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));
  cudaMemset(cell_list_gpu.pos, 0, ncell*buffer*sizeof(float3));
}

void clear_cell_list(cell_list &cell_list_gpu)
{
  cudaFree(cell_list_gpu.pos);
  cudaFree(cell_list_gpu.idx);
  cudaFree(cell_list_gpu.natom);
  cudaFree(cell_list_gpu.type);

  cudaFreeHost(pos_temp);
  cudaFree(d_pos);
  cudaFree(d_type);
  cudaFree(d_overflow);
  cudaFree(d_rebuild);
}
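
/* ----------------------------------------------------------------------
   Illustrative call sequence (an addition, not part of the original
   file): how a driver such as lj_gpu is expected to use this API.  The
   variables are placeholders owned by the caller; in this library the
   per-cell buffer matches BLOCK_1D, the slot count that
   kernel_test_overflow checks against.
------------------------------------------------------------------------- */
#if 0   // sketch only; depends on caller-owned data not defined here
  cell_list cl;
  init_cell_list_const(cell_size, skin, boxlo, boxhi);    // once per setup
  init_cell_list(cl, nall, ncell, buffer);                // allocate once
  for (int step = 0; step < nsteps; step++) {
    // rebuilds internally only when atoms leave their (skin-padded) cells
    build_cell_list(atom_pos, atom_type, cl, ncell, ncellx, ncelly, ncellz,
                    buffer, inum, nall, ago);
    // ... force kernels consume cl.pos / cl.type / cl.natom ...
  }
  clear_cell_list(cl);                                    // free at teardown
#endif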

void build_cell_list(double *atom_pos, int *atom_type,
                     cell_list &cell_list_gpu,
                     const int ncell, const int ncellx, const int ncelly,
                     const int ncellz, const int buffer,
                     const int inum, const int nall, const int ago)
{
  cudaError_t err;

  cudaMemset(d_overflow, 0, sizeof(int));
  cudaMemset(d_rebuild, 0, sizeof(int));

  // pack positions as float3 and copy position and type to the GPU
  for (int i = 0; i < 3*nall; i += 3) {
    pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
  }
  cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
  cudaMemcpy(d_type, atom_type, nall*sizeof(int), cudaMemcpyHostToDevice);

  static int first_build = 1;
  int rebuild = 0;

  // copy the last built cell-list and test whether it needs to be rebuilt
  if (!first_build) {
    dim3 grid(ncellx, ncelly*ncellz);
    // blockDim.x must equal the per-cell buffer size (BLOCK_1D)
    kernel_copy_list<<<grid, BLOCK_1D>>>(cell_list_gpu.pos,
                                         cell_list_gpu.idx,
                                         cell_list_gpu.natom, d_pos);
    cudaMemset(d_rebuild, 0, sizeof(int));
    kernel_test_rebuild<<<grid, BLOCK_1D>>>(cell_list_gpu.pos,
                                            cell_list_gpu.natom, d_rebuild);
    cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);
    err = cudaGetLastError();
    assert(err == cudaSuccess);
  }

  // a neighbor-list rebuild on the host (ago == 0) forces a rebuild here too
  if (ago == 0) rebuild = 1;

  // build the cell list for the first time, or rebuild it
  if (first_build || rebuild) {
    first_build = 0;
    cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));

    // initialize d_cell_idx for the sorting routine (one thread per slot)
    kernel_set_cell_list<<<ncell, buffer>>>(cell_list_gpu.idx);

    // build cell list
    dim3 blockDim(128);
    dim3 gridDim(static_cast<int>(ceil(static_cast<float>(nall)/blockDim.x)));
    kernel_build_cell_list<<<gridDim, blockDim>>>(cell_list_gpu.pos,
                                                  cell_list_gpu.idx,
                                                  cell_list_gpu.type,
                                                  cell_list_gpu.natom,
                                                  d_pos, d_type,
                                                  inum, nall, buffer);
    err = cudaGetLastError();
    assert(err == cudaSuccess);

    // check for cell-list overflow
    int overflow = 0;
    int gDimCell = static_cast<int>(ceil(static_cast<float>(ncell)/buffer));
    kernel_test_overflow<<<gDimCell, buffer>>>(cell_list_gpu.natom,
                                               d_overflow, ncell);
    cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);

    if (overflow > 0) {
      printf("\n BLOCK_1D too small for cell list, please increase it!");
      printf("\n BLOCK_1D = %d", BLOCK_1D);
      printf("\n ncell    = %d", ncell);
      printf("\n gDimCell = %d", gDimCell);
      printf("\n overflow = %d \n", overflow);
      exit(1);
    }

    // sort atoms in every cell by atom index so the force summation order
    // is deterministic (floating-point addition is not associative)
    sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos, cell_list_gpu.type,
               ncell*buffer, buffer);
    cudaThreadSynchronize();
    err = cudaGetLastError();
    assert(err == cudaSuccess);
  }
}
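
/* ----------------------------------------------------------------------
   Optional debugging aid (an addition, not original code): verify on the
   host that sortBlocks left each cell's atom indices ascending, the
   property that makes force accumulation deterministic.  The caller
   passes a host copy of cell_list_gpu.idx; check_cell_sorted is an
   illustrative name.
------------------------------------------------------------------------- */
static int check_cell_sorted(const unsigned int *idx_host,
                             int ncell, int buffer)
{
  for (int c = 0; c < ncell; c++) {
    for (int i = 1; i < buffer; i++) {
      unsigned int a = idx_host[c*buffer + i - 1];
      unsigned int b = idx_host[c*buffer + i];
      // empty slots hold BIG_NUMBER, which sorts to the end of the cell
      if (b < a) return 0;
    }
  }
  return 1;  // every cell is sorted by atom index
}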