// **************************************************************************
//                               neighbor_gpu.cu
//                             -------------------
//                            Nitin Dhamankar (Intel)
//                              Peng Wang (Nvidia)
//                           W. Michael Brown (ORNL)
//
//  Device code for handling GPU generated neighbor lists
//
// __________________________________________________________________________
//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
//    begin                :
//    email                : penwang@nvidia.com, brownw@ornl.gov
// ***************************************************************************

#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#ifdef LAMMPS_SMALLBIG
#define tagint int
#endif
#ifdef LAMMPS_BIGBIG
#ifdef USE_OPENCL
#define tagint long
#else
#include "stdint.h"
#define tagint int64_t
#endif
#endif
#ifdef LAMMPS_SMALLSMALL
#define tagint int
#endif

#ifndef _DOUBLE_DOUBLE
_texture( pos_tex,float4);
#else
_texture_2d( pos_tex,int4);
#endif

#ifdef NV_KERNEL
#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 2)
// Issue with incorrect results in CUDA >= 11.2
#define LAL_USE_OLD_NEIGHBOR
#endif
#endif

#ifdef USE_HIP
#define LAL_USE_OLD_NEIGHBOR
#endif

__kernel void calc_cell_id(const numtyp4 *restrict x_,
                           unsigned *restrict cell_id,
                           int *restrict particle_id,
                           numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
                           numtyp i_cell_size, int ncellx, int ncelly,
                           int ncellz, int inum, int nall,
                           int cells_in_cutoff) {
  int i = threadIdx.x + blockIdx.x*blockDim.x;

  if (i < nall) {
    numtyp4 p;
    fetch4(p,i,pos_tex); //x_[i];

    p.x -= boxlo0;
    p.y -= boxlo1;
    p.z -= boxlo2;

    int ix = int(p.x*i_cell_size+cells_in_cutoff);
    int iy = int(p.y*i_cell_size+cells_in_cutoff);
    int iz = int(p.z*i_cell_size+cells_in_cutoff);

    // local atoms are kept out of the ghost-cell border; ghosts may use it
    int offset_lo, offset_hi;
    if (i < inum) {
      offset_lo = cells_in_cutoff;
      offset_hi = cells_in_cutoff+1;
    } else {
      offset_lo = 0;
      offset_hi = 1;
    }

    // clamp the cell coordinates into the valid grid
    ix = max(ix,offset_lo);
    ix = min(ix,ncellx-offset_hi);
    iy = max(iy,offset_lo);
    iy = min(iy,ncelly-offset_hi);
    iz = max(iz,offset_lo);
    iz = min(iz,ncellz-offset_hi);

    cell_id[i] = ix+iy*ncellx+iz*ncellx*ncelly;
    particle_id[i] = i;
  }
}

__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
                                      int *restrict cell_counts,
                                      int nall, int ncell) {
  int idx = threadIdx.x + blockIdx.x*blockDim.x;
  if (idx < nall) {
    int id = cell_id[idx];

    // handle boundary cases
    if (idx == 0) {
      for (int i = 0; i < id+1; i++)
        cell_counts[i] = 0;
    }
    if (idx == nall-1) {
      for (int i = id+1; i <= ncell; i++)
        cell_counts[i] = nall;
    }

    // a change in the sorted cell ids marks the first atom of a new cell
    if (idx > 0 && idx < nall) {
      int id_l = cell_id[idx-1];
      if (id != id_l) {
        for (int i = id_l+1; i <= id; i++)
          cell_counts[i] = idx;
      }
    }
  }
}

#else
#define pos_tex x_
#ifdef LAMMPS_SMALLBIG
#define tagint int
#endif
#ifdef LAMMPS_BIGBIG
#define tagint long
#endif
#ifdef LAMMPS_SMALLSMALL
#define tagint int
#endif
#endif

__kernel void transpose(__global tagint *restrict out,
                        const __global tagint *restrict in,
                        int columns_in, int rows_in)
{
  __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];

  unsigned ti=THREAD_ID_X;
  unsigned tj=THREAD_ID_Y;
  unsigned bi=BLOCK_ID_X;
  unsigned bj=BLOCK_ID_Y;

  unsigned i=bi*BLOCK_CELL_2D+ti;
  unsigned j=bj*BLOCK_CELL_2D+tj;
  if ((i<columns_in) && (j<rows_in))
    block[tj][ti]=in[j*columns_in+i];

  __syncthreads();

  i=bj*BLOCK_CELL_2D+ti;
  j=bi*BLOCK_CELL_2D+tj;
  if ((i<rows_in) && (j<columns_in))
    out[j*rows_in+i] = block[ti][tj];
}
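
// calc_cell_id and kernel_calc_cell_counts above are the device half of the
// binning pipeline: cell ids are computed per atom, atom indices are sorted
// by cell id so that each cell's atoms become contiguous, and the boundary
// scan fills cell_counts with the prefix offsets the build kernels below
// consume. A minimal host-side sketch of that sequence follows; it is
// illustrative only and not part of the build. The wrapper name, block size,
// and Thrust-based sort are assumptions; the real driver lives in the
// host-side neighbor code.
#if 0
#include <thrust/device_ptr.h>
#include <thrust/sort.h>

static void build_cell_bins(const numtyp4 *x, unsigned *cell_id,
                            int *particle_id, int *cell_counts,
                            numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
                            numtyp i_cell_size, int ncellx, int ncelly,
                            int ncellz, int inum, int nall, int ncell,
                            int cells_in_cutoff) {
  const int bx = 256, gx = (nall+bx-1)/bx;
  // 1. one thread per atom: cell_id[i] and particle_id[i] = i
  calc_cell_id<<<gx,bx>>>(x, cell_id, particle_id, boxlo0, boxlo1, boxlo2,
                          i_cell_size, ncellx, ncelly, ncellz, inum, nall,
                          cells_in_cutoff);
  // 2. sort atom indices by cell id; atoms of a cell become contiguous
  thrust::device_ptr<unsigned> keys(cell_id);
  thrust::device_ptr<int> vals(particle_id);
  thrust::sort_by_key(keys, keys+nall, vals);
  // 3. one thread per atom: fill cell_counts[0..ncell] from the id changes
  kernel_calc_cell_counts<<<gx,bx>>>(cell_id, cell_counts, nall, ncell);
}
#endif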
#ifndef LAL_USE_OLD_NEIGHBOR

// Each SIMD subgroup builds the neighbor lists for up to simd_size atoms of
// one non-ghost cell. Cells of a stencil row are adjacent in x, so their
// atoms are contiguous in cell_particle_id; each of the
// (2*cells_in_cutoff+1)^2 (z,y) rows is therefore described by one begin
// offset and one atom count in local memory.
__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
                                   const __global int *restrict cell_particle_id,
                                   const __global int *restrict cell_counts,
                                   __global int *nbor_list,
                                   __global int *host_nbor_list,
                                   __global int *host_numj,
                                   int neigh_bin_size, numtyp cutoff_neigh,
                                   int ncellx, int ncelly, int ncellz,
                                   int inum, int nt, int nall, int t_per_atom,
                                   int cells_in_cutoff, int subgroup_count,
                                   __global int *error_flag)
{
  int tid = THREAD_ID_X;
  int simd_size = SIMD_SIZE;
  int lane_id = tid & (simd_size-1);
  int offset = tid - lane_id;   // first slot of this subgroup in local memory
  int subgroup_id_global = BLOCK_ID_X*(BLOCK_SIZE_X/simd_size) +
                           tid/simd_size;

#if (SHUFFLE_AVAIL == 0)
  __local int local_cell_counts[BLOCK_NBOR_BUILD];
  __local int cell_list_sh[BLOCK_NBOR_BUILD];
  __local numtyp4 pos_sh[BLOCK_NBOR_BUILD];
#endif
  __local int local_begin[BLOCK_NBOR_BUILD];
  __local int local_counts[BLOCK_NBOR_BUILD];

  if (subgroup_id_global < subgroup_count) {
    // cell handled by this subgroup (interior cells only)
    int ncellx_inner = ncellx - 2*cells_in_cutoff;
    int ncelly_inner = ncelly - 2*cells_in_cutoff;
    int ix = subgroup_id_global % ncellx_inner + cells_in_cutoff;
    int izy = subgroup_id_global / ncellx_inner;
    int iy = izy % ncelly_inner + cells_in_cutoff;
    int iz = izy / ncelly_inner + cells_in_cutoff;
    int icell = ix + iy*ncellx + iz*ncellx*ncelly;
    int icell_begin = cell_counts[icell];
    int icell_end = cell_counts[icell+1];
    int i = icell_begin + lane_id;

    // cooperatively load begin offsets and atom counts of the (z,y)
    // stencil rows; each row spans 2*cells_in_cutoff+1 cells in x
    int nstencilx = 2*cells_in_cutoff + 1;
    int bin_stencil_size = nstencilx*nstencilx;
    int offset2 = (tid/simd_size)*bin_stencil_size;
    int cell_count = 0;
    int jcell_begin, end_idx;
    for (int zy = lane_id; zy < bin_stencil_size; zy += simd_size) {
      int jz = iz + zy/nstencilx - cells_in_cutoff;
      int jy = iy + zy%nstencilx - cells_in_cutoff;
      int jcell = ix - cells_in_cutoff + jy*ncellx + jz*ncellx*ncelly;
      int begin = cell_counts[jcell];
      local_begin[offset2+zy] = begin;
      local_counts[offset2+zy] = cell_counts[jcell+nstencilx] - begin;
      cell_count += local_counts[offset2+zy];
    }

    // reduce the per-lane partial counts to the total atom count in the
    // stencil for this subgroup
#if (SHUFFLE_AVAIL == 0)
    local_cell_counts[tid] = cell_count;
    for (int mask = simd_size/2; mask > 0; mask>>=1) {
      simdsync();
      local_cell_counts[tid] += local_cell_counts[ offset + lane_id^mask ];
    }
    simdsync();
    cell_count = local_cell_counts[tid];
#else
    #pragma unroll
    for (unsigned int s=simd_size/2; s>0; s>>=1)
      cell_count += shfl_xor(cell_count, s, simd_size);
#endif

    // the stencil atoms are processed simd_size at a time
    int num_iter = cell_count;
    int remainder = num_iter % simd_size;
    if (remainder == 0) remainder = simd_size;
    if (num_iter) num_iter = (num_iter - 1) / simd_size + 1;

    numtyp4 diff;
    numtyp r2;
    int pid_i = nall, lpid_j, stride;
    numtyp4 atom_i, atom_j;
    int cnt = 0;
    __global int *neigh_counts, *neigh_list;

    if (i < icell_end)
      pid_i = cell_particle_id[i];

    if (pid_i < nt) {
      fetch4(atom_i,pid_i,pos_tex); //pos[i];
    }
    if (pid_i < inum) {
      stride=inum;
      neigh_counts=nbor_list+stride+pid_i;
      neigh_list=neigh_counts+stride+pid_i*(t_per_atom-1);
      stride=stride*t_per_atom-t_per_atom;
      nbor_list[pid_i]=pid_i;
    } else {
      stride=0;
      neigh_counts=host_numj+pid_i-inum;
      neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
    }

    // loop through neighbors
    int bin_shift = 0;
    int zy = -1;
    int num_atom_cell = 0;
    int cell_pos = lane_id;
    end_idx = simd_size;
    for (int ci = 0; ci < num_iter; ci++) {
      cell_pos += simd_size;
      while (cell_pos >= num_atom_cell && zy < bin_stencil_size) {
        // Shift lane index into atom bins based on remainder from last bin
        bin_shift += num_atom_cell % simd_size;
        if (bin_shift >= simd_size) bin_shift -= simd_size;
        cell_pos = lane_id - bin_shift;
        if (cell_pos < 0) cell_pos += simd_size;
        // Move to next bin
        zy++;
        jcell_begin = local_begin[offset2 + zy];
        num_atom_cell = local_counts[offset2 + zy];
      }
      if (zy < bin_stencil_size) {
        lpid_j = cell_particle_id[jcell_begin + cell_pos];
        fetch4(atom_j,lpid_j,pos_tex);
#if (SHUFFLE_AVAIL == 0)
        cell_list_sh[tid] = lpid_j;
        pos_sh[tid].x = atom_j.x;
        pos_sh[tid].y = atom_j.y;
        pos_sh[tid].z = atom_j.z;
      }
      simdsync();
#else
      }
#endif

      if (ci == num_iter-1) end_idx = remainder;
      for (int j = 0; j < end_idx; j++) {
#if (SHUFFLE_AVAIL == 0)
        int pid_j = cell_list_sh[offset+j]; // gather from shared memory
        diff.x = atom_i.x - pos_sh[offset+j].x;
        diff.y = atom_i.y - pos_sh[offset+j].y;
        diff.z = atom_i.z - pos_sh[offset+j].z;
#else
        int pid_j = simd_broadcast_i(lpid_j, j, simd_size);
#ifdef _DOUBLE_DOUBLE
        diff.x = atom_i.x - simd_broadcast_d(atom_j.x, j, simd_size);
        diff.y = atom_i.y - simd_broadcast_d(atom_j.y, j, simd_size);
        diff.z = atom_i.z - simd_broadcast_d(atom_j.z, j, simd_size);
#else
        diff.x = atom_i.x - simd_broadcast_f(atom_j.x, j, simd_size);
        diff.y = atom_i.y - simd_broadcast_f(atom_j.y, j, simd_size);
        diff.z = atom_i.z - simd_broadcast_f(atom_j.z, j, simd_size);
#endif
#endif
        r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
        //USE CUTOFFSQ?
        if (r2 < cutoff_neigh*cutoff_neigh && pid_j != pid_i && pid_i < nt) {
          if (cnt < neigh_bin_size) {
            cnt++;
            *neigh_list = pid_j;
            neigh_list++;
            if ((cnt & (t_per_atom-1))==0)
              neigh_list=neigh_list+stride;
          } else
            *error_flag=1;
        }
      } // for j
#if (SHUFFLE_AVAIL == 0)
      simdsync();
#endif
    } // for (ci)
    if (pid_i < nt)
      *neigh_counts = cnt;
  } // if (subgroup_id_global < subgroup_count)
}
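
// Both build paths emit the same packed device list: nbor_list holds inum
// atom ids, then inum neighbor counts, then the neighbor slots in chunks of
// t_per_atom entries per atom, with successive chunks of the same atom
// separated by inum*t_per_atom. A sketch of the slot arithmetic follows;
// nbor_slot is a hypothetical helper for illustration, not part of the
// library.
#if 0
// slot of the k-th neighbor of atom i in nbor_list
static int nbor_slot(int i, int k, int inum, int t_per_atom) {
  int chunk = k / t_per_atom;          // which chunk of t_per_atom entries
  int lane  = k % t_per_atom;          // position inside the chunk
  return 2*inum + i*t_per_atom + chunk*inum*t_per_atom + lane;
}
// e.g. inum=1000, t_per_atom=4: the neighbors of atom 7 occupy slots
// 2028..2031, then 6028..6031, 10028..10031, ..., while its count is at
// slot 1007 and nbor_list[7]==7.
#endif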
#else

__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
                                   const __global int *restrict cell_particle_id,
                                   const __global int *restrict cell_counts,
                                   __global int *nbor_list,
                                   __global int *host_nbor_list,
                                   __global int *host_numj,
                                   int neigh_bin_size, numtyp cell_size,
                                   int ncellx, int ncelly, int ncellz,
                                   int inum, int nt, int nall, int t_per_atom,
                                   int cells_in_cutoff)
{
  int tid = THREAD_ID_X;
  int ix = BLOCK_ID_X + cells_in_cutoff;
  int iy = BLOCK_ID_Y % (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
  int iz = BLOCK_ID_Y / (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
  int bsx = BLOCK_SIZE_X;

  int icell = ix + iy*ncellx + iz*ncellx*ncelly;

  __local int cell_list_sh[BLOCK_NBOR_BUILD];
  __local numtyp4 pos_sh[BLOCK_NBOR_BUILD];

  int icell_begin = cell_counts[icell];
  int icell_end = cell_counts[icell+1];

  int nborz0 = iz-cells_in_cutoff, nborz1 = iz+cells_in_cutoff,
      nbory0 = iy-cells_in_cutoff, nbory1 = iy+cells_in_cutoff,
      nborx0 = ix-cells_in_cutoff, nborx1 = ix+cells_in_cutoff;

  numtyp4 diff;
  numtyp r2;
  int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/bsx);
  for (int ii = 0; ii < cap; ii++) {
    int i = icell_begin + tid + ii*bsx;
    int pid_i = nall, pid_j, stride;
    numtyp4 atom_i, atom_j;
    int cnt = 0;
    __global int *neigh_counts, *neigh_list;

    if (i < icell_end)
      pid_i = cell_particle_id[i];

    if (pid_i < nt) {
      fetch4(atom_i,pid_i,pos_tex); //pos[i];
    }
    if (pid_i < inum) {
      stride=inum;
      neigh_counts=nbor_list+stride+pid_i;
      neigh_list=neigh_counts+stride+pid_i*(t_per_atom-1);
      stride=stride*t_per_atom-t_per_atom;
      nbor_list[pid_i]=pid_i;
    } else {
      stride=0;
      neigh_counts=host_numj+pid_i-inum;
      neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
    }

    // loop through neighbors
    for (int nborz = nborz0; nborz <= nborz1; nborz++) {
      for (int nbory = nbory0; nbory <= nbory1; nbory++) {
        for (int nborx = nborx0; nborx <= nborx1; nborx++) {
          int jcell = nborx + nbory*ncellx + nborz*ncellx*ncelly;

          int jcell_begin = cell_counts[jcell];
          int jcell_end = cell_counts[jcell+1];
          int num_atom_cell = jcell_end - jcell_begin;

          // load jcell to shared memory
          int num_iter = ucl_ceil((numtyp)num_atom_cell/bsx);
          for (int k = 0; k < num_iter; k++) {
            int end_idx = min(bsx, num_atom_cell-k*bsx);

            if (tid < end_idx) {
              pid_j = cell_particle_id[tid+k*bsx+jcell_begin];
              cell_list_sh[tid] = pid_j;
              fetch4(atom_j,pid_j,pos_tex); //[pid_j];
              pos_sh[tid].x = atom_j.x;
              pos_sh[tid].y = atom_j.y;
              pos_sh[tid].z = atom_j.z;
            }
            __syncthreads();

            if (pid_i < nt) {
              for (int j = 0; j < end_idx; j++) {
                int pid_j = cell_list_sh[j]; // gather from shared memory
                diff.x = atom_i.x - pos_sh[j].x;
                diff.y = atom_i.y - pos_sh[j].y;
                diff.z = atom_i.z - pos_sh[j].z;

                r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
                if (r2 < cell_size*cell_size && pid_j != pid_i) {
                  cnt++;
                  if (cnt <= neigh_bin_size) {
                    *neigh_list = pid_j;
                    neigh_list++;
                    if ((cnt & (t_per_atom-1))==0)
                      neigh_list=neigh_list+stride;
                  }
                }
              }
            }
            __syncthreads();
          } // for (k)
        }
      }
    }
    if (pid_i < nt)
      *neigh_counts = cnt;
  } // for (i)
}

#endif

__kernel void kernel_special(__global int *dev_nbor,
                             __global int *host_nbor_list,
                             const __global int *host_numj,
                             const __global tagint *restrict tag,
                             const __global int *restrict nspecial,
                             const __global tagint *restrict special,
                             int inum, int nt, int max_nbors, int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid & (t_per_atom-1);

  if (ii<nt) {
    int stride;
    __global int *list, *list_end;
    int numj;

    int n1=nspecial[ii*3];
    int n2=nspecial[ii*3+1];
    int n3=nspecial[ii*3+2];

    if (ii < inum) {
      // device-built list: chunks of t_per_atom entries per atom, strided
      // by inum*t_per_atom (see the build kernels above)
      stride=fast_mul(inum,t_per_atom);
      list=dev_nbor+inum+ii;
      numj=*list;
      list+=inum+fast_mul(ii,t_per_atom-1);
      list_end=list+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1));
      list+=offset;
    } else {
      // host-built list: dense rows of max_nbors entries
      stride=t_per_atom;
      list=host_nbor_list+fast_mul(ii-inum,max_nbors);
      numj=host_numj[ii-inum];
      list_end=list+numj;
      list+=offset;
    }

    for ( ; list<list_end; list+=stride) {
      int nbor=*list;
      tagint jtag=tag[nbor];

      // scan the special-neighbor tags of atom ii (stored with stride nt)
      int offset=ii;
      for (int i=0; i<n3; i++) {
        if (special[offset]==jtag) {
          int which = 1;
          if (i>=n1) which++;
          if (i>=n2) which++;
          nbor=nbor ^ (which << SBBITS);
          *list=nbor;
        }
        offset+=nt;
      }
    }
  } // if ii
}
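
// kernel_special marks neighbors that are also 1-2, 1-3, or 1-4 special
// partners by folding a 2-bit code (which = 1, 2, or 3) into the high bits
// of the stored neighbor index. A sketch of the round trip follows, assuming
// SBBITS and NEIGHMASK as defined in lal_preprocessor.h and the sbmask()
// pattern used by the pair kernels; illustrative only, not part of the build.
#if 0
static int sbmask(int j) { return j >> SBBITS & 3; }

static void special_bits_example() {
  int j = 42;                          // plain neighbor index
  int which = 2;                       // e.g. a 1-3 special partner
  int tagged = j ^ (which << SBBITS);  // what kernel_special stores
  // sbmask(tagged) == 2 recovers the code, and tagged & NEIGHMASK == 42
  // recovers the neighbor index before the pair force evaluation.
}
#endif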