// **************************************************************************
//                               neighbor_gpu.cu
//                             -------------------
//                             Peng Wang (Nvidia)
//                           W. Michael Brown (ORNL)
//
//  Device code for handling GPU generated neighbor lists
//
// __________________________________________________________________________
//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
//    begin                :
//    email                : penwang@nvidia.com, brownw@ornl.gov
// ***************************************************************************/

// NOTE(review): this copy of the file arrived collapsed onto two physical
// lines, and every span between a '<' and a later '>' appears to have been
// stripped out (HTML-tag style).  The damaged regions are flagged with
// NOTE(review) comments below and their surviving tokens are kept verbatim;
// restore them from the upstream file before attempting to compile.

#ifdef NV_KERNEL

#include "lal_preprocessor.h"

// Texture used for cached position fetches in fetch_pos() below.
// NOTE(review): the texture reference's template argument (presumably
// <float4, ...>) was lost to the '<...>' stripping — confirm upstream.
texture neigh_tex;

#ifndef _DOUBLE_DOUBLE
// Fetch the packed position of atom i through the texture cache.
// Only compiled in single precision; the 'pos' argument is unread here —
// presumably it keeps the signature identical to a non-texture fallback
// used in the _DOUBLE_DOUBLE build (TODO confirm against that path).
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
  { return tex1Dfetch(neigh_tex, i); }
#endif

// Compute the spatial-bin (cell) id of every atom; one thread per atom.
//
//   pos            packed atom positions (x, y, z in .x/.y/.z)
//   cell_id        output: bin id per atom
//   particle_id    output: identity map particle_id[i] = i — presumably the
//                  value payload for a later sort-by-cell; confirm caller
//   boxlo*/boxhi*  box bounds; cell_size: bin edge length
//   ncellx/ncelly  bin counts along x and y (used as index strides)
//   nall           total atom count processed (bounds guard below)
__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
                           numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
                           numtyp boxhi0, numtyp boxhi1, numtyp boxhi2,
                           numtyp cell_size, int ncellx, int ncelly,
                           int nall) {
  int i = threadIdx.x + blockIdx.x*blockDim.x;

  if (i < nall) {
    numtyp4 p = fetch_pos(i,pos); //pos[i];

    // Shift into box-local coordinates.
    p.x -= boxlo0;
    p.y -= boxlo1;
    p.z -= boxlo2;

    // Clamp each coordinate into [-cell_size, box_extent + cell_size] so
    // atoms outside the box land in a one-cell-wide boundary layer instead
    // of producing an out-of-range bin index.
    p.x = fmaxf(p.x, -cell_size);
    p.x = fminf(p.x, boxhi0-boxlo0+cell_size);
    p.y = fmaxf(p.y, -cell_size);
    p.y = fminf(p.y, boxhi1-boxlo1+cell_size);
    p.z = fmaxf(p.z, -cell_size);
    p.z = fminf(p.z, boxhi2-boxlo2+cell_size);

    // The "+ 1.0" offsets every index by one to make room for that boundary
    // layer of cells (clamped minimum -cell_size maps to index 0).
    unsigned int id = (unsigned int)(p.x/cell_size + 1.0)
      + (unsigned int)(p.y/cell_size + 1.0) * ncellx
      + (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly;

    cell_id[i] = id;
    particle_id[i] = i;
  }
}

// Build per-cell start offsets from the cell_id array so that cell c's atoms
// occupy indices [cell_counts[c], cell_counts[c+1]), with the sentinel
// cell_counts[ncell] = nall.
// NOTE(review): this only works if cell_id is sorted ascending when the
// kernel runs — presumably a sort pass happens between calc_cell_id and
// this kernel; confirm against the host-side driver.
__kernel void kernel_calc_cell_counts(unsigned *cell_id,
                                      int *cell_counts, int nall, int ncell) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nall) {
    int id = cell_id[idx];

    // handle boundary cases
    if (idx == 0) {
      // First atom: every cell up to and including its bin starts at 0.
      for (int i = 0; i < id + 1; i++)
        cell_counts[i] = 0;
    }
    if (idx == nall - 1) {
      // Last atom: all cells past its bin (and the sentinel) end at nall.
      for (int i = id+1; i <= ncell; i++)
        cell_counts[i] = nall;
    }

    if (idx > 0 && idx < nall) {
      // Interior atoms: a change of bin id versus the previous atom marks
      // the start offset of every cell in between (covers empty cells too).
      int id_l = cell_id[idx-1];
      if (id != id_l) {
        for (int i = id_l+1; i <= id; i++)
          cell_counts[i] = idx;
      }
    }
  }
}

#endif

// Transpose an integer matrix tile-by-tile through shared memory; the +1
// column padding on the tile is the classic bank-conflict-avoidance trick.
__kernel void transpose(__global int *out, __global int *in, int columns_in,
                        int rows_in) {
  __local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];

  unsigned ti=THREAD_ID_X;
  unsigned tj=THREAD_ID_Y;
  unsigned bi=BLOCK_ID_X;
  unsigned bj=BLOCK_ID_Y;

  unsigned i=bi*BLOCK_CELL_2D+ti;
  unsigned j=bj*BLOCK_CELL_2D+tj;

  // NOTE(review): CORRUPTED REGION.  The rest of this transpose kernel, plus
  // the entire signature and most of the body of the cell-based neighbor-list
  // kernel that followed it, were lost to the '<...>' stripping.  The text
  // resumes mid-way through that second kernel, apparently inside its cutoff
  // test (the surviving "1e-5" is presumably the lower bound of an
  // "rsq < cutoff^2 && rsq > 1e-5" self-interaction exclusion — confirm
  // upstream).  All surviving tokens below are kept verbatim.
  if ((i 1e-5) {
    cnt++;
    // Only store while the neighbor still fits in the bin; cnt keeps
    // counting regardless, which lets the caller detect overflow.
    if (cnt <= neigh_bin_size) {
      *neigh_list = pid_j;
      neigh_list++;
      // List entries are interleaved t_per_atom at a time; hop one stride
      // whenever the current chunk fills up.
      if ((cnt & (t_per_atom-1))==0)
        neigh_list=neigh_list+stride;
      }
    }
  }
  }
  __syncthreads();
  } // for (k)
  }
  }
  }
  if (pid_i < nt)
    *neigh_counts = cnt;
  } // for (i)
}

// Mark special (bonded) neighbors in the packed neighbor list by XOR-ing a
// small code into the high bits (shifted by SBBITS) of matching entries.
__kernel void kernel_special(__global int *dev_nbor,
                             __global int *host_nbor_list,
                             __global int *host_numj, __global int *tag,
                             __global int *nspecial, __global int *special,
                             int inum, int nt, int max_nbors,
                             int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid & (t_per_atom-1);

  // NOTE(review): CORRUPTED REGION.  The "if (ii < nt) { ... }" guard, the
  // loads of the special-neighbor prefix counts from nspecial[], the loop
  // over this atom's neighbor entries, and the declarations of i, which,
  // nbor, list, n1 and n2 were all lost to the '<...>' stripping; the first
  // surviving line fuses the guard's opening with the tail of what was
  // presumably "if (i >= n1) which++;".  Kept verbatim.
  if (ii=n1) which++;
  // 'which' counts how many prefix thresholds (n1, n2, ...) of the special[]
  // list this neighbor index has passed — presumably encoding the 1-2/1-3/
  // 1-4 bonded-exclusion class; confirm upstream.
  if (i>=n2) which++;
  nbor=nbor ^ (which << SBBITS);
  *list=nbor;
  }
  offset+=nt;
  }
  }
  } // if ii
}