Feb2021 GPU Package Update - GPU Package Files
This commit is contained in:
@ -1,6 +1,7 @@
|
||||
// **************************************************************************
|
||||
// neighbor_gpu.cu
|
||||
// -------------------
|
||||
// Nitin Dhamankar (Intel)
|
||||
// Peng Wang (Nvidia)
|
||||
// W. Michael Brown (ORNL)
|
||||
//
|
||||
@ -32,7 +33,14 @@ _texture( pos_tex,float4);
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void calc_cell_id(const numtyp4 *restrict pos,
|
||||
#ifdef NV_KERNEL
|
||||
#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 2)
|
||||
// Issue with incorrect results in CUDA 11.2
|
||||
#define LAL_USE_OLD_NEIGHBOR
|
||||
#endif
|
||||
#endif
|
||||
|
||||
__kernel void calc_cell_id(const numtyp4 *restrict x_,
|
||||
unsigned *restrict cell_id,
|
||||
int *restrict particle_id,
|
||||
numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
|
||||
@ -43,7 +51,7 @@ __kernel void calc_cell_id(const numtyp4 *restrict pos,
|
||||
|
||||
if (i < nall) {
|
||||
numtyp4 p;
|
||||
fetch4(p,i,pos_tex); //pos[i];
|
||||
fetch4(p,i,pos_tex); //x_[i];
|
||||
|
||||
p.x -= boxlo0;
|
||||
p.y -= boxlo1;
|
||||
@ -138,16 +146,219 @@ __kernel void transpose(__global tagint *restrict out,
|
||||
out[j*rows_in+i] = block[ti][tj];
|
||||
}
|
||||
|
||||
#ifndef LAL_USE_OLD_NEIGHBOR
|
||||
|
||||
#define MAX_STENCIL_SIZE 25
|
||||
#if !defined(MAX_SUBGROUPS_PER_BLOCK)
|
||||
#define MAX_SUBGROUPS_PER_BLOCK 8
|
||||
#endif
|
||||
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
__device__ __constant__ int bin_stencil[MAX_STENCIL_SIZE];
|
||||
#endif
|
||||
|
||||
__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
|
||||
const __global int *restrict cell_particle_id,
|
||||
const __global int *restrict cell_counts,
|
||||
__global int *nbor_list,
|
||||
__global int *host_nbor_list,
|
||||
__global int *host_numj,
|
||||
int neigh_bin_size, numtyp cell_size,
|
||||
int ncellx, int ncelly, int ncellz,
|
||||
int inum, int nt, int nall, int t_per_atom,
|
||||
int cells_in_cutoff)
|
||||
const __global int *restrict cell_particle_id,
|
||||
const __global int *restrict cell_counts,
|
||||
__global int *nbor_list,
|
||||
__global int *host_nbor_list,
|
||||
__global int *host_numj,
|
||||
int neigh_bin_size, numtyp cutoff_neigh,
|
||||
int ncellx, int ncelly, int ncellz,
|
||||
int inum, int nt, int nall, int t_per_atom,
|
||||
int cells_in_cutoff,
|
||||
const __global int *restrict cell_subgroup_counts,
|
||||
const __global int *restrict subgroup2cell,
|
||||
int subgroup_count,
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
int *not_used, __global int *error_flag)
|
||||
#else
|
||||
__constant int *bin_stencil,
|
||||
__global int *error_flag)
|
||||
#endif
|
||||
{
|
||||
int tid = THREAD_ID_X;
|
||||
int bsx = BLOCK_SIZE_X;
|
||||
int simd_size = simd_size();
|
||||
int subgroup_id_local = tid / simd_size;
|
||||
int subgroup_id_global = BLOCK_ID_X * bsx / simd_size + subgroup_id_local;
|
||||
int lane_id = tid % simd_size;
|
||||
|
||||
#if (SHUFFLE_AVAIL == 0)
|
||||
__local int cell_list_sh[BLOCK_NBOR_BUILD];
|
||||
__local numtyp4 pos_sh[BLOCK_NBOR_BUILD];
|
||||
__local int local_cell_counts[BLOCK_NBOR_BUILD];
|
||||
#endif
|
||||
__local int local_begin[(MAX_STENCIL_SIZE+1)*MAX_SUBGROUPS_PER_BLOCK];
|
||||
__local int local_counts[(MAX_STENCIL_SIZE+1)*MAX_SUBGROUPS_PER_BLOCK];
|
||||
|
||||
if (subgroup_id_global < subgroup_count) {
|
||||
// identify own cell for subgroup (icell) and local atom (i) for the lane
|
||||
int icell = subgroup2cell[subgroup_id_global];
|
||||
int icell_end = cell_counts[icell+1];
|
||||
int i = cell_counts[icell] + (subgroup_id_global -
|
||||
cell_subgroup_counts[icell]) *
|
||||
simd_size + lane_id;
|
||||
|
||||
// Get count of the number of iterations to finish all cells
|
||||
const int bin_stencil_stride = cells_in_cutoff * 2 + 1;
|
||||
const int bin_stencil_size = bin_stencil_stride * bin_stencil_stride;
|
||||
int offset = 0;
|
||||
int cell_count = 0, jcellyz, jcell_begin;
|
||||
const int offset2 = subgroup_id_local * (MAX_STENCIL_SIZE+1);
|
||||
const int niter = (bin_stencil_size - 1)/simd_size + 1;
|
||||
int end_idx = simd_size;
|
||||
for (int ni = 0; ni < niter; ni++) {
|
||||
if (ni == niter - 1)
|
||||
end_idx = bin_stencil_size - offset;
|
||||
if (lane_id < end_idx) {
|
||||
jcellyz = icell + bin_stencil[lane_id + offset];
|
||||
jcell_begin = cell_counts[jcellyz - cells_in_cutoff];
|
||||
local_begin[lane_id + offset2 + offset] = jcell_begin;
|
||||
const int local_count = cell_counts[jcellyz + cells_in_cutoff + 1] -
|
||||
jcell_begin;
|
||||
cell_count += local_count;
|
||||
local_counts[lane_id + offset2 + offset] = local_count;
|
||||
}
|
||||
offset += simd_size;
|
||||
}
|
||||
|
||||
#if (SHUFFLE_AVAIL == 0)
|
||||
local_cell_counts[tid] = cell_count;
|
||||
offset = subgroup_id_local * simd_size;
|
||||
for (unsigned int mask=simd_size/2; mask>0; mask>>=1) {
|
||||
simdsync();
|
||||
local_cell_counts[tid] += local_cell_counts[ offset + lane_id^mask ];
|
||||
}
|
||||
simdsync();
|
||||
cell_count = local_cell_counts[tid];
|
||||
#else
|
||||
#pragma unroll
|
||||
for (unsigned int s=simd_size/2; s>0; s>>=1)
|
||||
cell_count += shfl_xor(cell_count, s, simd_size);
|
||||
#endif
|
||||
|
||||
int num_iter = cell_count;
|
||||
int remainder = num_iter % simd_size;
|
||||
if (remainder == 0) remainder = simd_size;
|
||||
if (num_iter) num_iter = (num_iter - 1) / simd_size + 1;
|
||||
|
||||
numtyp4 diff;
|
||||
numtyp r2;
|
||||
|
||||
int pid_i = nall, lpid_j, stride;
|
||||
numtyp4 atom_i, atom_j;
|
||||
int cnt = 0;
|
||||
__global int *neigh_counts, *neigh_list;
|
||||
|
||||
if (i < icell_end)
|
||||
pid_i = cell_particle_id[i];
|
||||
|
||||
if (pid_i < nt) {
|
||||
fetch4(atom_i,pid_i,pos_tex); //pos[i];
|
||||
}
|
||||
|
||||
if (pid_i < inum) {
|
||||
stride=inum;
|
||||
neigh_counts=nbor_list+stride+pid_i;
|
||||
neigh_list=neigh_counts+stride+pid_i*(t_per_atom-1);
|
||||
stride=stride*t_per_atom-t_per_atom;
|
||||
nbor_list[pid_i]=pid_i;
|
||||
} else {
|
||||
stride=0;
|
||||
neigh_counts=host_numj+pid_i-inum;
|
||||
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
|
||||
}
|
||||
|
||||
// loop through neighbors
|
||||
int bin_shift = 0;
|
||||
int zy = -1;
|
||||
int num_atom_cell = 0;
|
||||
int cell_pos = lane_id;
|
||||
end_idx = simd_size;
|
||||
for (int ci = 0; ci < num_iter; ci++) {
|
||||
cell_pos += simd_size;
|
||||
while (cell_pos >= num_atom_cell && zy < bin_stencil_size) {
|
||||
// Shift lane index into atom bins based on remainder from last bin
|
||||
bin_shift += num_atom_cell % simd_size;
|
||||
if (bin_shift >= simd_size) bin_shift -= simd_size;
|
||||
cell_pos = lane_id - bin_shift;
|
||||
if (cell_pos < 0) cell_pos += simd_size;
|
||||
// Move to next bin
|
||||
zy++;
|
||||
jcell_begin = local_begin[offset2 + zy];
|
||||
num_atom_cell = local_counts[offset2 + zy];
|
||||
}
|
||||
|
||||
if (zy < bin_stencil_size) {
|
||||
lpid_j = cell_particle_id[jcell_begin + cell_pos];
|
||||
fetch4(atom_j,lpid_j,pos_tex);
|
||||
#if (SHUFFLE_AVAIL == 0)
|
||||
cell_list_sh[tid] = lpid_j;
|
||||
pos_sh[tid].x = atom_j.x;
|
||||
pos_sh[tid].y = atom_j.y;
|
||||
pos_sh[tid].z = atom_j.z;
|
||||
}
|
||||
simdsync();
|
||||
#else
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ci == num_iter-1) end_idx = remainder;
|
||||
|
||||
for (int j = 0; j < end_idx; j++) {
|
||||
#if (SHUFFLE_AVAIL == 0)
|
||||
int pid_j = cell_list_sh[offset+j]; // gather from shared memory
|
||||
diff.x = atom_i.x - pos_sh[offset+j].x;
|
||||
diff.y = atom_i.y - pos_sh[offset+j].y;
|
||||
diff.z = atom_i.z - pos_sh[offset+j].z;
|
||||
#else
|
||||
int pid_j = simd_broadcast_i(lpid_j, j, simd_size);
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
diff.x = atom_i.x - simd_broadcast_d(atom_j.x, j, simd_size);
|
||||
diff.y = atom_i.y - simd_broadcast_d(atom_j.y, j, simd_size);
|
||||
diff.z = atom_i.z - simd_broadcast_d(atom_j.z, j, simd_size);
|
||||
#else
|
||||
diff.x = atom_i.x - simd_broadcast_f(atom_j.x, j, simd_size);
|
||||
diff.y = atom_i.y - simd_broadcast_f(atom_j.y, j, simd_size);
|
||||
diff.z = atom_i.z - simd_broadcast_f(atom_j.z, j, simd_size);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
|
||||
//USE CUTOFFSQ?
|
||||
if (r2 < cutoff_neigh*cutoff_neigh && pid_j != pid_i && pid_i < nt) {
|
||||
if (cnt < neigh_bin_size) {
|
||||
cnt++;
|
||||
*neigh_list = pid_j;
|
||||
neigh_list++;
|
||||
if ((cnt & (t_per_atom-1))==0)
|
||||
neigh_list=neigh_list+stride;
|
||||
} else
|
||||
*error_flag=1;
|
||||
}
|
||||
} // for j
|
||||
#if (SHUFFLE_AVAIL == 0)
|
||||
simdsync();
|
||||
#endif
|
||||
} // for (ci)
|
||||
if (pid_i < nt)
|
||||
*neigh_counts = cnt;
|
||||
} // if (subgroup_id_global < subgroup_count)
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
|
||||
const __global int *restrict cell_particle_id,
|
||||
const __global int *restrict cell_counts,
|
||||
__global int *nbor_list,
|
||||
__global int *host_nbor_list,
|
||||
__global int *host_numj,
|
||||
int neigh_bin_size, numtyp cell_size,
|
||||
int ncellx, int ncelly, int ncellz,
|
||||
int inum, int nt, int nall, int t_per_atom,
|
||||
int cells_in_cutoff)
|
||||
{
|
||||
int tid = THREAD_ID_X;
|
||||
int ix = BLOCK_ID_X + cells_in_cutoff;
|
||||
@ -232,7 +443,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
|
||||
diff.z = atom_i.z - pos_sh[j].z;
|
||||
|
||||
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
|
||||
if (r2 < cell_size*cell_size && pid_j != pid_i) { // && r2 > 1e-5
|
||||
if (r2 < cell_size*cell_size && pid_j != pid_i) {
|
||||
cnt++;
|
||||
if (cnt <= neigh_bin_size) {
|
||||
*neigh_list = pid_j;
|
||||
@ -253,6 +464,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
|
||||
} // for (i)
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
__kernel void kernel_special(__global int *dev_nbor,
|
||||
__global int *host_nbor_list,
|
||||
const __global int *host_numj,
|
||||
@ -310,4 +523,3 @@ __kernel void kernel_special(__global int *dev_nbor,
|
||||
}
|
||||
} // if ii
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user