Loop over chunks on GPU to write values properly when using default chunk size

This commit is contained in:
rohskopf
2023-06-02 15:10:45 -06:00
parent 95e39ba89a
commit be5476e442
3 changed files with 120 additions and 93 deletions

View File

@ -220,7 +220,10 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
// "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
// `total_range` is the number of grid points which may be larger than chunk size.
//printf(">>> total_range: %d\n", total_range);
chunk_size = MIN(chunksize, total_range);
chunk_offset = 0;
//snaKK.grow_rij(chunk_size, ntotal);
snaKK.grow_rij(chunk_size, max_neighs);
@ -229,100 +232,112 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
// Pre-compute ceil(chunk_size / vector_length) for code cleanliness
const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
//ComputeNeigh
{
int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
}
if (chunk_size > total_range - chunk_offset)
chunk_size = total_range - chunk_offset;
//ComputeCayleyKlein
{
// tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
}
//printf(">>> chunk_offset: %d\n", chunk_offset);
//PreUi
{
// tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
Kokkos::parallel_for("PreUi",policy_preui,*this);
}
// ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
{
// team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
// scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
const int tile_size = vector_length * (twojmax + 1);
const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
if (chunk_size < parallel_thresh)
//ComputeNeigh
{
// Version with parallelism over j_bend
int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
// total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
} else {
// Version w/out parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal)
const int n_teams = chunk_size_div * max_neighs;
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
}
}
//TransformUi: un-"fold" ulisttot, zero ylist
{
// team_size_transform_ui is defined in `pair_snap_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
}
//ComputeCayleyKlein
{
// tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
}
//Compute bispectrum in AoSoA data layout, transform Bi
//PreUi
{
// tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
Kokkos::parallel_for("PreUi",policy_preui,*this);
}
//ComputeZi
const int idxz_max = snaKK.idxz_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
// ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
{
// team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
// scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
const int tile_size = vector_length * (twojmax + 1);
const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
//ComputeBi
const int idxb_max = snaKK.idxb_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
if (chunk_size < parallel_thresh)
{
// Version with parallelism over j_bend
//Transform data layout of blist out of AoSoA
//We need this because `blist` gets used in ComputeForce which doesn't
//take advantage of AoSoA, which at best would only be beneficial on the margins
//NOTE: Do we need this in compute sna/grid/kk?
Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridTransformBi>
policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
// total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
// Fill the grid array with bispectrum values
{
typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
Kokkos::parallel_for(policy_fill, *this);
}
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
} else {
// Version w/out parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal)
const int n_teams = chunk_size_div * max_neighs;
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
}
}
//TransformUi: un-"fold" ulisttot, zero ylist
{
// team_size_transform_ui is defined in `pair_snap_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
}
//Compute bispectrum in AoSoA data layout, transform Bi
//ComputeZi
const int idxz_max = snaKK.idxz_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
//ComputeBi
const int idxb_max = snaKK.idxb_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
//Transform data layout of blist out of AoSoA
//We need this because `blist` gets used in ComputeForce which doesn't
//take advantage of AoSoA, which at best would only be beneficial on the margins
//NOTE: Do we need this in compute sna/grid/kk?
Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridTransformBi>
policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
// Fill the grid array with bispectrum values
{
typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
Kokkos::parallel_for(policy_fill, *this);
}
// Proceed to the next chunk.
chunk_offset += chunk_size;
} // end while
k_gridlocal.template modify<DeviceType>();
k_gridlocal.template sync<LMPHostType>();
@ -363,8 +378,12 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
// extract loop index
int ii = team.team_rank() + team.league_rank() * team.team_size();
if (ii >= chunk_size) return;
// extract grid index
int igrid = ii + chunk_offset;
// get a pointer to scratch memory
// This is used to cache whether or not an atom is within the cutoff.
// If it is, type_cache is assigned to the atom type.
@ -376,8 +395,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
// convert to grid indices
int iz = ii/(xlen*ylen);
int i2 = ii - (iz*xlen*ylen);
int iz = igrid/(xlen*ylen);
int i2 = igrid - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
@ -387,7 +406,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
double xgrid[3];
// index ii already captures the proper grid point
// int igrid = iz * (nx * ny) + iy * nx + ix;
//int igrid = iz * (nx * ny) + iy * nx + ix;
//printf("%d %d\n", ii, igrid);
// grid2x converts igrid to ix,iy,iz like we've done before
// multiply grid integers by grid spacing delx, dely, delz
@ -729,10 +749,14 @@ KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract grid index
int igrid = ii + chunk_offset;
// convert to grid indices
int iz = ii/(xlen*ylen);
int i2 = ii - (iz*xlen*ylen);
int iz = igrid/(xlen*ylen);
int i2 = igrid - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
@ -753,9 +777,9 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
const F_FLOAT xtmp = xgrid[0];
const F_FLOAT ytmp = xgrid[1];
const F_FLOAT ztmp = xgrid[2];
d_gridall(ii,0) = xtmp;
d_gridall(ii,1) = ytmp;
d_gridall(ii,2) = ztmp;
d_gridall(igrid,0) = xtmp;
d_gridall(igrid,1) = ytmp;
d_gridall(igrid,2) = ztmp;
const auto idxb_max = snaKK.idxb_max;
@ -764,7 +788,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
d_gridall(ii,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
}
}

View File

@ -163,7 +163,7 @@ template <typename TYPE, typename HTYPE>
{
data = TYPE(std::string(name),n1,n2);
h_data = Kokkos::create_mirror_view(data);
printf(">>> name: %s\n", name);
//printf(">>> name: %s\n", name);
return data;
}
@ -174,7 +174,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
data = TYPE(std::string(name),n1,n2);
bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
array = (typename TYPE::value_type **) smalloc(nbytes,name);
printf(">>> name %s nbytes %d\n", name, nbytes);
//printf(">>> name %s nbytes %d\n", name, nbytes);
for (int i = 0; i < n1; i++) {
if (n2 == 0)

View File

@ -88,6 +88,7 @@ void ComputeGrid::grid2x(int igrid, double *x)
x[2] = iz * delz;
if (triclinic) domain->lamda2x(x, x);
//printf(">>>>> ComputeGrid::grid2x\n");
}
/* ----------------------------------------------------------------------
@ -103,6 +104,7 @@ void ComputeGrid::assign_coords_all()
gridall[igrid][1] = x[1];
gridall[igrid][2] = x[2];
}
//printf(">>>>> ComputeGrid::assign_coords_all\n");
}
/* ----------------------------------------------------------------------
@ -111,6 +113,7 @@ void ComputeGrid::assign_coords_all()
void ComputeGrid::allocate()
{
//printf(">>> ComputeGrid::allocate\n");
// allocate arrays
memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");