Shallow copy Kokkos written array to returned array variable

This commit is contained in:
rohskopf
2023-06-26 16:43:28 -06:00
parent 5885f49b75
commit 9eb26e4cd0
5 changed files with 163 additions and 7 deletions

View File

@ -66,16 +66,25 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
}
//printf(">>> 1\n");
// Set up element lists
MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
int n = atom->ntypes;
MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
MemKK::realloc_kokkos(d_sigmaelem,"ComputeSNAGridKokkos::sigmaelem",n+1);
MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1);
MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1);
MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
//printf(">>> 2\n");
auto h_radelem = Kokkos::create_mirror_view(d_radelem);
auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem);
auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem);
auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem);
auto h_map = Kokkos::create_mirror_view(d_map);
//printf(">>> 3\n");
// start from index 1 because of how compute sna/grid is
for (int i = 1; i <= atom->ntypes; i++) {
h_radelem(i-1) = radelem[i];
h_sigmaelem(i-1) = sigmaelem[i];
h_prefacelem(i-1) = prefacelem[i];
h_argfacelem(i-1) = argfacelem[i];
}
//printf(">>> 4\n");
// In pair snap some things like `map` get allocated regardless of chem flag.
@ -87,6 +96,9 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
*/
//printf(">>> 5\n");
Kokkos::deep_copy(d_radelem,h_radelem);
Kokkos::deep_copy(d_sigmaelem,h_sigmaelem);
Kokkos::deep_copy(d_prefacelem, h_prefacelem);
Kokkos::deep_copy(d_argfacelem, h_argfacelem);
Kokkos::deep_copy(d_map,h_map);
//printf(">>> 6\n");
@ -127,6 +139,8 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
//gridlocal_allocated = 1;
//array = gridall;
array_local = alocal;
d_alocal = k_alocal.template view<DeviceType>();
//d_grid = k_grid.template view<DeviceType>();
//d_gridall = k_gridall.template view<DeviceType>();
@ -188,6 +202,23 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
if (!host_flag)
team_size_default = 32;//max_neighs;
if (triclinic){
/*
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
*/
h0 = domain->h[0];
h1 = domain->h[1];
h2 = domain->h[2];
h3 = domain->h[3];
h4 = domain->h[4];
h5 = domain->h[5];
lo0 = domain->boxlo[0];
lo1 = domain->boxlo[1];
lo2 = domain->boxlo[2];
}
while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
if (chunk_size > total_range - chunk_offset)
@ -198,9 +229,9 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
int vector_length = vector_length_default;
int team_size = team_size_default;
check_team_size_for<TagComputeGaussianGridLocalNeigh>(chunk_size,team_size,vector_length);
printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length);
//printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length);
typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh> policy_neigh(chunk_size,team_size,vector_length);
printf(">>> Check 2\n");
//printf(">>> Check 2\n");
Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this);
}
@ -213,6 +244,8 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
k_alocal.template modify<DeviceType>();
k_alocal.template sync<LMPHostType>();
printf(">>> k_alocal: %f\n", k_alocal.h_view(0,6));
}
/* ---------------------------------------------------------------------- */
@ -223,6 +256,102 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianG
{
const int ii = team.league_rank();
//printf("%d\n", ii);
if (ii >= chunk_size) return;
// extract grid index
int igrid = ii + chunk_offset;
// get a pointer to scratch memory
// This is used to cache whether or not an atom is within the cutoff.
// If it is, type_cache is assigned to the atom type.
// If it's not, it's assigned to -1.
const int tile_size = ntotal; //max_neighs; // number of elements per thread
const int team_rank = team.team_rank();
const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
// convert to grid indices
int iz = igrid/(xlen*ylen);
int i2 = igrid - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
iy += nylo;
ix += nxlo;
double xgrid[3];
// index ii already captures the proper grid point
//int igrid = iz * (nx * ny) + iy * nx + ix;
//printf("%d %d\n", ii, igrid);
// grid2x converts igrid to ix,iy,iz like we've done before
// multiply grid integers by grid spacing delx, dely, delz
//grid2x(igrid, xgrid);
xgrid[0] = ix * delx;
xgrid[1] = iy * dely;
xgrid[2] = iz * delz;
if (triclinic) {
// Do a conversion on `xgrid` here like we do in the CPU version.
// Can't do this:
// domainKK->lamda2x(xgrid, xgrid);
// Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
// Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
/*
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
*/
xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
xgrid[2] = h2*xgrid[2] + lo2;
}
const F_FLOAT xtmp = xgrid[0];
const F_FLOAT ytmp = xgrid[1];
const F_FLOAT ztmp = xgrid[2];
// Zeroing out the components, which are filled as a sum.
for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
d_alocal(igrid, icol) = 0.0;
}
// currently, all grid points are type 1
// not clear what a better choice would be
const int itype = 1;
int ielem = 0;
ielem = d_map[itype];
const double radi = d_radelem[ielem];
// Compute the number of neighbors, store rsq
int ninside = 0;
// Looping over ntotal for now.
for (int j = 0; j < ntotal; j++){
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
int jtype = type(j);
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
if (rsq < rnd_cutsq(jtype, jtype) ) {
//printf("%f %f\n", d_prefacelem(jtype-1), d_argfacelem(jtype-1));
int icol = size_local_cols_base + jtype - 1;
d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1));
}
}
//printf("%f\n", d_alocal(igrid, 6));
}
/* ----------------------------------------------------------------------

View File

@ -61,6 +61,9 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
//double adof, mvv2e, mv2d, boltz;
Kokkos::View<double*, DeviceType> d_radelem; // element radii
Kokkos::View<double*, DeviceType> d_sigmaelem;
Kokkos::View<double*, DeviceType> d_prefacelem;
Kokkos::View<double*, DeviceType> d_argfacelem;
Kokkos::View<int*, DeviceType> d_ninside; // ninside for all atoms in list
Kokkos::View<int*, DeviceType> d_map; // mapping from atom types to elements
@ -98,6 +101,15 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
DAT::tdual_float_2d k_alocal;
typename AT::t_float_2d d_alocal;
// triclinic vars
/*
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
*/
double h0, h1, h2, h3, h4, h5;
double lo0, lo1, lo2;
};
} // namespace LAMMPS_NS

View File

@ -41,8 +41,8 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char *
arg += nargbase;
narg -= nargbase;
double rfac0, rmin0;
int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
//double rfac0, rmin0;
//int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
int ntypes = atom->ntypes;
int nargmin = 4 + 2 * ntypes;
@ -91,11 +91,14 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char *
ComputeGaussianGridLocal::~ComputeGaussianGridLocal()
{
//printf(">>> ComputeGaussianGridLocal begin destruct copymode %d\n", copymode);
if (copymode) return;
memory->destroy(radelem);
memory->destroy(sigmaelem);
memory->destroy(prefacelem);
memory->destroy(argfacelem);
memory->destroy(cutsq);
//printf(">>> ComputeGaussianGridLocal end destruct\n");
}
/* ---------------------------------------------------------------------- */
@ -110,6 +113,8 @@ void ComputeGaussianGridLocal::init()
void ComputeGaussianGridLocal::compute_local()
{
printf(">>> compute_local CPU\n");
printf(">>> size_local_cols_base, size_local_cols: %d %d\n", size_local_cols_base, size_local_cols);
invoked_local = update->ntimestep;
// compute gaussian for each gridpoint
@ -146,7 +151,7 @@ void ComputeGaussianGridLocal::compute_local()
const double rsq = delx * delx + dely * dely + delz * delz;
int jtype = type[j];
if (rsq < cutsq[jtype][jtype]) {
int icol = size_local_cols_base + jtype - 1;
int icol = size_local_cols_base + jtype - 1;
alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]);
}
}

View File

@ -32,7 +32,7 @@ class ComputeGaussianGridLocal : public ComputeGridLocal {
void compute_local() override;
double memory_usage() override;
private:
protected:
int ncoeff;
double **cutsq;
double rcutfac; // global cut-off scale

View File

@ -61,13 +61,16 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) :
ComputeGridLocal::~ComputeGridLocal()
{
printf(">>> ComputeGridLocal begin destruct\n");
deallocate();
printf(">>> ComputeGridLocal end destruct\n");
}
/* ---------------------------------------------------------------------- */
void ComputeGridLocal::setup()
{
printf(">>> ComputeGridLocal setup\n");
deallocate();
set_grid_global();
set_grid_local();
@ -106,6 +109,7 @@ void ComputeGridLocal::grid2lamda(int ix, int iy, int iz, double *x)
void ComputeGridLocal::allocate()
{
printf(">>> ComputeGridLocal::allocate %d %d\n", size_local_rows, size_local_cols);
if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
gridlocal_allocated = 1;
memory->create(alocal, size_local_rows, size_local_cols, "compute/grid/local:alocal");
@ -119,10 +123,14 @@ void ComputeGridLocal::allocate()
void ComputeGridLocal::deallocate()
{
//printf(">>> ComputeGridLocal::deallocate begin gridlocal_allocated %d copymode %d\n", gridlocal_allocated, copymode);
if (copymode) return;
if (gridlocal_allocated) {
gridlocal_allocated = 0;
memory->destroy(alocal);
}
//printf(">>> ComputeGridLocal:: deallocate end\n");
array_local = nullptr;
}
@ -178,6 +186,8 @@ void ComputeGridLocal::set_grid_local()
// the 2 equality if tests ensure a consistent decision
// as to which proc owns it
//printf(">>> ComputeGridLocal set_grid_local\n");
double xfraclo, xfrachi, yfraclo, yfrachi, zfraclo, zfrachi;
if (comm->layout != Comm::LAYOUT_TILED) {