Initial draft of Kokkos acclerated PPPM routines for triclinic cells.

This commit is contained in:
Emily Kahl
2021-06-25 09:47:54 +10:00
parent c758d224e0
commit cfd9cf625d
2 changed files with 381 additions and 213 deletions

View File

@ -1,7 +1,6 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
https://lammps.sandia.gov/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
@ -31,6 +30,7 @@
#include "memory_kokkos.h"
#include "pair.h"
#include "remap_kokkos.h"
#include "kokkos_few.h"
#include <cmath>
@ -67,7 +67,7 @@ PPPMKokkos<DeviceType>::PPPMKokkos(LAMMPS *lmp) : PPPM(lmp)
pppmflag = 1;
group_group_enable = 0;
triclinic_support = 0;
triclinic_support = 1;
nfactors = 3;
//factors = new int[nfactors];
@ -455,80 +455,92 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup4, const int &n) const
template<class DeviceType>
void PPPMKokkos<DeviceType>::setup_triclinic()
{
// int i,j,k,n;
// double *prd;
//
// // volume-dependent factors
// // adjust z dimension for 2d slab PPPM
// // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
//
// prd = domain->prd;
//
// double xprd = prd[0];
// double yprd = prd[1];
// double zprd = prd[2];
// double zprd_slab = zprd*slab_volfactor;
// volume = xprd * yprd * zprd_slab;
//
// // use lamda (0-1) coordinates
//
// delxinv = nx_pppm;
// delyinv = ny_pppm;
// delzinv = nz_pppm;
// delvolinv = delxinv*delyinv*delzinv/volume;
//
// // d_fkx,d_fky,d_fkz for my FFT grid pts
//
// double per_i,per_j,per_k;
//
// n = 0;
// for (k = nzlo_fft; k <= nzhi_fft; k++) { // parallel_for
// per_k = k - nz_pppm*(2*k/nz_pppm);
// for (j = nylo_fft; j <= nyhi_fft; j++) {
// per_j = j - ny_pppm*(2*j/ny_pppm);
// for (i = nxlo_fft; i <= nxhi_fft; i++) {
// per_i = i - nx_pppm*(2*i/nx_pppm);
//
// double unitk_lamda[3];
// unitk_lamda[0] = 2.0*MY_PI*per_i;
// unitk_lamda[1] = 2.0*MY_PI*per_j;
// unitk_lamda[2] = 2.0*MY_PI*per_k;
// x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
// d_fkx[n] = unitk_lamda[0];
// d_fky[n] = unitk_lamda[1];
// d_fkz[n] = unitk_lamda[2];
// n++;
// }
// }
// }
//
// // virial coefficients
//
// double sqk,vterm;
//
// for (n = 0; n < nfft; n++) { // parallel_for
// sqk = d_fkx[n]*d_fkx[n] + d_fky[n]*d_fky[n] + d_fkz[n]*d_fkz[n];
// if (sqk == 0.0) {
// d_vg(n,0) = 0.0;
// d_vg(n,1) = 0.0;
// d_vg(n,2) = 0.0;
// d_vg(n,3) = 0.0;
// d_vg(n,4) = 0.0;
// d_vg(n,5) = 0.0;
// } else {
// vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
// d_vg(n,0) = 1.0 + vterm*d_fkx[n]*d_fkx[n];
// d_vg(n,1) = 1.0 + vterm*d_fky[n]*d_fky[n];
// d_vg(n,2) = 1.0 + vterm*d_fkz[n]*d_fkz[n];
// d_vg(n,3) = vterm*d_fkx[n]*d_fky[n];
// d_vg(n,4) = vterm*d_fkx[n]*d_fkz[n];
// d_vg(n,5) = vterm*d_fky[n]*d_fkz[n];
// }
// }
//
// compute_gf_ik_triclinic();
int i,j,k,n;
double *prd;
// volume-dependent factors
// adjust z dimension for 2d slab PPPM
// z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
prd = domain->prd;
// Update simulation box parameters
h = Few<double, 6>(domain->h);
h_inv = Few<double, 6>(domain->h_inv);
double xprd = prd[0];
double yprd = prd[1];
double zprd = prd[2];
double zprd_slab = zprd*slab_volfactor;
volume = xprd * yprd * zprd_slab;
// use lamda (0-1) coordinates
delxinv = nx_pppm;
delyinv = ny_pppm;
delzinv = nz_pppm;
delvolinv = delxinv*delyinv*delzinv/volume;
// merge three outer loops into one for better threading
numz_fft = nzhi_fft-nzlo_fft + 1;
numy_fft = nyhi_fft-nylo_fft + 1;
numx_fft = nxhi_fft-nxlo_fft + 1;
const int inum_fft = numx_fft*numy_fft*numz_fft;
copymode = 1;
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup_triclinic1>(0,inum_fft),*this);
copymode = 0;
// virial coefficients
copymode = 1;
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup_triclinic2>(0,nfft),*this);
copymode = 0;
compute_gf_ik_triclinic();
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup_triclinic1, const int &n) const
{
const int k = n/(numy_fft*numx_fft);
const int j = (n - k*numy_fft*numx_fft) / numx_fft;
const int i = n - k*numy_fft*numx_fft - j*numx_fft;
double per_k = k - nz_pppm*(2*k/nz_pppm);
double per_j = j - ny_pppm*(2*j/ny_pppm);
double per_i = i - nx_pppm*(2*i/nx_pppm);
double unitk_lamda[3];
unitk_lamda[0] = 2.0*MY_PI*per_i;
unitk_lamda[1] = 2.0*MY_PI*per_j;
unitk_lamda[2] = 2.0*MY_PI*per_k;
x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
d_fkx[n] = unitk_lamda[0];
d_fky[n] = unitk_lamda[1];
d_fkz[n] = unitk_lamda[2];
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup_triclinic2, const int &n) const
{
const double sqk = d_fkx[n]*d_fkx[n] + d_fky[n]*d_fky[n] + d_fkz[n]*d_fkz[n];
if (sqk == 0.0) {
d_vg(n,0) = 0.0;
d_vg(n,1) = 0.0;
d_vg(n,2) = 0.0;
d_vg(n,3) = 0.0;
d_vg(n,4) = 0.0;
d_vg(n,5) = 0.0;
} else {
const double vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
d_vg(n,0) = 1.0 + vterm*d_fkx[n]*d_fkx[n];
d_vg(n,1) = 1.0 + vterm*d_fky[n]*d_fky[n];
d_vg(n,2) = 1.0 + vterm*d_fkz[n]*d_fkz[n];
d_vg(n,3) = vterm*d_fkx[n]*d_fky[n];
d_vg(n,4) = vterm*d_fkx[n]*d_fkz[n];
d_vg(n,5) = vterm*d_fky[n]*d_fkz[n];
}
}
/* ----------------------------------------------------------------------
reset local grid arrays and communication stencils
called by fix balance b/c it changed sizes of processor sub-domains
@ -1004,7 +1016,7 @@ void PPPMKokkos<DeviceType>::set_grid_global()
tmp[0] = nx_pppm;
tmp[1] = ny_pppm;
tmp[2] = nz_pppm;
x2lamdaT(&tmp[0],&tmp[0]);
KSpace::x2lamdaT(&tmp[0],&tmp[0]);
h_x = 1.0/tmp[0];
h_y = 1.0/tmp[1];
h_z = 1.0/tmp[2];
@ -1311,6 +1323,10 @@ void PPPMKokkos<DeviceType>::compute_gf_ik_triclinic()
nby = static_cast<int> (tmp[1]);
nbz = static_cast<int> (tmp[2]);
// Update the local copy of the domain box tilt
h = Few<double, 6>(domain->h);
h_inv = Few<double, 6>(domain->h_inv);
twoorder = 2*order;
copymode = 1;
@ -1320,71 +1336,71 @@ void PPPMKokkos<DeviceType>::compute_gf_ik_triclinic()
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PPPMKokkos<DeviceType>::operator()(TagPPPM_compute_gf_ik_triclinic, const int &/*m*/) const
void PPPMKokkos<DeviceType>::operator()(TagPPPM_compute_gf_ik_triclinic, const int &m) const
{
//int n = (m - nzlo_fft)*(nyhi_fft+1 - nylo_fft)*(nxhi_fft+1 - nxlo_fft);
//
//const int mper = m - nz_pppm*(2*m/nz_pppm);
//const double snz = square(sin(MY_PI*mper/nz_pppm));
//
//for (int l = nylo_fft; l <= nyhi_fft; l++) {
// const int lper = l - ny_pppm*(2*l/ny_pppm);
// const double sny = square(sin(MY_PI*lper/ny_pppm));
//
// for (int k = nxlo_fft; k <= nxhi_fft; k++) {
// const int kper = k - nx_pppm*(2*k/nx_pppm);
// const double snx = square(sin(MY_PI*kper/nx_pppm));
//
// double unitk_lamda[3];
// unitk_lamda[0] = 2.0*MY_PI*kper;
// unitk_lamda[1] = 2.0*MY_PI*lper;
// unitk_lamda[2] = 2.0*MY_PI*mper;
// x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
//
// const double sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]);
//
// if (sqk != 0.0) {
// const double numerator = 12.5663706/sqk;
// const double denominator = gf_denom(snx,sny,snz);
// double sum1 = 0.0;
//
// for (int nx = -nbx; nx <= nbx; nx++) {
// const double argx = MY_PI*kper/nx_pppm + MY_PI*nx;
// const double wx = powsinxx(argx,twoorder);
//
// for (int ny = -nby; ny <= nby; ny++) {
// const double argy = MY_PI*lper/ny_pppm + MY_PI*ny;
// const double wy = powsinxx(argy,twoorder);
//
// for (int nz = -nbz; nz <= nbz; nz++) {
// const double argz = MY_PI*mper/nz_pppm + MY_PI*nz;
// const double wz = powsinxx(argz,twoorder);
//
// double b[3];
// b[0] = 2.0*MY_PI*nx_pppm*nx;
// b[1] = 2.0*MY_PI*ny_pppm*ny;
// b[2] = 2.0*MY_PI*nz_pppm*nz;
// x2lamdaT(&b[0],&b[0]);
//
// const double qx = unitk_lamda[0]+b[0];
// const double sx = exp(-0.25*square(qx/g_ewald));
//
// const double qy = unitk_lamda[1]+b[1];
// const double sy = exp(-0.25*square(qy/g_ewald));
//
// const double qz = unitk_lamda[2]+b[2];
// const double sz = exp(-0.25*square(qz/g_ewald));
//
// const double dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz;
// const double dot2 = qx*qx+qy*qy+qz*qz;
// sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
// }
// }
// }
// d_greensfn[n++] = numerator*sum1/denominator;
// } else d_greensfn[n++] = 0.0;
// }
//}
int n = (m - nzlo_fft)*(nyhi_fft+1 - nylo_fft)*(nxhi_fft+1 - nxlo_fft);
const int mper = m - nz_pppm*(2*m/nz_pppm);
const double snz = square(sin(MY_PI*mper/nz_pppm));
for (int l = nylo_fft; l <= nyhi_fft; l++) {
const int lper = l - ny_pppm*(2*l/ny_pppm);
const double sny = square(sin(MY_PI*lper/ny_pppm));
for (int k = nxlo_fft; k <= nxhi_fft; k++) {
const int kper = k - nx_pppm*(2*k/nx_pppm);
const double snx = square(sin(MY_PI*kper/nx_pppm));
double unitk_lamda[3];
unitk_lamda[0] = 2.0*MY_PI*kper;
unitk_lamda[1] = 2.0*MY_PI*lper;
unitk_lamda[2] = 2.0*MY_PI*mper;
x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
const double sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]);
if (sqk != 0.0) {
const double numerator = 12.5663706/sqk;
const double denominator = gf_denom(snx,sny,snz);
double sum1 = 0.0;
for (int nx = -nbx; nx <= nbx; nx++) {
const double argx = MY_PI*kper/nx_pppm + MY_PI*nx;
const double wx = powsinxx(argx,twoorder);
for (int ny = -nby; ny <= nby; ny++) {
const double argy = MY_PI*lper/ny_pppm + MY_PI*ny;
const double wy = powsinxx(argy,twoorder);
for (int nz = -nbz; nz <= nbz; nz++) {
const double argz = MY_PI*mper/nz_pppm + MY_PI*nz;
const double wz = powsinxx(argz,twoorder);
double b[3];
b[0] = 2.0*MY_PI*nx_pppm*nx;
b[1] = 2.0*MY_PI*ny_pppm*ny;
b[2] = 2.0*MY_PI*nz_pppm*nz;
x2lamdaT(&b[0],&b[0]);
const double qx = unitk_lamda[0]+b[0];
const double sx = exp(-0.25*square(qx/g_ewald));
const double qy = unitk_lamda[1]+b[1];
const double sy = exp(-0.25*square(qy/g_ewald));
const double qz = unitk_lamda[2]+b[2];
const double sz = exp(-0.25*square(qz/g_ewald));
const double dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz;
const double dot2 = qx*qx+qy*qy+qz*qz;
sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
}
}
}
d_greensfn[n++] = numerator*sum1/denominator;
} else d_greensfn[n++] = 0.0;
}
}
}
/* ----------------------------------------------------------------------
@ -1867,107 +1883,224 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik10, const int &ii) con
template<class DeviceType>
void PPPMKokkos<DeviceType>::poisson_ik_triclinic()
{
// int i,j,k,n;
//
// // compute gradients of V(r) in each of 3 dims by transforming ik*V(k)
// // FFT leaves data in 3d brick decomposition
// // copy it into inner portion of vdx,vdy,vdz arrays
//
// // x direction gradient
//
// n = 0;
// for (i = 0; i < nfft; i++) { // parallel_for1
// d_work2[n] = -d_fkx[i]*d_work1[n+1];
// d_work2[n+1] = d_fkx[i]*d_work1[n];
// n += 2;
// }
//
// fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
//
// n = 0;
// for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for2
//
//
// // y direction gradient
//
// n = 0;
// for (i = 0; i < nfft; i++) { // parallel_for3
// d_work2[n] = -d_fky[i]*d_work1[n+1];
// d_work2[n+1] = d_fky[i]*d_work1[n];
// n += 2;
// }
//
// fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
//
// n = 0;
// for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for4
// for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
// for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
// d_vdy_brick(k,j,i) = d_work2[n];
// n += 2;
// }
//
// // z direction gradient
//
// n = 0;
// for (i = 0; i < nfft; i++) { // parallel_for5
// d_work2[n] = -d_fkz[i]*d_work1[n+1];
// d_work2[n+1] = d_fkz[i]*d_work1[n];
// n += 2;
// }
//
// fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
//
// n = 0;
// for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for6
//
/**************************************************
int i,j,k,n;
// compute gradients of V(r) in each of 3 dims by transforming ik*V(k)
// FFT leaves data in 3d brick decomposition
// copy it into inner portion of vdx,vdy,vdz arrays
// x direction gradient
n = 0;
for (i = 0; i < nfft; i++) { // parallel_for1
d_work2[n] = -d_fkx[i]*d_work1[n+1];
d_work2[n+1] = d_fkx[i]*d_work1[n];
n += 2;
}
fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
n = 0;
for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for2
// y direction gradient
n = 0;
for (i = 0; i < nfft; i++) { // parallel_for3
d_work2[n] = -d_fky[i]*d_work1[n+1];
d_work2[n+1] = d_fky[i]*d_work1[n];
n += 2;
}
fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
n = 0;
for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for4
for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
d_vdy_brick(k,j,i) = d_work2[n];
n += 2;
}
// z direction gradient
n = 0;
for (i = 0; i < nfft; i++) { // parallel_for5
d_work2[n] = -d_fkz[i]*d_work1[n+1];
d_work2[n+1] = d_fkz[i]*d_work1[n];
n += 2;
}
fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
n = 0;
for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for6
******************************/
int i,j,k,n;
// compute gradients of V(r) in each of 3 dims by transforming ik*V(k)
// FFT leaves data in 3d brick decomposition
// copy it into inner portion of vdx,vdy,vdz arrays
// x direction gradient
//n = 0;
//for (i = 0; i < nfft; i++) {
// d_work2[n] = -d_fkx[i]*d_work1[n+1];
// d_work2[n+1] = d_fkx[i]*d_work1[n];
// n += 2;
//}
// merge three outer loops into one for better threading
numz_fft = nzhi_fft-nzlo_fft + 1;
numy_fft = nyhi_fft-nylo_fft + 1;
numx_fft = nxhi_fft-nxlo_fft + 1;
const int inum_fft = numz_fft*numy_fft*numx_fft;
numz_inout = (nzhi_in-nzlo_out)-(nzlo_in-nzlo_out) + 1;
numy_inout = (nyhi_in-nylo_out)-(nylo_in-nylo_out) + 1;
numx_inout = (nxhi_in-nxlo_out)-(nxlo_in-nxlo_out) + 1;
const int inum_inout = numz_inout*numy_inout*numx_inout;
copymode = 1;
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic1>(0,nfft),*this);
copymode = 0;
fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
//n = 0;
//for (k = nzlo_in; k <= nzhi_in; k++)
// for (j = nylo_in; j <= nyhi_in; j++)
// for (i = nxlo_in; i <= nxhi_in; i++) {
// d_vdx_brick(k,j,i) = d_work2[n];
// n += 2;
// }
copymode = 1;
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic2>(0,inum_inout),*this);
copymode = 0;
// y direction gradient
//n = 0;
//for (i = 0; i < nfft; i++) {
// d_work2[n] = -d_fky[i]*d_work1[n+1];
// d_work2[n+1] = d_fky[i]*d_work1[n];
// n += 2;
//}
copymode = 1;
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic3>(0,nfft),*this);
copymode = 0;
fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
//n = 0;
//for (k = nzlo_in; k <= nzhi_in; k++)
// for (j = nylo_in; j <= nyhi_in; j++)
// for (i = nxlo_in; i <= nxhi_in; i++) {
// d_vdy_brick(k,j,i) = d_work2[n];
// n += 2;
// }
copymode = 1;
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic4>(0,inum_inout),*this);
copymode = 0;
// z direction gradient
//n = 0;
//for (i = 0; i < nfft; i++) {
// d_work2[n] = -d_fkz[i]*d_work1[n+1];
// d_work2[n+1] = d_fkz[i]*d_work1[n];
// n += 2;
//}
copymode = 1;
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic5>(0,nfft),*this);
copymode = 0;
fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
//n = 0;
//for (k = nzlo_in; k <= nzhi_in; k++)
// for (j = nylo_in; j <= nyhi_in; j++)
// for (i = nxlo_in; i <= nxhi_in; i++) {
// d_vdz_brick(k,j,i) = d_work2[n];
// n += 2;
// }
copymode = 1;
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic6>(0,inum_inout),*this);
copymode = 0;
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic1, const int &/*k*/) const
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic1, const int &ii) const
{
d_work2[2*ii] = -d_fkx[ii]*d_work1[2*ii+1];
d_work2[2*ii+1] = d_fkx[ii]*d_work1[2*ii];
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic2, const int &/*k*/) const
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic2, const int &ii) const
{
// for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
// for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
// d_vdx_brick(k,j,i) = d_work2[n];
// n += 2;
// }
const int n = ii*2;
int k = ii/(numy_inout*numx_inout);
int j = (ii - k*numy_inout*numx_inout) / numx_inout;
int i = ii - k*numy_inout*numx_inout - j*numx_inout;
k += nzlo_in-nzlo_out;
j += nylo_in-nylo_out;
i += nxlo_in-nxlo_out;
d_vdx_brick(k,j,i) = d_work2[n];
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic3, const int &/*k*/) const
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic3, const int &ii) const
{
// int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
d_work2[2*ii] = -d_fky[ii]*d_work1[2*ii+1];
d_work2[2*ii+1] = d_fky[ii]*d_work1[2*ii];
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic4, const int &/*k*/) const
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic4, const int &ii) const
{
// int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
//
const int n = ii*2;
int k = ii/(numy_inout*numx_inout);
int j = (ii - k*numy_inout*numx_inout) / numx_inout;
int i = ii - k*numy_inout*numx_inout - j*numx_inout;
k += nzlo_in-nzlo_out;
j += nylo_in-nylo_out;
i += nxlo_in-nxlo_out;
d_vdy_brick(k,j,i) = d_work2[n];
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic5, const int &/*k*/) const
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic5, const int &ii) const
{
// int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
//
d_work2[2*ii] = -d_fkz[ii]*d_work1[2*ii+1];
d_work2[2*ii+1] = d_fkz[ii]*d_work1[2*ii];
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic6, const int &/*k*/) const
void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic6, const int &ii) const
{
// int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
//
@ -1976,6 +2109,14 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic6, const int
// d_vdz_brick(k,j,i) = d_work2[n];
// n += 2;
// }
const int n = ii*2;
int k = ii/(numy_inout*numx_inout);
int j = (ii - k*numy_inout*numx_inout) / numx_inout;
int i = ii - k*numy_inout*numx_inout - j*numx_inout;
k += nzlo_in-nzlo_out;
j += nylo_in-nylo_out;
i += nxlo_in-nxlo_out;
d_vdz_brick(k,j,i) = d_work2[n];
}
/* ----------------------------------------------------------------------

View File

@ -1,7 +1,6 @@
// clang-format off
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
https://lammps.sandia.gov/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
@ -13,11 +12,11 @@
------------------------------------------------------------------------- */
#ifdef KSPACE_CLASS
// clang-format off
KSpaceStyle(pppm/kk,PPPMKokkos<LMPDeviceType>);
KSpaceStyle(pppm/kk/device,PPPMKokkos<LMPDeviceType>);
KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>);
// clang-format on
KSpaceStyle(pppm/kk,PPPMKokkos<LMPDeviceType>)
KSpaceStyle(pppm/kk/device,PPPMKokkos<LMPDeviceType>)
KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
#else
#ifndef LMP_PPPM_KOKKOS_H
@ -29,6 +28,7 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>);
#include "kokkos_base_fft.h"
#include "fftdata_kokkos.h"
#include "kokkos_type.h"
#include "kokkos_few.h"
// fix up FFT defines for KOKKOS with CUDA
@ -55,6 +55,8 @@ struct TagPPPM_setup1{};
struct TagPPPM_setup2{};
struct TagPPPM_setup3{};
struct TagPPPM_setup4{};
struct TagPPPM_setup_triclinic1{};
struct TagPPPM_setup_triclinic2{};
struct TagPPPM_compute_gf_ik{};
struct TagPPPM_compute_gf_ik_triclinic{};
struct TagPPPM_self1{};
@ -138,6 +140,12 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
KOKKOS_INLINE_FUNCTION
void operator()(TagPPPM_setup4, const int&) const;
KOKKOS_INLINE_FUNCTION
void operator()(TagPPPM_setup_triclinic1, const int&) const;
KOKKOS_INLINE_FUNCTION
void operator()(TagPPPM_setup_triclinic2, const int&) const;
KOKKOS_INLINE_FUNCTION
void operator()(TagPPPM_compute_gf_ik, const int&) const;
@ -309,6 +317,25 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
int numx_out,numy_out,numz_out;
int ix,iy,nlocal;
// Local copies of the domain box tilt etc.
// TODO: Will need to put in a less
// hacky solution later, since this needs to be manually updated
Few<double,6> h, h_inv;
KOKKOS_INLINE_FUNCTION
void x2lamdaT(double* v, double* lamda) const
{
double lamda_tmp[3];
lamda_tmp[0] = h_inv[0]*v[0];
lamda_tmp[1] = h_inv[5]*v[0] + h_inv[1]*v[1];
lamda_tmp[2] = h_inv[4]*v[0] + h_inv[3]*v[1] + h_inv[2]*v[2];
lamda[0] = lamda_tmp[0];
lamda[1] = lamda_tmp[1];
lamda[2] = lamda_tmp[2];
}
int nx,ny,nz;
typename AT::t_int_1d_um d_list_index;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf;