From b2d6df5bfbe44b7092ab4588539113b94cd34023 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 25 Aug 2022 23:18:13 -0500 Subject: [PATCH] Re-arranged some for loops in umutual1 to improve cache-friendly memory access; made placeholder for grid_uind on the GPU lib, maybe FFT is not that heavy to be put on the device. --- lib/gpu/lal_amoeba_ext.cpp | 5 +++ src/GPU/pair_amoeba_gpu.cpp | 87 ++++++++++++++++++++++++++++++------- 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 7d9d836b29..6989a5e6f6 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,6 +162,11 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } +void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, double ***grid) { +} + void amoeba_setup_fft(const int numel, const int element_type) { AMOEBAMF.setup_fft(numel, element_type); } diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 29db1b4c1b..cd3c01cde3 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,6 +88,10 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); +void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, + double** host_thetai1, double** host_thetai2, + double** host_thetai3, double ***grid); + void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -869,7 +873,7 @@ void PairAmoebaGPU::udirect2b_cpu() void PairAmoebaGPU::ufield0c(double **field, double **fieldp) { - int i,j; + //int i,j; double term; double time0,time1,time2; @@ -879,13 +883,18 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) int nlocal = atom->nlocal; int nall = nlocal + atom->nghost; - for (i = 0; i < nall; i++) { - for (j = 0; j < 3; j++) { + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + +/* + for (int i = 0; i < nall; i++) { + for (int j = 0; j < 3; j++) { field[i][j] = 0.0; fieldp[i][j] = 0.0; } } - +*/ + // get the real space portion of the mutual field first MPI_Barrier(world); @@ -902,13 +911,24 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) // add the self-energy portion of the mutual field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } +/* for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { field[i][j] += term*uind[i][j]; fieldp[i][j] += term*uinp[i][j]; } } - +*/ // accumulate the field and fieldp values from the real space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) @@ -947,7 +967,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) void PairAmoebaGPU::umutual1(double **field, double **fieldp) { - int i,j,k,m,n; + int m,n; int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; double term; double a[3][3]; // indices not flipped vs Fortran @@ -958,7 +978,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // convert Cartesian dipoles to fractional coordinates - for (j = 0; j < 3; j++) { + for (int j = 0; j < 3; j++) { a[0][j] = nfft1 * recip[0][j]; a[1][j] = nfft2 * recip[1][j]; a[2][j] = nfft3 * recip[2][j]; @@ -966,13 +986,25 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) int nlocal = atom->nlocal; + for (int i = 0; i < nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } +/* for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; } } - +*/ // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); @@ -1000,9 +1032,9 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // use qfac values stored in udirect1() m = n = 0; - for (k = nzlo; k <= nzhi; k++) { - for (j = nylo; j <= nyhi; j++) { - for (i = nxlo; i <= nxhi; i++) { + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { term = qfac[m++]; gridfft[n] *= term; gridfft[n+1] *= term; @@ -1023,8 +1055,8 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 10; j++) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { fopt[i][optlevel][j] = fdip_phi1[i][j]; foptp[i][optlevel][j] = fdip_phi2[i][j]; } @@ -1033,13 +1065,37 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // convert the dipole fields from fractional to Cartesian - for (i = 0; i < 3; i++) { + for (int i = 0; i < 3; i++) { a[0][i] = nfft1 * recip[0][i]; a[1][i] = nfft2 * recip[1][i]; a[2][i] = nfft3 * recip[2][i]; } - for (i = 0; i < nlocal; i++) { + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } +/* + for (int i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; @@ -1056,6 +1112,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fieldp[i][j] -= dipfield2[i][j]; } } +*/ } /* ----------------------------------------------------------------------