diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 4a26f7f98d..b0013f0b9b 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -//#include +#include #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -1630,14 +1630,19 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, __global numtyp *restrict fdip_phi2, __global numtyp *restrict fdip_sum_phi, const int bsorder, const int inum, - const int nyzgrid, const int nygrid, - const int t_per_atom) + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + const int ngridxy, const int ngridx) { - int tid, ii, offset, i, n_stride; - atom_info(t_per_atom,ii,tid,offset); + //int tid, ii, offset, i, n_stride; + //atom_info(t_per_atom,ii,tid,offset); + + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; if (ii void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, - double* host_grid_brick_start, int nzlo_out, - int nzhi_out, int nylo_out, int nyhi_out, + double* host_grid_brick_start, double**** host_grid_brick, + int nzlo_out, int nzhi_out, + int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out) { _bsorder = bsorder; @@ -599,7 +600,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, } UCL_H_Vec dview; - dview.alloc(inum_full*bsorder*4,*(this->ucl_device)); + dview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); // pack host data to device @@ -634,7 +635,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, ucl_copy(_thetai3,dview,false); UCL_H_Vec dview_int; - dview_int.alloc(inum_full*4, *(this->ucl_device)); + dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; dview_int[idx+0] = host_igrid[i][0]; @@ -643,6 +644,33 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, } ucl_copy(_igrid, dview_int, false); + + +} + +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, double ****host_grid_brick, + void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out, bool& first_iteration) +{ + // allocation/resize and transfers before the first iteration + + if (first_iteration) { + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, + igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); + if (first_iteration) first_iteration = false; + } + // update the cgrid_brick with data host _nzlo_out = nzlo_out; @@ -656,36 +684,27 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _ngridx = nxhi_out - nxlo_out + 1; _num_grid_points = _ngridx * _ngridy * _ngridz; - UCL_H_Vec dview_cgrid; - dview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); + UCL_H_Vec hview_cgrid; + hview_cgrid.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_WRITE); + int n = 0; + for (int iz = nzlo_out; iz <= nzhi_out; iz++) + for (int iy = nylo_out; iy <= nyhi_out; iy++) + for (int ix = nxlo_out; ix <= nxhi_out; ix++) { +/* + if (iz == nzlo_out && iy == nylo_out && ix == nxlo_out) { + printf("origin = %d %d %d: grid = %f %f %f\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1]); + } + if (iz == -2 && iy == 4 && ix == 8) printf("ixyz = %d %d %d: grid = %f %f %f; n = %d\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1], n); +*/ + hview_cgrid[n] = host_grid_brick[iz][iy][ix][0]; + hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; + n += 2; + } + //hview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY); - ucl_copy(_cgrid_brick,dview_cgrid,false); + ucl_copy(_cgrid_brick,hview_cgrid,false); -} -// --------------------------------------------------------------------------- -// fphi_uind = induced potential from grid -// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid -// --------------------------------------------------------------------------- - -template -void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double *host_grid_brick_start, void** host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration) -{ - // allocation/resize and transfers before the first iteration - - if (first_iteration) { - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - igrid, host_grid_brick_start, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); - if (first_iteration) first_iteration = false; - } - const int red_blocks = fphi_uind(); _fdip_phi1.update_host(_max_thetai_size*10); @@ -711,16 +730,16 @@ int BaseAmoebaT::fphi_uind() { // Compute the block size and grid size to keep all cores busy const int BX=block_size(); - int GX=static_cast(ceil(static_cast(ans->inum())/ - (BX/_threads_per_atom))); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); time_pair.start(); - int ngridyz = _ngridy * _ngridz; + int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); k_fphi_uind.run(&atom->x, &_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2, - &_fdip_sum_phi, &_bsorder, &ainum, &ngridyz, &_ngridy, - &_threads_per_atom); + &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nzhi_out, &_nylo_out, &_nyhi_out, + &_nxlo_out, &_nxhi_out, &ngridxy, &_ngridx); time_pair.stop(); return GX; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a001423812..c2c2a2d93d 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -153,8 +153,9 @@ class BaseAmoeba { virtual void precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double* grid_brick_start, int nzlo_out, - int nzhi_out, int nylo_out, int nyhi_out, + double *host_grid_brick_start, double ****host_grid_brick, + int nzlo_out, int nzhi_out, + int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out); /// Compute multipole real-space with device neighboring @@ -182,8 +183,8 @@ class BaseAmoeba { virtual void compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, void **host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, + double *host_grid_brick_start, double ****host_grid_brick, + void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out, bool& first_iteration); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index bf6db3472d..936cf8afbc 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -91,7 +91,7 @@ void amoeba_gpu_update_fieldp(void **fieldp_ptr); void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, void **host_fdip_phi1, + double *host_grid_brick_start, double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out, bool& first_iteration); @@ -121,7 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; // need to be true for precompute() gpu_udirect2b_ready = true; gpu_umutual1_ready = true; - gpu_fphi_uind_ready = false; + gpu_fphi_uind_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host @@ -1139,7 +1139,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3, - igrid, ic_kspace->grid_brick_start, + igrid, ic_kspace->grid_brick_start, grid, &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out, @@ -1150,8 +1150,10 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { int idx = 10 * i; - for (int m = 0; m < 10; m++) - fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + } + if (i == 0) printf("gpu fdip phi1 = %f %f %f\n", fdip_phi1[i][0], fdip_phi1[i][1], fdip_phi1[i][2]); } double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; @@ -1159,6 +1161,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 10 * i; for (int m = 0; m < 10; m++) fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; + if (i == 0) printf("gpu fdip phi2 = %f %f %f\n", fdip_phi2[i][0], fdip_phi2[i][1], fdip_phi2[i][2]); } double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; @@ -1166,6 +1169,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 20 * i; for (int m = 0; m < 20; m++) fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; + if (i == 0) printf("gpu fdip sum phi = %f %f %f\n", fdip_sum_phi[i][0], fdip_sum_phi[i][1], fdip_sum_phi[i][2]); } }