diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 5f50486e28..298d404117 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -68,7 +68,34 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H) + $(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 84a8495dfb..ab750aaadc 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1630,7 +1630,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, - const __global numtyp *restrict grid, + const __global numtyp2 *restrict grid, __global numtyp *restrict fdip_phi1, __global numtyp *restrict fdip_phi2, __global numtyp *restrict fdip_sum_phi, @@ -1648,12 +1648,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, if (ii -void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, +void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, const int nzlo_out, const int nzhi_out, @@ -660,7 +660,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _ngridx = nxhi_out - nxlo_out + 1; _num_grid_points = _ngridx * _ngridy * _ngridz; - int numel = _num_grid_points*2; + int numel = _num_grid_points; if (_cgrid_brick.cols() == 0) { _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); } else if (numel > _cgrid_brick.cols()) { @@ -688,11 +688,13 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { - _cgrid_brick[n] = host_grid_brick[iz][iy][ix][0]; - _cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1]; - n += 2; + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix][0]; + v.y = host_grid_brick[iz][iy][ix][1]; + _cgrid_brick[n] = v; + n++; } - _cgrid_brick.update_device(_num_grid_points*2, false); + _cgrid_brick.update_device(_num_grid_points, false); const int red_blocks = fphi_uind(); @@ -740,7 +742,7 @@ int BaseAmoebaT::fphi_uind() { // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi) +void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) { // TODO: grid brick[k][j][i] is a scalar UCL_H_Vec hdummy; @@ -750,11 +752,15 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { - _cgrid_brick[n] = host_grid_brick[iz][iy][ix]; + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix]; + v.y = (numtyp)0; + _cgrid_brick[n] = v; n++; } _cgrid_brick.update_device(_num_grid_points, false); + _felec = felec; const int red_blocks = fphi_mpole(); _fdip_sum_phi.update_host(_max_thetai_size*20); @@ -776,13 +782,14 @@ int BaseAmoebaT::fphi_mpole() { // Compute the block size and grid size to keep all cores busy const int BX=block_size(); + //printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_mpole.set_size(GX,BX); k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, - &_fdip_sum_phi, &_bsorder, &ainum, + &_fdip_sum_phi, &_bsorder, &ainum, &_felec, &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); time_pair.stop(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a5ee245623..f9a715808e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -173,8 +173,8 @@ class BaseAmoeba { double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void **fieldp_ptr); - /// Allocate/resize per-atom arrays before induce() - virtual void precompute_induce(const int inum_full, const int bsorder, + /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar + virtual void precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, const int nzlo_out, const int nzhi_out, @@ -185,7 +185,8 @@ class BaseAmoeba { void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); - virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi); + virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, + const double felec); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -256,7 +257,7 @@ class BaseAmoeba { int _bsorder; UCL_Vector _thetai1, _thetai2, _thetai3; UCL_Vector _igrid; - UCL_Vector _cgrid_brick; + UCL_Vector _cgrid_brick; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 1bd6bade3a..a75080bfca 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -193,15 +193,15 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double ** eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } -void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out) { - HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); + HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); } void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 396ff0b592..d0018bf588 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,17 +88,18 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); -void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out); +void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); -void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi); +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi, + const double felec); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -343,7 +344,7 @@ void PairAmoebaGPU::induce() // must be done before the first ufield0c // NOTE: this is for ic_kspace, and thetai[1-3] - amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, thetai3, igrid, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out, @@ -1382,11 +1383,11 @@ void PairAmoebaGPU::polar_kspace() // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill if (gpu_fphi_mpole_ready) { - amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, - thetai3, igrid, p_kspace->nzlo_out, - p_kspace->nzhi_out, p_kspace->nylo_out, - p_kspace->nyhi_out, p_kspace->nxlo_out, - p_kspace->nxhi_out); + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, + thetai1, thetai2, thetai3, igrid, + p_kspace->nzlo_out, p_kspace->nzhi_out, + p_kspace->nylo_out, p_kspace->nyhi_out, + p_kspace->nxlo_out, p_kspace->nxhi_out); } @@ -1461,10 +1462,15 @@ void PairAmoebaGPU::polar_kspace() if (!gpu_fphi_mpole_ready) { fphi_mpole(gridpost,fphi); - //printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); + + for (i = 0; i < nlocal; i++) { + for (k = 0; k < 20; k++) + fphi[i][k] *= felec; + } + } else { void* fphi_pinned = nullptr; - amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned); + amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); double *_fphi_ptr = (double *)fphi_pinned; for (int i = 0; i < nlocal; i++) { @@ -1474,13 +1480,8 @@ void PairAmoebaGPU::polar_kspace() idx += nlocal; } } - //printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); - } - for (i = 0; i < nlocal; i++) { - for (k = 0; k < 20; k++) - fphi[i][k] *= felec; - } + } // convert field from fractional to Cartesian diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 1151027993..4dbc998ee3 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -105,12 +105,12 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void hippo_gpu_update_fieldp(void **fieldp_ptr); -void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out); +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); @@ -475,7 +475,7 @@ void PairHippoGPU::induce() // allocate memory and make early host-device transfers // must be done before the first ufield0c - hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, thetai3, igrid, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out,