Updated fphi_mpole, renamed precompute_induce to precompute_kspace

This commit is contained in:
Trung Nguyen
2022-09-28 15:08:18 -05:00
parent 166701f13a
commit e6d2582642
8 changed files with 129 additions and 101 deletions

View File

@ -68,7 +68,34 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
# host code compilation # host code compilation
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) $(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H)
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR) $(CUDR) -o $@ -c $< -I$(OBJ_DIR)
#ifdef CUDPP_OPT #ifdef CUDPP_OPT

View File

@ -1630,7 +1630,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai2,
const __global numtyp4 *restrict thetai3, const __global numtyp4 *restrict thetai3,
const __global int *restrict igrid, const __global int *restrict igrid,
const __global numtyp *restrict grid, const __global numtyp2 *restrict grid,
__global numtyp *restrict fdip_phi1, __global numtyp *restrict fdip_phi1,
__global numtyp *restrict fdip_phi2, __global numtyp *restrict fdip_phi2,
__global numtyp *restrict fdip_sum_phi, __global numtyp *restrict fdip_sum_phi,
@ -1648,12 +1648,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
if (ii<inum) { if (ii<inum) {
int nlpts = (bsorder-1) / 2; const int nlpts = (bsorder-1) / 2;
int istart = fast_mul(ii,4); int istart = fast_mul(ii,4);
int igridx = igrid[istart]; const int igridx = igrid[istart];
int igridy = igrid[istart+1]; const int igridy = igrid[istart+1];
int igridz = igrid[istart+2]; const int igridz = igrid[istart+2];
// now istart is used to index thetai1, thetai2 and thetai3 // now istart is used to index thetai1, thetai2 and thetai3
istart = fast_mul(ii,bsorder); istart = fast_mul(ii,bsorder);
@ -1701,18 +1701,13 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
int k = (igridz - nzlo_out) - nlpts; int k = (igridz - nzlo_out) - nlpts;
for (int kb = 0; kb < bsorder; kb++) { for (int kb = 0; kb < bsorder; kb++) {
/* const int mz = fast_mul(k, ngridxy);
v0 = thetai3[m][kb][0]; const int i3 = istart + kb;
v1 = thetai3[m][kb][1]; const numtyp4 tha3 = thetai3[i3];
v2 = thetai3[m][kb][2]; const numtyp v0 = tha3.x; // thetai3[m][kb][0];
v3 = thetai3[m][kb][3]; const numtyp v1 = tha3.y; // thetai3[m][kb][1];
*/ const numtyp v2 = tha3.z; // thetai3[m][kb][2];
int i3 = istart + kb; const numtyp v3 = tha3.w; // thetai3[m][kb][3];
numtyp4 tha3 = thetai3[i3];
numtyp v0 = tha3.x;
numtyp v1 = tha3.y;
numtyp v2 = tha3.z;
numtyp v3 = tha3.w;
numtyp tu00_1 = (numtyp)0.0; numtyp tu00_1 = (numtyp)0.0;
numtyp tu01_1 = (numtyp)0.0; numtyp tu01_1 = (numtyp)0.0;
numtyp tu10_1 = (numtyp)0.0; numtyp tu10_1 = (numtyp)0.0;
@ -1738,18 +1733,13 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
int j = (igridy - nylo_out) - nlpts; int j = (igridy - nylo_out) - nlpts;
for (int jb = 0; jb < bsorder; jb++) { for (int jb = 0; jb < bsorder; jb++) {
/* const int my = mz + fast_mul(j, ngridx);
u0 = thetai2[m][jb][0]; const int i2 = istart + jb;
u1 = thetai2[m][jb][1]; const numtyp4 tha2 = thetai2[i2];
u2 = thetai2[m][jb][2]; const numtyp u0 = tha2.x; // thetai2[m][jb][0];
u3 = thetai2[m][jb][3]; const numtyp u1 = tha2.y; // thetai2[m][jb][1];
*/ const numtyp u2 = tha2.z; // thetai2[m][jb][2];
int i2 = istart + jb; const numtyp u3 = tha2.w; // thetai2[m][jb][3];
numtyp4 tha2 = thetai2[i2];
numtyp u0 = tha2.x;
numtyp u1 = tha2.y;
numtyp u2 = tha2.z;
numtyp u3 = tha2.w;
numtyp t0_1 = (numtyp)0.0; numtyp t0_1 = (numtyp)0.0;
numtyp t1_1 = (numtyp)0.0; numtyp t1_1 = (numtyp)0.0;
numtyp t2_1 = (numtyp)0.0; numtyp t2_1 = (numtyp)0.0;
@ -1771,22 +1761,25 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
t2_2 += tq_2*thetai1[m][ib][2]; t2_2 += tq_2*thetai1[m][ib][2];
t3 += (tq_1+tq_2)*thetai1[m][ib][3]; t3 += (tq_1+tq_2)*thetai1[m][ib][3];
*/ */
int i1 = istart + ib; const int i1 = istart + ib;
numtyp4 tha1 = thetai1[i1]; const numtyp4 tha1 = thetai1[i1];
numtyp w0 = tha1.x; /*
numtyp w1 = tha1.y; const numtyp w0 = tha1.x;
numtyp w2 = tha1.z; const numtyp w1 = tha1.y;
numtyp w3 = tha1.w; const numtyp w2 = tha1.z;
int gidx = 2*(k*ngridxy + j*ngridx + i); const numtyp w3 = tha1.w;
numtyp tq_1 = grid[gidx]; */
numtyp tq_2 = grid[gidx+1]; const int gidx = my + i; // k*ngridxy + j*ngridx + i;
t0_1 += tq_1*w0; const numtyp2 tq = grid[gidx];
t1_1 += tq_1*w1; const numtyp tq_1 = tq.x; //grid[gidx];
t2_1 += tq_1*w2; const numtyp tq_2 = tq.y; //grid[gidx+1];
t0_2 += tq_2*w0; t0_1 += tq_1*tha1.x; // w0
t1_2 += tq_2*w1; t1_1 += tq_1*tha1.y; // w1
t2_2 += tq_2*w2; t2_1 += tq_1*tha1.z; // w2
t3 += (tq_1+tq_2)*w3; t0_2 += tq_2*tha1.x; // w0
t1_2 += tq_2*tha1.y; // w1
t2_2 += tq_2*tha1.z; // w2
t3 += (tq_1+tq_2)*tha1.w; // w3
i++; i++;
} }
@ -1933,9 +1926,9 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai2,
const __global numtyp4 *restrict thetai3, const __global numtyp4 *restrict thetai3,
const __global int *restrict igrid, const __global int *restrict igrid,
const __global numtyp *restrict grid, const __global numtyp2 *restrict grid,
__global numtyp *restrict fphi, __global numtyp *restrict fphi,
const int bsorder, const int inum, const int bsorder, const int inum, const numtyp felec,
const int nzlo_out, const int nylo_out, const int nzlo_out, const int nylo_out,
const int nxlo_out, const int ngridxy, const int nxlo_out, const int ngridxy,
const int ngridx) const int ngridx)
@ -2027,7 +2020,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
int i1 = istart + ib; int i1 = istart + ib;
numtyp4 tha1 = thetai1[i1]; numtyp4 tha1 = thetai1[i1];
int gidx = k*ngridxy + j*ngridx + i; int gidx = k*ngridxy + j*ngridx + i;
numtyp tq = grid[gidx]; numtyp tq = grid[gidx].x;
t0 += tq*tha1.x; t0 += tq*tha1.x;
t1 += tq*tha1.y; t1 += tq*tha1.y;
t2 += tq*tha1.z; t2 += tq*tha1.z;
@ -2095,7 +2088,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
int idx = ii; int idx = ii;
for (int m = 0; m < 20; m++) { for (int m = 0; m < 20; m++) {
fphi[idx] = buf[m]; fphi[idx] = felec * buf[m];
idx += inum; idx += inum;
} }
} }

View File

@ -162,15 +162,14 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
} }
void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2, double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid, double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out, const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out, const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out) { const int nxlo_out, const int nxhi_out) {
AMOEBAMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid,
host_thetai3, igrid, nzlo_out, nzhi_out, nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out);
nylo_out, nyhi_out, nxlo_out, nxhi_out);
} }
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
@ -179,8 +178,8 @@ void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
host_fdip_phi2, host_fdip_sum_phi); host_fdip_phi2, host_fdip_sum_phi);
} }
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi) { void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) {
AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi); AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec);
} }
void amoeba_setup_fft(const int numel, const int element_type) { void amoeba_setup_fft(const int numel, const int element_type) {

View File

@ -566,7 +566,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2, double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** host_igrid, double ***host_thetai3, int** host_igrid,
const int nzlo_out, const int nzhi_out, const int nzlo_out, const int nzhi_out,
@ -660,7 +660,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
_ngridx = nxhi_out - nxlo_out + 1; _ngridx = nxhi_out - nxlo_out + 1;
_num_grid_points = _ngridx * _ngridy * _ngridz; _num_grid_points = _ngridx * _ngridy * _ngridz;
int numel = _num_grid_points*2; int numel = _num_grid_points;
if (_cgrid_brick.cols() == 0) { if (_cgrid_brick.cols() == 0) {
_cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
} else if (numel > _cgrid_brick.cols()) { } else if (numel > _cgrid_brick.cols()) {
@ -688,11 +688,13 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
for (int iy = _nylo_out; iy <= _nyhi_out; iy++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
_cgrid_brick[n] = host_grid_brick[iz][iy][ix][0]; numtyp2 v;
_cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1]; v.x = host_grid_brick[iz][iy][ix][0];
n += 2; v.y = host_grid_brick[iz][iy][ix][1];
_cgrid_brick[n] = v;
n++;
} }
_cgrid_brick.update_device(_num_grid_points*2, false); _cgrid_brick.update_device(_num_grid_points, false);
const int red_blocks = fphi_uind(); const int red_blocks = fphi_uind();
@ -740,7 +742,7 @@ int BaseAmoebaT::fphi_uind() {
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi) void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec)
{ {
// TODO: grid brick[k][j][i] is a scalar // TODO: grid brick[k][j][i] is a scalar
UCL_H_Vec<numtyp> hdummy; UCL_H_Vec<numtyp> hdummy;
@ -750,11 +752,15 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
for (int iy = _nylo_out; iy <= _nyhi_out; iy++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
_cgrid_brick[n] = host_grid_brick[iz][iy][ix]; numtyp2 v;
v.x = host_grid_brick[iz][iy][ix];
v.y = (numtyp)0;
_cgrid_brick[n] = v;
n++; n++;
} }
_cgrid_brick.update_device(_num_grid_points, false); _cgrid_brick.update_device(_num_grid_points, false);
_felec = felec;
const int red_blocks = fphi_mpole(); const int red_blocks = fphi_mpole();
_fdip_sum_phi.update_host(_max_thetai_size*20); _fdip_sum_phi.update_host(_max_thetai_size*20);
@ -776,13 +782,14 @@ int BaseAmoebaT::fphi_mpole() {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=block_size(); const int BX=block_size();
//printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D);
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX)); int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
time_pair.start(); time_pair.start();
int ngridxy = _ngridx * _ngridy; int ngridxy = _ngridx * _ngridy;
k_fphi_mpole.set_size(GX,BX); k_fphi_mpole.set_size(GX,BX);
k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
&_fdip_sum_phi, &_bsorder, &ainum, &_fdip_sum_phi, &_bsorder, &ainum, &_felec,
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
time_pair.stop(); time_pair.stop();

View File

@ -173,8 +173,8 @@ class BaseAmoeba {
double **host_uind, double **host_uinp, double *host_pval, double **host_uind, double **host_uinp, double *host_pval,
const double aewald, const double off2_polar, void **fieldp_ptr); const double aewald, const double off2_polar, void **fieldp_ptr);
/// Allocate/resize per-atom arrays before induce() /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
virtual void precompute_induce(const int inum_full, const int bsorder, virtual void precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2, double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid, double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out, const int nzlo_out, const int nzhi_out,
@ -185,7 +185,8 @@ class BaseAmoeba {
void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_phi1, void **host_fdip_phi2,
void **host_fdip_sum_phi); void **host_fdip_sum_phi);
virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi); virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
const double felec);
/// Compute polar real-space with device neighboring /// Compute polar real-space with device neighboring
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
@ -256,7 +257,7 @@ class BaseAmoeba {
int _bsorder; int _bsorder;
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3; UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
UCL_Vector<int,int> _igrid; UCL_Vector<int,int> _igrid;
UCL_Vector<numtyp,numtyp> _cgrid_brick; UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi; UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
int _max_thetai_size; int _max_thetai_size;
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;

View File

@ -193,13 +193,13 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
} }
void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2, double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid, double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out, const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out, const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out) { const int nxlo_out, const int nxhi_out) {
HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2,
host_thetai3, igrid, nzlo_out, nzhi_out, host_thetai3, igrid, nzlo_out, nzhi_out,
nylo_out, nyhi_out, nxlo_out, nxhi_out); nylo_out, nyhi_out, nxlo_out, nxhi_out);
} }

View File

@ -88,7 +88,7 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
void amoeba_gpu_update_fieldp(void **fieldp_ptr); void amoeba_gpu_update_fieldp(void **fieldp_ptr);
void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2, double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid, double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out, const int nzlo_out, const int nzhi_out,
@ -98,7 +98,8 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
void **host_fdip_phi2, void **host_fdip_sum_phi); void **host_fdip_phi2, void **host_fdip_sum_phi);
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi); void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi,
const double felec);
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind, double **host_uinp, double **host_rpole, double **host_uind, double **host_uinp,
@ -343,7 +344,7 @@ void PairAmoebaGPU::induce()
// must be done before the first ufield0c // must be done before the first ufield0c
// NOTE: this is for ic_kspace, and thetai[1-3] // NOTE: this is for ic_kspace, and thetai[1-3]
amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2,
thetai3, igrid, thetai3, igrid,
ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nzlo_out, ic_kspace->nzhi_out,
ic_kspace->nylo_out, ic_kspace->nyhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out,
@ -1382,11 +1383,11 @@ void PairAmoebaGPU::polar_kspace()
// NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill
if (gpu_fphi_mpole_ready) { if (gpu_fphi_mpole_ready) {
amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, amoeba_gpu_precompute_kspace(atom->nlocal, bsorder,
thetai3, igrid, p_kspace->nzlo_out, thetai1, thetai2, thetai3, igrid,
p_kspace->nzhi_out, p_kspace->nylo_out, p_kspace->nzlo_out, p_kspace->nzhi_out,
p_kspace->nyhi_out, p_kspace->nxlo_out, p_kspace->nylo_out, p_kspace->nyhi_out,
p_kspace->nxhi_out); p_kspace->nxlo_out, p_kspace->nxhi_out);
} }
@ -1461,10 +1462,15 @@ void PairAmoebaGPU::polar_kspace()
if (!gpu_fphi_mpole_ready) { if (!gpu_fphi_mpole_ready) {
fphi_mpole(gridpost,fphi); fphi_mpole(gridpost,fphi);
//printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]);
for (i = 0; i < nlocal; i++) {
for (k = 0; k < 20; k++)
fphi[i][k] *= felec;
}
} else { } else {
void* fphi_pinned = nullptr; void* fphi_pinned = nullptr;
amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned); amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec);
double *_fphi_ptr = (double *)fphi_pinned; double *_fphi_ptr = (double *)fphi_pinned;
for (int i = 0; i < nlocal; i++) { for (int i = 0; i < nlocal; i++) {
@ -1474,12 +1480,7 @@ void PairAmoebaGPU::polar_kspace()
idx += nlocal; idx += nlocal;
} }
} }
//printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]);
}
for (i = 0; i < nlocal; i++) {
for (k = 0; k < 20; k++)
fphi[i][k] *= felec;
} }
// convert field from fractional to Cartesian // convert field from fractional to Cartesian

View File

@ -105,7 +105,7 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
void hippo_gpu_update_fieldp(void **fieldp_ptr); void hippo_gpu_update_fieldp(void **fieldp_ptr);
void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2, double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid, double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out, const int nzlo_out, const int nzhi_out,
@ -475,7 +475,7 @@ void PairHippoGPU::induce()
// allocate memory and make early host-device transfers // allocate memory and make early host-device transfers
// must be done before the first ufield0c // must be done before the first ufield0c
hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2,
thetai3, igrid, thetai3, igrid,
ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nzlo_out, ic_kspace->nzhi_out,
ic_kspace->nylo_out, ic_kspace->nyhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out,