Updated fphi_mpole, renamed precompute_induce to precompute_kspace

This commit is contained in:
Trung Nguyen
2022-09-28 15:08:18 -05:00
parent 166701f13a
commit e6d2582642
8 changed files with 129 additions and 101 deletions

View File

@ -68,7 +68,34 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
# host code compilation
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H)
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
#ifdef CUDPP_OPT

View File

@ -1630,7 +1630,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
const __global numtyp4 *restrict thetai2,
const __global numtyp4 *restrict thetai3,
const __global int *restrict igrid,
const __global numtyp *restrict grid,
const __global numtyp2 *restrict grid,
__global numtyp *restrict fdip_phi1,
__global numtyp *restrict fdip_phi2,
__global numtyp *restrict fdip_sum_phi,
@ -1648,12 +1648,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
if (ii<inum) {
int nlpts = (bsorder-1) / 2;
const int nlpts = (bsorder-1) / 2;
int istart = fast_mul(ii,4);
int igridx = igrid[istart];
int igridy = igrid[istart+1];
int igridz = igrid[istart+2];
const int igridx = igrid[istart];
const int igridy = igrid[istart+1];
const int igridz = igrid[istart+2];
// now istart is used to index thetai1, thetai2 and thetai3
istart = fast_mul(ii,bsorder);
@ -1701,18 +1701,13 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
int k = (igridz - nzlo_out) - nlpts;
for (int kb = 0; kb < bsorder; kb++) {
/*
v0 = thetai3[m][kb][0];
v1 = thetai3[m][kb][1];
v2 = thetai3[m][kb][2];
v3 = thetai3[m][kb][3];
*/
int i3 = istart + kb;
numtyp4 tha3 = thetai3[i3];
numtyp v0 = tha3.x;
numtyp v1 = tha3.y;
numtyp v2 = tha3.z;
numtyp v3 = tha3.w;
const int mz = fast_mul(k, ngridxy);
const int i3 = istart + kb;
const numtyp4 tha3 = thetai3[i3];
const numtyp v0 = tha3.x; // thetai3[m][kb][0];
const numtyp v1 = tha3.y; // thetai3[m][kb][1];
const numtyp v2 = tha3.z; // thetai3[m][kb][2];
const numtyp v3 = tha3.w; // thetai3[m][kb][3];
numtyp tu00_1 = (numtyp)0.0;
numtyp tu01_1 = (numtyp)0.0;
numtyp tu10_1 = (numtyp)0.0;
@ -1738,18 +1733,13 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
int j = (igridy - nylo_out) - nlpts;
for (int jb = 0; jb < bsorder; jb++) {
/*
u0 = thetai2[m][jb][0];
u1 = thetai2[m][jb][1];
u2 = thetai2[m][jb][2];
u3 = thetai2[m][jb][3];
*/
int i2 = istart + jb;
numtyp4 tha2 = thetai2[i2];
numtyp u0 = tha2.x;
numtyp u1 = tha2.y;
numtyp u2 = tha2.z;
numtyp u3 = tha2.w;
const int my = mz + fast_mul(j, ngridx);
const int i2 = istart + jb;
const numtyp4 tha2 = thetai2[i2];
const numtyp u0 = tha2.x; // thetai2[m][jb][0];
const numtyp u1 = tha2.y; // thetai2[m][jb][1];
const numtyp u2 = tha2.z; // thetai2[m][jb][2];
const numtyp u3 = tha2.w; // thetai2[m][jb][3];
numtyp t0_1 = (numtyp)0.0;
numtyp t1_1 = (numtyp)0.0;
numtyp t2_1 = (numtyp)0.0;
@ -1771,22 +1761,25 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
t2_2 += tq_2*thetai1[m][ib][2];
t3 += (tq_1+tq_2)*thetai1[m][ib][3];
*/
int i1 = istart + ib;
numtyp4 tha1 = thetai1[i1];
numtyp w0 = tha1.x;
numtyp w1 = tha1.y;
numtyp w2 = tha1.z;
numtyp w3 = tha1.w;
int gidx = 2*(k*ngridxy + j*ngridx + i);
numtyp tq_1 = grid[gidx];
numtyp tq_2 = grid[gidx+1];
t0_1 += tq_1*w0;
t1_1 += tq_1*w1;
t2_1 += tq_1*w2;
t0_2 += tq_2*w0;
t1_2 += tq_2*w1;
t2_2 += tq_2*w2;
t3 += (tq_1+tq_2)*w3;
const int i1 = istart + ib;
const numtyp4 tha1 = thetai1[i1];
/*
const numtyp w0 = tha1.x;
const numtyp w1 = tha1.y;
const numtyp w2 = tha1.z;
const numtyp w3 = tha1.w;
*/
const int gidx = my + i; // k*ngridxy + j*ngridx + i;
const numtyp2 tq = grid[gidx];
const numtyp tq_1 = tq.x; //grid[gidx];
const numtyp tq_2 = tq.y; //grid[gidx+1];
t0_1 += tq_1*tha1.x; // w0
t1_1 += tq_1*tha1.y; // w1
t2_1 += tq_1*tha1.z; // w2
t0_2 += tq_2*tha1.x; // w0
t1_2 += tq_2*tha1.y; // w1
t2_2 += tq_2*tha1.z; // w2
t3 += (tq_1+tq_2)*tha1.w; // w3
i++;
}
@ -1933,9 +1926,9 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
const __global numtyp4 *restrict thetai2,
const __global numtyp4 *restrict thetai3,
const __global int *restrict igrid,
const __global numtyp *restrict grid,
const __global numtyp2 *restrict grid,
__global numtyp *restrict fphi,
const int bsorder, const int inum,
const int bsorder, const int inum, const numtyp felec,
const int nzlo_out, const int nylo_out,
const int nxlo_out, const int ngridxy,
const int ngridx)
@ -2027,7 +2020,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
int i1 = istart + ib;
numtyp4 tha1 = thetai1[i1];
int gidx = k*ngridxy + j*ngridx + i;
numtyp tq = grid[gidx];
numtyp tq = grid[gidx].x;
t0 += tq*tha1.x;
t1 += tq*tha1.y;
t2 += tq*tha1.z;
@ -2095,7 +2088,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
int idx = ii;
for (int m = 0; m < 20; m++) {
fphi[idx] = buf[m];
fphi[idx] = felec * buf[m];
idx += inum;
}
}

View File

@ -162,15 +162,14 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out) {
AMOEBAMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2,
host_thetai3, igrid, nzlo_out, nzhi_out,
nylo_out, nyhi_out, nxlo_out, nxhi_out);
AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid,
nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out);
}
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
@ -179,8 +178,8 @@ void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
host_fdip_phi2, host_fdip_sum_phi);
}
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi) {
AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi);
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) {
AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec);
}
void amoeba_setup_fft(const int numel, const int element_type) {

View File

@ -566,7 +566,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** host_igrid,
const int nzlo_out, const int nzhi_out,
@ -660,7 +660,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
_ngridx = nxhi_out - nxlo_out + 1;
_num_grid_points = _ngridx * _ngridy * _ngridz;
int numel = _num_grid_points*2;
int numel = _num_grid_points;
if (_cgrid_brick.cols() == 0) {
_cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
} else if (numel > _cgrid_brick.cols()) {
@ -688,11 +688,13 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
_cgrid_brick[n] = host_grid_brick[iz][iy][ix][0];
_cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1];
n += 2;
numtyp2 v;
v.x = host_grid_brick[iz][iy][ix][0];
v.y = host_grid_brick[iz][iy][ix][1];
_cgrid_brick[n] = v;
n++;
}
_cgrid_brick.update_device(_num_grid_points*2, false);
_cgrid_brick.update_device(_num_grid_points, false);
const int red_blocks = fphi_uind();
@ -740,7 +742,7 @@ int BaseAmoebaT::fphi_uind() {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi)
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec)
{
// TODO: grid brick[k][j][i] is a scalar
UCL_H_Vec<numtyp> hdummy;
@ -750,11 +752,15 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
_cgrid_brick[n] = host_grid_brick[iz][iy][ix];
numtyp2 v;
v.x = host_grid_brick[iz][iy][ix];
v.y = (numtyp)0;
_cgrid_brick[n] = v;
n++;
}
_cgrid_brick.update_device(_num_grid_points, false);
_felec = felec;
const int red_blocks = fphi_mpole();
_fdip_sum_phi.update_host(_max_thetai_size*20);
@ -776,13 +782,14 @@ int BaseAmoebaT::fphi_mpole() {
// Compute the block size and grid size to keep all cores busy
const int BX=block_size();
//printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D);
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
time_pair.start();
int ngridxy = _ngridx * _ngridy;
k_fphi_mpole.set_size(GX,BX);
k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
&_fdip_sum_phi, &_bsorder, &ainum,
&_fdip_sum_phi, &_bsorder, &ainum, &_felec,
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
time_pair.stop();

View File

@ -173,8 +173,8 @@ class BaseAmoeba {
double **host_uind, double **host_uinp, double *host_pval,
const double aewald, const double off2_polar, void **fieldp_ptr);
/// Allocate/resize per-atom arrays before induce()
virtual void precompute_induce(const int inum_full, const int bsorder,
/// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
virtual void precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out,
@ -185,7 +185,8 @@ class BaseAmoeba {
void **host_fdip_phi1, void **host_fdip_phi2,
void **host_fdip_sum_phi);
virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi);
virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
const double felec);
/// Compute polar real-space with device neighboring
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
@ -256,7 +257,7 @@ class BaseAmoeba {
int _bsorder;
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
UCL_Vector<int,int> _igrid;
UCL_Vector<numtyp,numtyp> _cgrid_brick;
UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
int _max_thetai_size;
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;

View File

@ -193,13 +193,13 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
void hippo_gpu_precompute_induce(const int inum_full, const int bsorder,
void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out) {
HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2,
HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2,
host_thetai3, igrid, nzlo_out, nzhi_out,
nylo_out, nyhi_out, nxlo_out, nxhi_out);
}

View File

@ -88,7 +88,7 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
void amoeba_gpu_update_fieldp(void **fieldp_ptr);
void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out,
@ -98,7 +98,8 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
void **host_fdip_phi2, void **host_fdip_sum_phi);
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi);
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi,
const double felec);
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind, double **host_uinp,
@ -343,7 +344,7 @@ void PairAmoebaGPU::induce()
// must be done before the first ufield0c
// NOTE: this is for ic_kspace, and thetai[1-3]
amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2,
amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2,
thetai3, igrid,
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
ic_kspace->nylo_out, ic_kspace->nyhi_out,
@ -1382,11 +1383,11 @@ void PairAmoebaGPU::polar_kspace()
// NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill
if (gpu_fphi_mpole_ready) {
amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2,
thetai3, igrid, p_kspace->nzlo_out,
p_kspace->nzhi_out, p_kspace->nylo_out,
p_kspace->nyhi_out, p_kspace->nxlo_out,
p_kspace->nxhi_out);
amoeba_gpu_precompute_kspace(atom->nlocal, bsorder,
thetai1, thetai2, thetai3, igrid,
p_kspace->nzlo_out, p_kspace->nzhi_out,
p_kspace->nylo_out, p_kspace->nyhi_out,
p_kspace->nxlo_out, p_kspace->nxhi_out);
}
@ -1461,10 +1462,15 @@ void PairAmoebaGPU::polar_kspace()
if (!gpu_fphi_mpole_ready) {
fphi_mpole(gridpost,fphi);
//printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]);
for (i = 0; i < nlocal; i++) {
for (k = 0; k < 20; k++)
fphi[i][k] *= felec;
}
} else {
void* fphi_pinned = nullptr;
amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned);
amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec);
double *_fphi_ptr = (double *)fphi_pinned;
for (int i = 0; i < nlocal; i++) {
@ -1474,12 +1480,7 @@ void PairAmoebaGPU::polar_kspace()
idx += nlocal;
}
}
//printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]);
}
for (i = 0; i < nlocal; i++) {
for (k = 0; k < 20; k++)
fphi[i][k] *= felec;
}
// convert field from fractional to Cartesian

View File

@ -105,7 +105,7 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
void hippo_gpu_update_fieldp(void **fieldp_ptr);
void hippo_gpu_precompute_induce(const int inum_full, const int bsorder,
void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out,
@ -475,7 +475,7 @@ void PairHippoGPU::induce()
// allocate memory and make early host-device transfers
// must be done before the first ufield0c
hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2,
hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2,
thetai3, igrid,
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
ic_kspace->nylo_out, ic_kspace->nyhi_out,