Updated fphi_mpole, renamed precompute_induce to precompute_kspace
This commit is contained in:
@ -68,7 +68,34 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
|
|||||||
|
|
||||||
# host code compilation
|
# host code compilation
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
|
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
|
||||||
|
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
|
||||||
|
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
|
||||||
|
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
|
||||||
|
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
|
||||||
|
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
|
||||||
|
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H)
|
||||||
|
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
|
||||||
|
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
|
||||||
|
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
|
||||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||||
|
|
||||||
#ifdef CUDPP_OPT
|
#ifdef CUDPP_OPT
|
||||||
|
|||||||
@ -1630,7 +1630,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
|
|||||||
const __global numtyp4 *restrict thetai2,
|
const __global numtyp4 *restrict thetai2,
|
||||||
const __global numtyp4 *restrict thetai3,
|
const __global numtyp4 *restrict thetai3,
|
||||||
const __global int *restrict igrid,
|
const __global int *restrict igrid,
|
||||||
const __global numtyp *restrict grid,
|
const __global numtyp2 *restrict grid,
|
||||||
__global numtyp *restrict fdip_phi1,
|
__global numtyp *restrict fdip_phi1,
|
||||||
__global numtyp *restrict fdip_phi2,
|
__global numtyp *restrict fdip_phi2,
|
||||||
__global numtyp *restrict fdip_sum_phi,
|
__global numtyp *restrict fdip_sum_phi,
|
||||||
@ -1648,12 +1648,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
int nlpts = (bsorder-1) / 2;
|
const int nlpts = (bsorder-1) / 2;
|
||||||
|
|
||||||
int istart = fast_mul(ii,4);
|
int istart = fast_mul(ii,4);
|
||||||
int igridx = igrid[istart];
|
const int igridx = igrid[istart];
|
||||||
int igridy = igrid[istart+1];
|
const int igridy = igrid[istart+1];
|
||||||
int igridz = igrid[istart+2];
|
const int igridz = igrid[istart+2];
|
||||||
|
|
||||||
// now istart is used to index thetai1, thetai2 and thetai3
|
// now istart is used to index thetai1, thetai2 and thetai3
|
||||||
istart = fast_mul(ii,bsorder);
|
istart = fast_mul(ii,bsorder);
|
||||||
@ -1701,18 +1701,13 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
|
|||||||
|
|
||||||
int k = (igridz - nzlo_out) - nlpts;
|
int k = (igridz - nzlo_out) - nlpts;
|
||||||
for (int kb = 0; kb < bsorder; kb++) {
|
for (int kb = 0; kb < bsorder; kb++) {
|
||||||
/*
|
const int mz = fast_mul(k, ngridxy);
|
||||||
v0 = thetai3[m][kb][0];
|
const int i3 = istart + kb;
|
||||||
v1 = thetai3[m][kb][1];
|
const numtyp4 tha3 = thetai3[i3];
|
||||||
v2 = thetai3[m][kb][2];
|
const numtyp v0 = tha3.x; // thetai3[m][kb][0];
|
||||||
v3 = thetai3[m][kb][3];
|
const numtyp v1 = tha3.y; // thetai3[m][kb][1];
|
||||||
*/
|
const numtyp v2 = tha3.z; // thetai3[m][kb][2];
|
||||||
int i3 = istart + kb;
|
const numtyp v3 = tha3.w; // thetai3[m][kb][3];
|
||||||
numtyp4 tha3 = thetai3[i3];
|
|
||||||
numtyp v0 = tha3.x;
|
|
||||||
numtyp v1 = tha3.y;
|
|
||||||
numtyp v2 = tha3.z;
|
|
||||||
numtyp v3 = tha3.w;
|
|
||||||
numtyp tu00_1 = (numtyp)0.0;
|
numtyp tu00_1 = (numtyp)0.0;
|
||||||
numtyp tu01_1 = (numtyp)0.0;
|
numtyp tu01_1 = (numtyp)0.0;
|
||||||
numtyp tu10_1 = (numtyp)0.0;
|
numtyp tu10_1 = (numtyp)0.0;
|
||||||
@ -1738,18 +1733,13 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
|
|||||||
|
|
||||||
int j = (igridy - nylo_out) - nlpts;
|
int j = (igridy - nylo_out) - nlpts;
|
||||||
for (int jb = 0; jb < bsorder; jb++) {
|
for (int jb = 0; jb < bsorder; jb++) {
|
||||||
/*
|
const int my = mz + fast_mul(j, ngridx);
|
||||||
u0 = thetai2[m][jb][0];
|
const int i2 = istart + jb;
|
||||||
u1 = thetai2[m][jb][1];
|
const numtyp4 tha2 = thetai2[i2];
|
||||||
u2 = thetai2[m][jb][2];
|
const numtyp u0 = tha2.x; // thetai2[m][jb][0];
|
||||||
u3 = thetai2[m][jb][3];
|
const numtyp u1 = tha2.y; // thetai2[m][jb][1];
|
||||||
*/
|
const numtyp u2 = tha2.z; // thetai2[m][jb][2];
|
||||||
int i2 = istart + jb;
|
const numtyp u3 = tha2.w; // thetai2[m][jb][3];
|
||||||
numtyp4 tha2 = thetai2[i2];
|
|
||||||
numtyp u0 = tha2.x;
|
|
||||||
numtyp u1 = tha2.y;
|
|
||||||
numtyp u2 = tha2.z;
|
|
||||||
numtyp u3 = tha2.w;
|
|
||||||
numtyp t0_1 = (numtyp)0.0;
|
numtyp t0_1 = (numtyp)0.0;
|
||||||
numtyp t1_1 = (numtyp)0.0;
|
numtyp t1_1 = (numtyp)0.0;
|
||||||
numtyp t2_1 = (numtyp)0.0;
|
numtyp t2_1 = (numtyp)0.0;
|
||||||
@ -1771,22 +1761,25 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
|
|||||||
t2_2 += tq_2*thetai1[m][ib][2];
|
t2_2 += tq_2*thetai1[m][ib][2];
|
||||||
t3 += (tq_1+tq_2)*thetai1[m][ib][3];
|
t3 += (tq_1+tq_2)*thetai1[m][ib][3];
|
||||||
*/
|
*/
|
||||||
int i1 = istart + ib;
|
const int i1 = istart + ib;
|
||||||
numtyp4 tha1 = thetai1[i1];
|
const numtyp4 tha1 = thetai1[i1];
|
||||||
numtyp w0 = tha1.x;
|
/*
|
||||||
numtyp w1 = tha1.y;
|
const numtyp w0 = tha1.x;
|
||||||
numtyp w2 = tha1.z;
|
const numtyp w1 = tha1.y;
|
||||||
numtyp w3 = tha1.w;
|
const numtyp w2 = tha1.z;
|
||||||
int gidx = 2*(k*ngridxy + j*ngridx + i);
|
const numtyp w3 = tha1.w;
|
||||||
numtyp tq_1 = grid[gidx];
|
*/
|
||||||
numtyp tq_2 = grid[gidx+1];
|
const int gidx = my + i; // k*ngridxy + j*ngridx + i;
|
||||||
t0_1 += tq_1*w0;
|
const numtyp2 tq = grid[gidx];
|
||||||
t1_1 += tq_1*w1;
|
const numtyp tq_1 = tq.x; //grid[gidx];
|
||||||
t2_1 += tq_1*w2;
|
const numtyp tq_2 = tq.y; //grid[gidx+1];
|
||||||
t0_2 += tq_2*w0;
|
t0_1 += tq_1*tha1.x; // w0
|
||||||
t1_2 += tq_2*w1;
|
t1_1 += tq_1*tha1.y; // w1
|
||||||
t2_2 += tq_2*w2;
|
t2_1 += tq_1*tha1.z; // w2
|
||||||
t3 += (tq_1+tq_2)*w3;
|
t0_2 += tq_2*tha1.x; // w0
|
||||||
|
t1_2 += tq_2*tha1.y; // w1
|
||||||
|
t2_2 += tq_2*tha1.z; // w2
|
||||||
|
t3 += (tq_1+tq_2)*tha1.w; // w3
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1933,9 +1926,9 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
|
|||||||
const __global numtyp4 *restrict thetai2,
|
const __global numtyp4 *restrict thetai2,
|
||||||
const __global numtyp4 *restrict thetai3,
|
const __global numtyp4 *restrict thetai3,
|
||||||
const __global int *restrict igrid,
|
const __global int *restrict igrid,
|
||||||
const __global numtyp *restrict grid,
|
const __global numtyp2 *restrict grid,
|
||||||
__global numtyp *restrict fphi,
|
__global numtyp *restrict fphi,
|
||||||
const int bsorder, const int inum,
|
const int bsorder, const int inum, const numtyp felec,
|
||||||
const int nzlo_out, const int nylo_out,
|
const int nzlo_out, const int nylo_out,
|
||||||
const int nxlo_out, const int ngridxy,
|
const int nxlo_out, const int ngridxy,
|
||||||
const int ngridx)
|
const int ngridx)
|
||||||
@ -2027,7 +2020,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
|
|||||||
int i1 = istart + ib;
|
int i1 = istart + ib;
|
||||||
numtyp4 tha1 = thetai1[i1];
|
numtyp4 tha1 = thetai1[i1];
|
||||||
int gidx = k*ngridxy + j*ngridx + i;
|
int gidx = k*ngridxy + j*ngridx + i;
|
||||||
numtyp tq = grid[gidx];
|
numtyp tq = grid[gidx].x;
|
||||||
t0 += tq*tha1.x;
|
t0 += tq*tha1.x;
|
||||||
t1 += tq*tha1.y;
|
t1 += tq*tha1.y;
|
||||||
t2 += tq*tha1.z;
|
t2 += tq*tha1.z;
|
||||||
@ -2095,7 +2088,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
|
|||||||
|
|
||||||
int idx = ii;
|
int idx = ii;
|
||||||
for (int m = 0; m < 20; m++) {
|
for (int m = 0; m < 20; m++) {
|
||||||
fphi[idx] = buf[m];
|
fphi[idx] = felec * buf[m];
|
||||||
idx += inum;
|
idx += inum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -162,15 +162,14 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
|
|||||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
|
void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** igrid,
|
double ***host_thetai3, int** igrid,
|
||||||
const int nzlo_out, const int nzhi_out,
|
const int nzlo_out, const int nzhi_out,
|
||||||
const int nylo_out, const int nyhi_out,
|
const int nylo_out, const int nyhi_out,
|
||||||
const int nxlo_out, const int nxhi_out) {
|
const int nxlo_out, const int nxhi_out) {
|
||||||
AMOEBAMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2,
|
AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid,
|
||||||
host_thetai3, igrid, nzlo_out, nzhi_out,
|
nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
||||||
nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
|
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
|
||||||
@ -179,8 +178,8 @@ void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
|
|||||||
host_fdip_phi2, host_fdip_sum_phi);
|
host_fdip_phi2, host_fdip_sum_phi);
|
||||||
}
|
}
|
||||||
|
|
||||||
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi) {
|
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) {
|
||||||
AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi);
|
AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec);
|
||||||
}
|
}
|
||||||
|
|
||||||
void amoeba_setup_fft(const int numel, const int element_type) {
|
void amoeba_setup_fft(const int numel, const int element_type) {
|
||||||
|
|||||||
@ -566,7 +566,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
|
|||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
|
void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** host_igrid,
|
double ***host_thetai3, int** host_igrid,
|
||||||
const int nzlo_out, const int nzhi_out,
|
const int nzlo_out, const int nzhi_out,
|
||||||
@ -660,7 +660,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
|
|||||||
_ngridx = nxhi_out - nxlo_out + 1;
|
_ngridx = nxhi_out - nxlo_out + 1;
|
||||||
_num_grid_points = _ngridx * _ngridy * _ngridz;
|
_num_grid_points = _ngridx * _ngridy * _ngridz;
|
||||||
|
|
||||||
int numel = _num_grid_points*2;
|
int numel = _num_grid_points;
|
||||||
if (_cgrid_brick.cols() == 0) {
|
if (_cgrid_brick.cols() == 0) {
|
||||||
_cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
|
_cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
|
||||||
} else if (numel > _cgrid_brick.cols()) {
|
} else if (numel > _cgrid_brick.cols()) {
|
||||||
@ -688,11 +688,13 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
|
|||||||
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
|
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
|
||||||
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
|
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
|
||||||
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
|
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
|
||||||
_cgrid_brick[n] = host_grid_brick[iz][iy][ix][0];
|
numtyp2 v;
|
||||||
_cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1];
|
v.x = host_grid_brick[iz][iy][ix][0];
|
||||||
n += 2;
|
v.y = host_grid_brick[iz][iy][ix][1];
|
||||||
|
_cgrid_brick[n] = v;
|
||||||
|
n++;
|
||||||
}
|
}
|
||||||
_cgrid_brick.update_device(_num_grid_points*2, false);
|
_cgrid_brick.update_device(_num_grid_points, false);
|
||||||
|
|
||||||
const int red_blocks = fphi_uind();
|
const int red_blocks = fphi_uind();
|
||||||
|
|
||||||
@ -740,7 +742,7 @@ int BaseAmoebaT::fphi_uind() {
|
|||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi)
|
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec)
|
||||||
{
|
{
|
||||||
// TODO: grid brick[k][j][i] is a scalar
|
// TODO: grid brick[k][j][i] is a scalar
|
||||||
UCL_H_Vec<numtyp> hdummy;
|
UCL_H_Vec<numtyp> hdummy;
|
||||||
@ -750,11 +752,15 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi
|
|||||||
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
|
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
|
||||||
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
|
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
|
||||||
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
|
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
|
||||||
_cgrid_brick[n] = host_grid_brick[iz][iy][ix];
|
numtyp2 v;
|
||||||
|
v.x = host_grid_brick[iz][iy][ix];
|
||||||
|
v.y = (numtyp)0;
|
||||||
|
_cgrid_brick[n] = v;
|
||||||
n++;
|
n++;
|
||||||
}
|
}
|
||||||
_cgrid_brick.update_device(_num_grid_points, false);
|
_cgrid_brick.update_device(_num_grid_points, false);
|
||||||
|
|
||||||
|
_felec = felec;
|
||||||
const int red_blocks = fphi_mpole();
|
const int red_blocks = fphi_mpole();
|
||||||
|
|
||||||
_fdip_sum_phi.update_host(_max_thetai_size*20);
|
_fdip_sum_phi.update_host(_max_thetai_size*20);
|
||||||
@ -776,13 +782,14 @@ int BaseAmoebaT::fphi_mpole() {
|
|||||||
|
|
||||||
// Compute the block size and grid size to keep all cores busy
|
// Compute the block size and grid size to keep all cores busy
|
||||||
const int BX=block_size();
|
const int BX=block_size();
|
||||||
|
//printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D);
|
||||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||||
|
|
||||||
time_pair.start();
|
time_pair.start();
|
||||||
int ngridxy = _ngridx * _ngridy;
|
int ngridxy = _ngridx * _ngridy;
|
||||||
k_fphi_mpole.set_size(GX,BX);
|
k_fphi_mpole.set_size(GX,BX);
|
||||||
k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
|
k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
|
||||||
&_fdip_sum_phi, &_bsorder, &ainum,
|
&_fdip_sum_phi, &_bsorder, &ainum, &_felec,
|
||||||
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
|
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
|
||||||
time_pair.stop();
|
time_pair.stop();
|
||||||
|
|
||||||
|
|||||||
@ -173,8 +173,8 @@ class BaseAmoeba {
|
|||||||
double **host_uind, double **host_uinp, double *host_pval,
|
double **host_uind, double **host_uinp, double *host_pval,
|
||||||
const double aewald, const double off2_polar, void **fieldp_ptr);
|
const double aewald, const double off2_polar, void **fieldp_ptr);
|
||||||
|
|
||||||
/// Allocate/resize per-atom arrays before induce()
|
/// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
|
||||||
virtual void precompute_induce(const int inum_full, const int bsorder,
|
virtual void precompute_kspace(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** igrid,
|
double ***host_thetai3, int** igrid,
|
||||||
const int nzlo_out, const int nzhi_out,
|
const int nzlo_out, const int nzhi_out,
|
||||||
@ -185,7 +185,8 @@ class BaseAmoeba {
|
|||||||
void **host_fdip_phi1, void **host_fdip_phi2,
|
void **host_fdip_phi1, void **host_fdip_phi2,
|
||||||
void **host_fdip_sum_phi);
|
void **host_fdip_sum_phi);
|
||||||
|
|
||||||
virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi);
|
virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
|
||||||
|
const double felec);
|
||||||
|
|
||||||
/// Compute polar real-space with device neighboring
|
/// Compute polar real-space with device neighboring
|
||||||
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
|
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||||
@ -256,7 +257,7 @@ class BaseAmoeba {
|
|||||||
int _bsorder;
|
int _bsorder;
|
||||||
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
|
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
|
||||||
UCL_Vector<int,int> _igrid;
|
UCL_Vector<int,int> _igrid;
|
||||||
UCL_Vector<numtyp,numtyp> _cgrid_brick;
|
UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
|
||||||
UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
|
UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
|
||||||
int _max_thetai_size;
|
int _max_thetai_size;
|
||||||
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
|
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
|
||||||
|
|||||||
@ -193,13 +193,13 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **
|
|||||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
void hippo_gpu_precompute_induce(const int inum_full, const int bsorder,
|
void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** igrid,
|
double ***host_thetai3, int** igrid,
|
||||||
const int nzlo_out, const int nzhi_out,
|
const int nzlo_out, const int nzhi_out,
|
||||||
const int nylo_out, const int nyhi_out,
|
const int nylo_out, const int nyhi_out,
|
||||||
const int nxlo_out, const int nxhi_out) {
|
const int nxlo_out, const int nxhi_out) {
|
||||||
HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2,
|
HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2,
|
||||||
host_thetai3, igrid, nzlo_out, nzhi_out,
|
host_thetai3, igrid, nzlo_out, nzhi_out,
|
||||||
nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -88,7 +88,7 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
|
|||||||
|
|
||||||
void amoeba_gpu_update_fieldp(void **fieldp_ptr);
|
void amoeba_gpu_update_fieldp(void **fieldp_ptr);
|
||||||
|
|
||||||
void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
|
void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** igrid,
|
double ***host_thetai3, int** igrid,
|
||||||
const int nzlo_out, const int nzhi_out,
|
const int nzlo_out, const int nzhi_out,
|
||||||
@ -98,7 +98,8 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
|
|||||||
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
|
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
|
||||||
void **host_fdip_phi2, void **host_fdip_sum_phi);
|
void **host_fdip_phi2, void **host_fdip_sum_phi);
|
||||||
|
|
||||||
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi);
|
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi,
|
||||||
|
const double felec);
|
||||||
|
|
||||||
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
|
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
|
||||||
double **host_rpole, double **host_uind, double **host_uinp,
|
double **host_rpole, double **host_uind, double **host_uinp,
|
||||||
@ -343,7 +344,7 @@ void PairAmoebaGPU::induce()
|
|||||||
// must be done before the first ufield0c
|
// must be done before the first ufield0c
|
||||||
// NOTE: this is for ic_kspace, and thetai[1-3]
|
// NOTE: this is for ic_kspace, and thetai[1-3]
|
||||||
|
|
||||||
amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2,
|
amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2,
|
||||||
thetai3, igrid,
|
thetai3, igrid,
|
||||||
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
|
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
|
||||||
ic_kspace->nylo_out, ic_kspace->nyhi_out,
|
ic_kspace->nylo_out, ic_kspace->nyhi_out,
|
||||||
@ -1382,11 +1383,11 @@ void PairAmoebaGPU::polar_kspace()
|
|||||||
|
|
||||||
// NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill
|
// NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill
|
||||||
if (gpu_fphi_mpole_ready) {
|
if (gpu_fphi_mpole_ready) {
|
||||||
amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2,
|
amoeba_gpu_precompute_kspace(atom->nlocal, bsorder,
|
||||||
thetai3, igrid, p_kspace->nzlo_out,
|
thetai1, thetai2, thetai3, igrid,
|
||||||
p_kspace->nzhi_out, p_kspace->nylo_out,
|
p_kspace->nzlo_out, p_kspace->nzhi_out,
|
||||||
p_kspace->nyhi_out, p_kspace->nxlo_out,
|
p_kspace->nylo_out, p_kspace->nyhi_out,
|
||||||
p_kspace->nxhi_out);
|
p_kspace->nxlo_out, p_kspace->nxhi_out);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -1461,10 +1462,15 @@ void PairAmoebaGPU::polar_kspace()
|
|||||||
|
|
||||||
if (!gpu_fphi_mpole_ready) {
|
if (!gpu_fphi_mpole_ready) {
|
||||||
fphi_mpole(gridpost,fphi);
|
fphi_mpole(gridpost,fphi);
|
||||||
//printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]);
|
|
||||||
|
for (i = 0; i < nlocal; i++) {
|
||||||
|
for (k = 0; k < 20; k++)
|
||||||
|
fphi[i][k] *= felec;
|
||||||
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
void* fphi_pinned = nullptr;
|
void* fphi_pinned = nullptr;
|
||||||
amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned);
|
amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec);
|
||||||
|
|
||||||
double *_fphi_ptr = (double *)fphi_pinned;
|
double *_fphi_ptr = (double *)fphi_pinned;
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
@ -1474,12 +1480,7 @@ void PairAmoebaGPU::polar_kspace()
|
|||||||
idx += nlocal;
|
idx += nlocal;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < nlocal; i++) {
|
|
||||||
for (k = 0; k < 20; k++)
|
|
||||||
fphi[i][k] *= felec;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// convert field from fractional to Cartesian
|
// convert field from fractional to Cartesian
|
||||||
|
|||||||
@ -105,7 +105,7 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
|
|||||||
|
|
||||||
void hippo_gpu_update_fieldp(void **fieldp_ptr);
|
void hippo_gpu_update_fieldp(void **fieldp_ptr);
|
||||||
|
|
||||||
void hippo_gpu_precompute_induce(const int inum_full, const int bsorder,
|
void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** igrid,
|
double ***host_thetai3, int** igrid,
|
||||||
const int nzlo_out, const int nzhi_out,
|
const int nzlo_out, const int nzhi_out,
|
||||||
@ -475,7 +475,7 @@ void PairHippoGPU::induce()
|
|||||||
// allocate memory and make early host-device transfers
|
// allocate memory and make early host-device transfers
|
||||||
// must be done before the first ufield0c
|
// must be done before the first ufield0c
|
||||||
|
|
||||||
hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2,
|
hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2,
|
||||||
thetai3, igrid,
|
thetai3, igrid,
|
||||||
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
|
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
|
||||||
ic_kspace->nylo_out, ic_kspace->nyhi_out,
|
ic_kspace->nylo_out, ic_kspace->nyhi_out,
|
||||||
|
|||||||
Reference in New Issue
Block a user