Enabled fphi_uind in hippo/gpu, really need to refactor hippo and amoeba in the GPU lib to remove kernel duplicates

This commit is contained in:
Trung Nguyen
2022-09-16 14:47:16 -05:00
parent 880f20c285
commit 62ecf98cda
11 changed files with 626 additions and 76 deletions

View File

@ -68,31 +68,7 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
# host code compilation # host code compilation
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) $(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR) $(CUDR) -o $@ -c $< -I$(OBJ_DIR)
#ifdef CUDPP_OPT #ifdef CUDPP_OPT

View File

@ -64,7 +64,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
cell_size,gpu_split,_screen,amoeba, cell_size,gpu_split,_screen,amoeba,
"k_amoeba_multipole", "k_amoeba_udirect2b", "k_amoeba_multipole", "k_amoeba_udirect2b",
"k_amoeba_umutual2b", "k_amoeba_polar", "k_amoeba_umutual2b", "k_amoeba_polar",
"k_amoeba_short_nbor", "k_amoeba_special15"); "k_amoeba_fphi_uind", "k_amoeba_short_nbor",
"k_amoeba_special15");
if (success!=0) if (success!=0)
return success; return success;

View File

@ -1626,7 +1626,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
__kernel void k_fphi_uind(const __global numtyp4 *restrict thetai1, __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai2,
const __global numtyp4 *restrict thetai3, const __global numtyp4 *restrict thetai3,
const __global int *restrict igrid, const __global int *restrict igrid,

View File

@ -65,6 +65,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
const char *k_name_udirect2b, const char *k_name_udirect2b,
const char *k_name_umutual2b, const char *k_name_umutual2b,
const char *k_name_polar, const char *k_name_polar,
const char *k_name_fphi_uind,
const char *k_name_short_nbor, const char *k_name_short_nbor,
const char* k_name_special15) { const char* k_name_special15) {
screen=_screen; screen=_screen;
@ -100,7 +101,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
_block_bio_size=device->block_bio_pair(); _block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name_multipole, compile_kernels(*ucl_device,pair_program,k_name_multipole,
k_name_udirect2b, k_name_umutual2b,k_name_polar, k_name_udirect2b, k_name_umutual2b,k_name_polar,
k_name_short_nbor, k_name_special15); k_name_fphi_uind, k_name_short_nbor, k_name_special15);
if (_threads_per_atom>1 && gpu_nbor==0) { if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true); nbor->packing(true);
@ -934,6 +935,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname_udirect2b, const char *kname_udirect2b,
const char *kname_umutual2b, const char *kname_umutual2b,
const char *kname_polar, const char *kname_polar,
const char *kname_fphi_uind,
const char *kname_short_nbor, const char *kname_short_nbor,
const char* kname_special15) { const char* kname_special15) {
if (_compiled) if (_compiled)
@ -942,17 +944,17 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev); pair_program=new UCL_Program(dev);
std::string oclstring = device->compile_string()+" -DEVFLAG=1"; std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen);
k_multipole.set_function(*pair_program,kname_multipole); k_multipole.set_function(*pair_program, kname_multipole);
k_udirect2b.set_function(*pair_program,kname_udirect2b); k_udirect2b.set_function(*pair_program, kname_udirect2b);
k_umutual2b.set_function(*pair_program,kname_umutual2b); k_umutual2b.set_function(*pair_program, kname_umutual2b);
k_polar.set_function(*pair_program,kname_polar); k_polar.set_function(*pair_program, kname_polar);
k_fphi_uind.set_function(*pair_program,"k_fphi_uind"); k_fphi_uind.set_function(*pair_program, kname_fphi_uind);
k_short_nbor.set_function(*pair_program,kname_short_nbor); k_short_nbor.set_function(*pair_program, kname_short_nbor);
k_special15.set_function(*pair_program,kname_special15); k_special15.set_function(*pair_program, kname_special15);
pos_tex.get_texture(*pair_program,"pos_tex"); pos_tex.get_texture(*pair_program, "pos_tex");
q_tex.get_texture(*pair_program,"q_tex"); q_tex.get_texture(*pair_program, "q_tex");
_compiled=true; _compiled=true;

View File

@ -62,9 +62,10 @@ class BaseAmoeba {
int init_atomic(const int nlocal, const int nall, const int max_nbors, int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15, const double cell_size, const int maxspecial, const int maxspecial15, const double cell_size,
const double gpu_split, FILE *screen, const void *pair_program, const double gpu_split, FILE *screen, const void *pair_program,
const char *kname_multipole, const char *kname_multipole, const char *kname_udirect2b,
const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_umutual2b, const char *kname_polar,
const char *kname_polar, const char *kname_short_nbor, const char* kname_special15); const char *kname_fphi_uind, const char *kname_short_nbor,
const char* kname_special15);
/// Estimate the overhead for GPU context changes and CPU driver /// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(const int add_kernels=0); void estimate_gpu_overhead(const int add_kernels=0);
@ -309,7 +310,8 @@ class BaseAmoeba {
void compile_kernels(UCL_Device &dev, const void *pair_string, void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *kname_multipole, const char *kname_udirect2b, const char *kname_multipole, const char *kname_udirect2b,
const char *kname_umutual2b, const char *kname_polar, const char *kname_umutual2b, const char *kname_polar,
const char *kname_short_nbor, const char* kname_special15); const char *kname_fphi_uind, const char *kname_short_nbor,
const char* kname_special15);
virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int multipole_real(const int eflag, const int vflag) = 0;
virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0;

View File

@ -67,7 +67,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
cell_size,gpu_split,_screen,hippo, cell_size,gpu_split,_screen,hippo,
"k_hippo_multipole", "k_hippo_udirect2b", "k_hippo_multipole", "k_hippo_udirect2b",
"k_hippo_umutual2b", "k_hippo_polar", "k_hippo_umutual2b", "k_hippo_polar",
"k_hippo_short_nbor", "k_hippo_special15"); "k_hippo_fphi_uind", "k_hippo_short_nbor",
"k_hippo_special15");
if (success!=0) if (success!=0)
return success; return success;

View File

@ -2045,6 +2045,307 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
} }
/* ----------------------------------------------------------------------
fphi_uind = induced potential from grid
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
------------------------------------------------------------------------- */
__kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
const __global numtyp4 *restrict thetai2,
const __global numtyp4 *restrict thetai3,
const __global int *restrict igrid,
const __global numtyp *restrict grid,
__global numtyp *restrict fdip_phi1,
__global numtyp *restrict fdip_phi2,
__global numtyp *restrict fdip_sum_phi,
const int bsorder, const int inum,
const int nzlo_out, const int nylo_out,
const int nxlo_out, const int ngridxy,
const int ngridx)
{
//int tid, ii, offset, i, n_stride;
//atom_info(t_per_atom,ii,tid,offset);
int tid=THREAD_ID_X;
int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;
if (ii<inum) {
int nlpts = (bsorder-1) / 2;
int istart = fast_mul(ii,4);
int igridx = igrid[istart];
int igridy = igrid[istart+1];
int igridz = igrid[istart+2];
// now istart is used to index thetai1, thetai2 and thetai3
istart = fast_mul(ii,bsorder);
// extract the permanent multipole field at each site
numtyp tuv100_1 = (numtyp)0.0;
numtyp tuv010_1 = (numtyp)0.0;
numtyp tuv001_1 = (numtyp)0.0;
numtyp tuv200_1 = (numtyp)0.0;
numtyp tuv020_1 = (numtyp)0.0;
numtyp tuv002_1 = (numtyp)0.0;
numtyp tuv110_1 = (numtyp)0.0;
numtyp tuv101_1 = (numtyp)0.0;
numtyp tuv011_1 = (numtyp)0.0;
numtyp tuv100_2 = (numtyp)0.0;
numtyp tuv010_2 = (numtyp)0.0;
numtyp tuv001_2 = (numtyp)0.0;
numtyp tuv200_2 = (numtyp)0.0;
numtyp tuv020_2 = (numtyp)0.0;
numtyp tuv002_2 = (numtyp)0.0;
numtyp tuv110_2 = (numtyp)0.0;
numtyp tuv101_2 = (numtyp)0.0;
numtyp tuv011_2 = (numtyp)0.0;
numtyp tuv000 = (numtyp)0.0;
numtyp tuv001 = (numtyp)0.0;
numtyp tuv010 = (numtyp)0.0;
numtyp tuv100 = (numtyp)0.0;
numtyp tuv200 = (numtyp)0.0;
numtyp tuv020 = (numtyp)0.0;
numtyp tuv002 = (numtyp)0.0;
numtyp tuv110 = (numtyp)0.0;
numtyp tuv101 = (numtyp)0.0;
numtyp tuv011 = (numtyp)0.0;
numtyp tuv300 = (numtyp)0.0;
numtyp tuv030 = (numtyp)0.0;
numtyp tuv003 = (numtyp)0.0;
numtyp tuv210 = (numtyp)0.0;
numtyp tuv201 = (numtyp)0.0;
numtyp tuv120 = (numtyp)0.0;
numtyp tuv021 = (numtyp)0.0;
numtyp tuv102 = (numtyp)0.0;
numtyp tuv012 = (numtyp)0.0;
numtyp tuv111 = (numtyp)0.0;
int k = (igridz - nzlo_out) - nlpts;
for (int kb = 0; kb < bsorder; kb++) {
/*
v0 = thetai3[m][kb][0];
v1 = thetai3[m][kb][1];
v2 = thetai3[m][kb][2];
v3 = thetai3[m][kb][3];
*/
int i3 = istart + kb;
numtyp4 tha3 = thetai3[i3];
numtyp v0 = tha3.x;
numtyp v1 = tha3.y;
numtyp v2 = tha3.z;
numtyp v3 = tha3.w;
numtyp tu00_1 = (numtyp)0.0;
numtyp tu01_1 = (numtyp)0.0;
numtyp tu10_1 = (numtyp)0.0;
numtyp tu20_1 = (numtyp)0.0;
numtyp tu11_1 = (numtyp)0.0;
numtyp tu02_1 = (numtyp)0.0;
numtyp tu00_2 = (numtyp)0.0;
numtyp tu01_2 = (numtyp)0.0;
numtyp tu10_2 = (numtyp)0.0;
numtyp tu20_2 = (numtyp)0.0;
numtyp tu11_2 = (numtyp)0.0;
numtyp tu02_2 = (numtyp)0.0;
numtyp tu00 = (numtyp)0.0;
numtyp tu10 = (numtyp)0.0;
numtyp tu01 = (numtyp)0.0;
numtyp tu20 = (numtyp)0.0;
numtyp tu11 = (numtyp)0.0;
numtyp tu02 = (numtyp)0.0;
numtyp tu30 = (numtyp)0.0;
numtyp tu21 = (numtyp)0.0;
numtyp tu12 = (numtyp)0.0;
numtyp tu03 = (numtyp)0.0;
int j = (igridy - nylo_out) - nlpts;
for (int jb = 0; jb < bsorder; jb++) {
/*
u0 = thetai2[m][jb][0];
u1 = thetai2[m][jb][1];
u2 = thetai2[m][jb][2];
u3 = thetai2[m][jb][3];
*/
int i2 = istart + jb;
numtyp4 tha2 = thetai2[i2];
numtyp u0 = tha2.x;
numtyp u1 = tha2.y;
numtyp u2 = tha2.z;
numtyp u3 = tha2.w;
numtyp t0_1 = (numtyp)0.0;
numtyp t1_1 = (numtyp)0.0;
numtyp t2_1 = (numtyp)0.0;
numtyp t0_2 = (numtyp)0.0;
numtyp t1_2 = (numtyp)0.0;
numtyp t2_2 = (numtyp)0.0;
numtyp t3 = (numtyp)0.0;
int i = (igridx - nxlo_out) - nlpts;
for (int ib = 0; ib < bsorder; ib++) {
/*
tq_1 = grid[k][j][i][0];
tq_2 = grid[k][j][i][1];
t0_1 += tq_1*thetai1[m][ib][0];
t1_1 += tq_1*thetai1[m][ib][1];
t2_1 += tq_1*thetai1[m][ib][2];
t0_2 += tq_2*thetai1[m][ib][0];
t1_2 += tq_2*thetai1[m][ib][1];
t2_2 += tq_2*thetai1[m][ib][2];
t3 += (tq_1+tq_2)*thetai1[m][ib][3];
*/
int i1 = istart + ib;
numtyp4 tha1 = thetai1[i1];
numtyp w0 = tha1.x;
numtyp w1 = tha1.y;
numtyp w2 = tha1.z;
numtyp w3 = tha1.w;
int gidx = 2*(k*ngridxy + j*ngridx + i);
numtyp tq_1 = grid[gidx];
numtyp tq_2 = grid[gidx+1];
t0_1 += tq_1*w0;
t1_1 += tq_1*w1;
t2_1 += tq_1*w2;
t0_2 += tq_2*w0;
t1_2 += tq_2*w1;
t2_2 += tq_2*w2;
t3 += (tq_1+tq_2)*w3;
i++;
}
tu00_1 += t0_1*u0;
tu10_1 += t1_1*u0;
tu01_1 += t0_1*u1;
tu20_1 += t2_1*u0;
tu11_1 += t1_1*u1;
tu02_1 += t0_1*u2;
tu00_2 += t0_2*u0;
tu10_2 += t1_2*u0;
tu01_2 += t0_2*u1;
tu20_2 += t2_2*u0;
tu11_2 += t1_2*u1;
tu02_2 += t0_2*u2;
numtyp t0 = t0_1 + t0_2;
numtyp t1 = t1_1 + t1_2;
numtyp t2 = t2_1 + t2_2;
tu00 += t0*u0;
tu10 += t1*u0;
tu01 += t0*u1;
tu20 += t2*u0;
tu11 += t1*u1;
tu02 += t0*u2;
tu30 += t3*u0;
tu21 += t2*u1;
tu12 += t1*u2;
tu03 += t0*u3;
j++;
}
tuv100_1 += tu10_1*v0;
tuv010_1 += tu01_1*v0;
tuv001_1 += tu00_1*v1;
tuv200_1 += tu20_1*v0;
tuv020_1 += tu02_1*v0;
tuv002_1 += tu00_1*v2;
tuv110_1 += tu11_1*v0;
tuv101_1 += tu10_1*v1;
tuv011_1 += tu01_1*v1;
tuv100_2 += tu10_2*v0;
tuv010_2 += tu01_2*v0;
tuv001_2 += tu00_2*v1;
tuv200_2 += tu20_2*v0;
tuv020_2 += tu02_2*v0;
tuv002_2 += tu00_2*v2;
tuv110_2 += tu11_2*v0;
tuv101_2 += tu10_2*v1;
tuv011_2 += tu01_2*v1;
tuv000 += tu00*v0;
tuv100 += tu10*v0;
tuv010 += tu01*v0;
tuv001 += tu00*v1;
tuv200 += tu20*v0;
tuv020 += tu02*v0;
tuv002 += tu00*v2;
tuv110 += tu11*v0;
tuv101 += tu10*v1;
tuv011 += tu01*v1;
tuv300 += tu30*v0;
tuv030 += tu03*v0;
tuv003 += tu00*v3;
tuv210 += tu21*v0;
tuv201 += tu20*v1;
tuv120 += tu12*v0;
tuv021 += tu02*v1;
tuv102 += tu10*v2;
tuv012 += tu01*v2;
tuv111 += tu11*v1;
k++;
}
int idx;
numtyp fdip_buf[20];
fdip_buf[0] = (numtyp)0.0;
fdip_buf[1] = tuv100_1;
fdip_buf[2] = tuv010_1;
fdip_buf[3] = tuv001_1;
fdip_buf[4] = tuv200_1;
fdip_buf[5] = tuv020_1;
fdip_buf[6] = tuv002_1;
fdip_buf[7] = tuv110_1;
fdip_buf[8] = tuv101_1;
fdip_buf[9] = tuv011_1;
idx = ii;
for (int m = 0; m < 10; m++) {
fdip_phi1[idx] = fdip_buf[m];
idx += inum;
}
fdip_buf[0] = (numtyp)0.0;
fdip_buf[1] = tuv100_2;
fdip_buf[2] = tuv010_2;
fdip_buf[3] = tuv001_2;
fdip_buf[4] = tuv200_2;
fdip_buf[5] = tuv020_2;
fdip_buf[6] = tuv002_2;
fdip_buf[7] = tuv110_2;
fdip_buf[8] = tuv101_2;
fdip_buf[9] = tuv011_2;
idx = ii;
for (int m = 0; m < 10; m++) {
fdip_phi2[idx] = fdip_buf[m];
idx += inum;
}
fdip_buf[0] = tuv000;
fdip_buf[1] = tuv100;
fdip_buf[2] = tuv010;
fdip_buf[3] = tuv001;
fdip_buf[4] = tuv200;
fdip_buf[5] = tuv020;
fdip_buf[6] = tuv002;
fdip_buf[7] = tuv110;
fdip_buf[8] = tuv101;
fdip_buf[9] = tuv011;
fdip_buf[10] = tuv300;
fdip_buf[11] = tuv030;
fdip_buf[12] = tuv003;
fdip_buf[13] = tuv210;
fdip_buf[14] = tuv201;
fdip_buf[15] = tuv120;
fdip_buf[16] = tuv021;
fdip_buf[17] = tuv102;
fdip_buf[18] = tuv012;
fdip_buf[19] = tuv111;
idx = ii;
for (int m = 0; m < 20; m++) {
fdip_sum_phi[idx] = fdip_buf[m];
idx += inum;
}
}
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
scan standard neighbor list and make it compatible with 1-5 neighbors scan standard neighbor list and make it compatible with 1-5 neighbors
if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15 if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15

View File

@ -193,6 +193,20 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
} }
void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid, double ****host_grid_brick,
void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out,
bool& first_iteration) {
HIPPOMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2,
host_thetai3, igrid, host_grid_brick, host_fdip_phi1,
host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out,
nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration);
}
double hippo_gpu_bytes() { double hippo_gpu_bytes() {
return HIPPOMF.host_memory_usage(); return HIPPOMF.host_memory_usage();
} }

View File

@ -938,7 +938,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp)
} }
} }
*/ */
// accumulate the field and fieldp values from the real space portion from umutual2b() on the GPU // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU
// field and fieldp may already have some nonzero values from kspace (umutual1 and self) // field and fieldp may already have some nonzero values from kspace (umutual1 and self)
amoeba_gpu_update_fieldp(&fieldp_pinned); amoeba_gpu_update_fieldp(&fieldp_pinned);

View File

@ -18,7 +18,7 @@
#include "pair_hippo_gpu.h" #include "pair_hippo_gpu.h"
#include "amoeba_convolution.h" #include "amoeba_convolution_gpu.h"
#include "atom.h" #include "atom.h"
#include "comm.h" #include "comm.h"
#include "domain.h" #include "domain.h"
@ -46,6 +46,9 @@ enum{GEAR,ASPC,LSQR};
enum{BUILD,APPLY}; enum{BUILD,APPLY};
enum{GORDON1,GORDON2}; enum{GORDON1,GORDON2};
// same as in pair_amoeba.cpp
enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC};
#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye #define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye
// External functions from cuda library for atom decomposition // External functions from cuda library for atom decomposition
@ -102,6 +105,16 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
void hippo_gpu_update_fieldp(void **fieldp_ptr); void hippo_gpu_update_fieldp(void **fieldp_ptr);
void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid,
double ****host_grid_brick, void **host_fdip_phi1,
void **host_fdip_phi2, void **host_fdip_sum_phi,
const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out,
bool& first_iteration);
void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval,
const bool eflag, const bool vflag, const bool eatom, const bool vatom, const bool eflag, const bool vflag, const bool eatom, const bool vatom,
@ -129,8 +142,10 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
gpu_dispersion_real_ready = true; gpu_dispersion_real_ready = true;
gpu_multipole_real_ready = true; gpu_multipole_real_ready = true;
gpu_udirect2b_ready = true; gpu_udirect2b_ready = true;
gpu_umutual1_ready = true;
gpu_fphi_uind_ready = true;
gpu_umutual2b_ready = true; gpu_umutual2b_ready = true;
gpu_polar_real_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
} }
@ -198,6 +213,16 @@ void PairHippoGPU::init_style()
tq_single = false; tq_single = false;
else else
tq_single = true; tq_single = true;
// replace with the gpu counterpart
if (gpu_umutual1_ready) {
if (use_ewald && ic_kspace) {
delete ic_kspace;
ic_kspace =
new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC);
}
}
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@ -392,6 +417,8 @@ void PairHippoGPU::induce()
int debug = 1; int debug = 1;
first_induce_iteration = true;
// set cutoffs, taper coeffs, and PME params // set cutoffs, taper coeffs, and PME params
// create qfac here, free at end of polar() // create qfac here, free at end of polar()
@ -403,8 +430,6 @@ void PairHippoGPU::induce()
// owned atoms // owned atoms
double **x = atom->x;
double **f = atom->f;
int nlocal = atom->nlocal; int nlocal = atom->nlocal;
// zero out the induced dipoles at each site // zero out the induced dipoles at each site
@ -996,37 +1021,60 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp)
int i,j; int i,j;
double term; double term;
double time0,time1,time2;
// zero field,fieldp for owned and ghost atoms // zero field,fieldp for owned and ghost atoms
int nlocal = atom->nlocal; int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost; int nall = nlocal + atom->nghost;
for (i = 0; i < nall; i++) { memset(&field[0][0], 0, 3*nall *sizeof(double));
for (j = 0; j < 3; j++) { memset(&fieldp[0][0], 0, 3*nall *sizeof(double));
/*
for (int i = 0; i < nall; i++) {
for (int j = 0; j < 3; j++) {
field[i][j] = 0.0; field[i][j] = 0.0;
fieldp[i][j] = 0.0; fieldp[i][j] = 0.0;
} }
} }
*/
// get the real space portion of the mutual field first // get the real space portion of the mutual field first
MPI_Barrier(world);
time0 = MPI_Wtime();
if (polar_rspace_flag) umutual2b(field,fieldp); if (polar_rspace_flag) umutual2b(field,fieldp);
time1 = MPI_Wtime();
// get the reciprocal space part of the mutual field // get the reciprocal space part of the mutual field
if (polar_kspace_flag) umutual1(field,fieldp); if (polar_kspace_flag) umutual1(field,fieldp);
time2 = MPI_Wtime();
// add the self-energy portion of the mutual field // add the self-energy portion of the mutual field
term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS;
for (int i = 0; i < nlocal; i++) {
field[i][0] += term*uind[i][0];
field[i][1] += term*uind[i][1];
field[i][2] += term*uind[i][2];
}
for (int i = 0; i < nlocal; i++) {
fieldp[i][0] += term*uinp[i][0];
fieldp[i][1] += term*uinp[i][1];
fieldp[i][2] += term*uinp[i][2];
}
/*
for (i = 0; i < nlocal; i++) { for (i = 0; i < nlocal; i++) {
for (j = 0; j < 3; j++) { for (j = 0; j < 3; j++) {
field[i][j] += term*uind[i][j]; field[i][j] += term*uind[i][j];
fieldp[i][j] += term*uinp[i][j]; fieldp[i][j] += term*uinp[i][j];
} }
} }
*/
// accumulate the field and fieldp values from real-space portion from umutual2b() on the GPU // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU
// field and fieldp may already have some nonzero values from kspace (umutual1 and self) // field and fieldp may already have some nonzero values from kspace (umutual1 and self)
hippo_gpu_update_fieldp(&fieldp_pinned); hippo_gpu_update_fieldp(&fieldp_pinned);
@ -1049,6 +1097,228 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp)
fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][1] += fieldp_ptr[idx+1];
fieldp[i][2] += fieldp_ptr[idx+2]; fieldp[i][2] += fieldp_ptr[idx+2];
} }
// accumulate timing information
time_mutual_rspace += time1 - time0;
time_mutual_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
umutual1 = Ewald recip mutual induced field
umutual1 computes the reciprocal space contribution of the
induced atomic dipole moments to the field
------------------------------------------------------------------------- */
void PairHippoGPU::umutual1(double **field, double **fieldp)
{
int m,n;
int nxlo,nxhi,nylo,nyhi,nzlo,nzhi;
double term;
double a[3][3]; // indices not flipped vs Fortran
// return if the Ewald coefficient is zero
if (aewald < 1.0e-6) return;
// convert Cartesian dipoles to fractional coordinates
for (int j = 0; j < 3; j++) {
a[0][j] = nfft1 * recip[0][j];
a[1][j] = nfft2 * recip[1][j];
a[2][j] = nfft3 * recip[2][j];
}
int nlocal = atom->nlocal;
for (int i = 0; i < nlocal; i++) {
fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2];
fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2];
fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2];
}
for (int i = 0; i < nlocal; i++) {
fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2];
fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2];
fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2];
}
/*
for (i = 0; i < nlocal; i++) {
for (j = 0; j < 3; j++) {
fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2];
fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2];
}
}
*/
// gridpre = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpre = (double ****) ic_kspace->zero();
// map 2 values to grid
grid_uind(fuind,fuinp,gridpre);
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = ic_kspace->pre_convolution();
// ---------------------
// convolution operation
// ---------------------
nxlo = ic_kspace->nxlo_fft;
nxhi = ic_kspace->nxhi_fft;
nylo = ic_kspace->nylo_fft;
nyhi = ic_kspace->nyhi_fft;
nzlo = ic_kspace->nzlo_fft;
nzhi = ic_kspace->nzhi_fft;
// use qfac values stored in udirect1()
m = n = 0;
for (int k = nzlo; k <= nzhi; k++) {
for (int j = nylo; j <= nyhi; j++) {
for (int i = nxlo; i <= nxhi; i++) {
term = qfac[m++];
gridfft[n] *= term;
gridfft[n+1] *= term;
n += 2;
}
}
}
// post-convolution operations including backward FFT
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpost = (double ****) ic_kspace->post_convolution();
// get potential
double time0, time1;
MPI_Barrier(world);
time0 = MPI_Wtime();
fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
time1 = MPI_Wtime();
time_fphi_uind += (time1 - time0);
// store fractional reciprocal potentials for OPT method
if (poltyp == OPT) {
for (int i = 0; i < nlocal; i++) {
for (int j = 0; j < 10; j++) {
fopt[i][optlevel][j] = fdip_phi1[i][j];
foptp[i][optlevel][j] = fdip_phi2[i][j];
}
}
}
// convert the dipole fields from fractional to Cartesian
for (int i = 0; i < 3; i++) {
a[0][i] = nfft1 * recip[0][i];
a[1][i] = nfft2 * recip[1][i];
a[2][i] = nfft3 * recip[2][i];
}
for (int i = 0; i < nlocal; i++) {
double dfx = a[0][0]*fdip_phi1[i][1] +
a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3];
double dfy = a[1][0]*fdip_phi1[i][1] +
a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3];
double dfz = a[2][0]*fdip_phi1[i][1] +
a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3];
field[i][0] -= dfx;
field[i][1] -= dfy;
field[i][2] -= dfz;
}
for (int i = 0; i < nlocal; i++) {
double dfx = a[0][0]*fdip_phi2[i][1] +
a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3];
double dfy = a[1][0]*fdip_phi2[i][1] +
a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3];
double dfz = a[2][0]*fdip_phi2[i][1] +
a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3];
fieldp[i][0] -= dfx;
fieldp[i][1] -= dfy;
fieldp[i][2] -= dfz;
}
/*
for (int i = 0; i < nlocal; i++) {
for (j = 0; j < 3; j++) {
dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] +
a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3];
dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] +
a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3];
}
}
// increment the field at each multipole site
for (i = 0; i < nlocal; i++) {
for (j = 0; j < 3; j++) {
field[i][j] -= dipfield1[i][j];
fieldp[i][j] -= dipfield2[i][j];
}
}
*/
}
/* ----------------------------------------------------------------------
fphi_uind = induced potential from grid
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
------------------------------------------------------------------------- */
void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1,
double **fdip_phi2, double **fdip_sum_phi)
{
if (!gpu_fphi_uind_ready) {
PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi);
return;
}
void* fdip_phi1_pinned = nullptr;
void* fdip_phi2_pinned = nullptr;
void* fdip_sum_phi_pinned = nullptr;
hippo_gpu_fphi_uind(atom->nlocal, bsorder, thetai1,
thetai2, thetai3, igrid, grid,
&fdip_phi1_pinned, &fdip_phi2_pinned,
&fdip_sum_phi_pinned,
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
ic_kspace->nylo_out, ic_kspace->nyhi_out,
ic_kspace->nxlo_out, ic_kspace->nxhi_out,
first_induce_iteration);
int nlocal = atom->nlocal;
double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned;
for (int i = 0; i < nlocal; i++) {
int n = i;
for (int m = 0; m < 10; m++) {
fdip_phi1[i][m] = _fdip_phi1_ptr[n];
n += nlocal;
}
}
double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned;
for (int i = 0; i < nlocal; i++) {
int n = i;
for (int m = 0; m < 10; m++) {
fdip_phi2[i][m] = _fdip_phi2_ptr[n];
n += nlocal;
}
}
double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
for (int i = 0; i < nlocal; i++) {
int n = i;
for (int m = 0; m < 20; m++) {
fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n];
n += nlocal;
}
}
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@ -1089,29 +1359,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp)
double *pval = atom->dvector[index_pval]; double *pval = atom->dvector[index_pval];
hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval,
aewald, off2, &fieldp_pinned); aewald, off2, &fieldp_pinned);
/*
// accumulate the field and fieldp values from the GPU lib
// field and fieldp may already have some nonzero values from kspace (umutual1)
int nlocal = atom->nlocal;
double *field_ptr = (double *)fieldp_pinned;
for (int i = 0; i < nlocal; i++) {
int idx = 4*i;
field[i][0] += field_ptr[idx];
field[i][1] += field_ptr[idx+1];
field[i][2] += field_ptr[idx+2];
}
double* fieldp_ptr = (double *)fieldp_pinned;
fieldp_ptr += 4*inum;
for (int i = 0; i < nlocal; i++) {
int idx = 4*i;
fieldp[i][0] += fieldp_ptr[idx];
fieldp[i][1] += fieldp_ptr[idx+1];
fieldp[i][2] += fieldp_ptr[idx+2];
}
*/
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------

View File

@ -39,6 +39,8 @@ class PairHippoGPU : public PairAmoeba {
virtual void dispersion_real(); virtual void dispersion_real();
virtual void multipole_real(); virtual void multipole_real();
virtual void udirect2b(double **, double **); virtual void udirect2b(double **, double **);
virtual void umutual1(double **, double **);
virtual void fphi_uind(double ****, double **, double **, double **);
virtual void umutual2b(double **, double **); virtual void umutual2b(double **, double **);
virtual void ufield0c(double **, double **); virtual void ufield0c(double **, double **);
virtual void polar_real(); virtual void polar_real();
@ -55,9 +57,13 @@ class PairHippoGPU : public PairAmoeba {
bool gpu_dispersion_real_ready; bool gpu_dispersion_real_ready;
bool gpu_multipole_real_ready; bool gpu_multipole_real_ready;
bool gpu_udirect2b_ready; bool gpu_udirect2b_ready;
bool gpu_umutual1_ready;
bool gpu_fphi_uind_ready;
bool gpu_umutual2b_ready; bool gpu_umutual2b_ready;
bool gpu_polar_real_ready; bool gpu_polar_real_ready;
bool first_induce_iteration;
void udirect2b_cpu(); void udirect2b_cpu();
template<class numtyp> template<class numtyp>