Enabled fphi_uind in hippo/gpu, really need to refactor hippo and amoeba in the GPU lib to remove kernel duplicates
This commit is contained in:
@ -68,31 +68,7 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
|
|||||||
|
|
||||||
# host code compilation
|
# host code compilation
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
|
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
|
||||||
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
|
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
|
|
||||||
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
|
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
|
|
||||||
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
|
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
|
|
||||||
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
|
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
|
|
||||||
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
|
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
|
|
||||||
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
|
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
|
|
||||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
|
|
||||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
|
||||||
|
|
||||||
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
|
|
||||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||||
|
|
||||||
#ifdef CUDPP_OPT
|
#ifdef CUDPP_OPT
|
||||||
|
|||||||
@ -64,7 +64,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
|
|||||||
cell_size,gpu_split,_screen,amoeba,
|
cell_size,gpu_split,_screen,amoeba,
|
||||||
"k_amoeba_multipole", "k_amoeba_udirect2b",
|
"k_amoeba_multipole", "k_amoeba_udirect2b",
|
||||||
"k_amoeba_umutual2b", "k_amoeba_polar",
|
"k_amoeba_umutual2b", "k_amoeba_polar",
|
||||||
"k_amoeba_short_nbor", "k_amoeba_special15");
|
"k_amoeba_fphi_uind", "k_amoeba_short_nbor",
|
||||||
|
"k_amoeba_special15");
|
||||||
if (success!=0)
|
if (success!=0)
|
||||||
return success;
|
return success;
|
||||||
|
|
||||||
|
|||||||
@ -1626,7 +1626,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
|||||||
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
__kernel void k_fphi_uind(const __global numtyp4 *restrict thetai1,
|
__kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
|
||||||
const __global numtyp4 *restrict thetai2,
|
const __global numtyp4 *restrict thetai2,
|
||||||
const __global numtyp4 *restrict thetai3,
|
const __global numtyp4 *restrict thetai3,
|
||||||
const __global int *restrict igrid,
|
const __global int *restrict igrid,
|
||||||
|
|||||||
@ -65,6 +65,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
|||||||
const char *k_name_udirect2b,
|
const char *k_name_udirect2b,
|
||||||
const char *k_name_umutual2b,
|
const char *k_name_umutual2b,
|
||||||
const char *k_name_polar,
|
const char *k_name_polar,
|
||||||
|
const char *k_name_fphi_uind,
|
||||||
const char *k_name_short_nbor,
|
const char *k_name_short_nbor,
|
||||||
const char* k_name_special15) {
|
const char* k_name_special15) {
|
||||||
screen=_screen;
|
screen=_screen;
|
||||||
@ -100,7 +101,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
|||||||
_block_bio_size=device->block_bio_pair();
|
_block_bio_size=device->block_bio_pair();
|
||||||
compile_kernels(*ucl_device,pair_program,k_name_multipole,
|
compile_kernels(*ucl_device,pair_program,k_name_multipole,
|
||||||
k_name_udirect2b, k_name_umutual2b,k_name_polar,
|
k_name_udirect2b, k_name_umutual2b,k_name_polar,
|
||||||
k_name_short_nbor, k_name_special15);
|
k_name_fphi_uind, k_name_short_nbor, k_name_special15);
|
||||||
|
|
||||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||||
nbor->packing(true);
|
nbor->packing(true);
|
||||||
@ -934,6 +935,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
|||||||
const char *kname_udirect2b,
|
const char *kname_udirect2b,
|
||||||
const char *kname_umutual2b,
|
const char *kname_umutual2b,
|
||||||
const char *kname_polar,
|
const char *kname_polar,
|
||||||
|
const char *kname_fphi_uind,
|
||||||
const char *kname_short_nbor,
|
const char *kname_short_nbor,
|
||||||
const char* kname_special15) {
|
const char* kname_special15) {
|
||||||
if (_compiled)
|
if (_compiled)
|
||||||
@ -942,17 +944,17 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
|||||||
if (pair_program) delete pair_program;
|
if (pair_program) delete pair_program;
|
||||||
pair_program=new UCL_Program(dev);
|
pair_program=new UCL_Program(dev);
|
||||||
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
||||||
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen);
|
||||||
|
|
||||||
k_multipole.set_function(*pair_program,kname_multipole);
|
k_multipole.set_function(*pair_program, kname_multipole);
|
||||||
k_udirect2b.set_function(*pair_program,kname_udirect2b);
|
k_udirect2b.set_function(*pair_program, kname_udirect2b);
|
||||||
k_umutual2b.set_function(*pair_program,kname_umutual2b);
|
k_umutual2b.set_function(*pair_program, kname_umutual2b);
|
||||||
k_polar.set_function(*pair_program,kname_polar);
|
k_polar.set_function(*pair_program, kname_polar);
|
||||||
k_fphi_uind.set_function(*pair_program,"k_fphi_uind");
|
k_fphi_uind.set_function(*pair_program, kname_fphi_uind);
|
||||||
k_short_nbor.set_function(*pair_program,kname_short_nbor);
|
k_short_nbor.set_function(*pair_program, kname_short_nbor);
|
||||||
k_special15.set_function(*pair_program,kname_special15);
|
k_special15.set_function(*pair_program, kname_special15);
|
||||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
pos_tex.get_texture(*pair_program, "pos_tex");
|
||||||
q_tex.get_texture(*pair_program,"q_tex");
|
q_tex.get_texture(*pair_program, "q_tex");
|
||||||
|
|
||||||
_compiled=true;
|
_compiled=true;
|
||||||
|
|
||||||
|
|||||||
@ -62,9 +62,10 @@ class BaseAmoeba {
|
|||||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||||
const double gpu_split, FILE *screen, const void *pair_program,
|
const double gpu_split, FILE *screen, const void *pair_program,
|
||||||
const char *kname_multipole,
|
const char *kname_multipole, const char *kname_udirect2b,
|
||||||
const char *kname_udirect2b, const char *kname_umutual2b,
|
const char *kname_umutual2b, const char *kname_polar,
|
||||||
const char *kname_polar, const char *kname_short_nbor, const char* kname_special15);
|
const char *kname_fphi_uind, const char *kname_short_nbor,
|
||||||
|
const char* kname_special15);
|
||||||
|
|
||||||
/// Estimate the overhead for GPU context changes and CPU driver
|
/// Estimate the overhead for GPU context changes and CPU driver
|
||||||
void estimate_gpu_overhead(const int add_kernels=0);
|
void estimate_gpu_overhead(const int add_kernels=0);
|
||||||
@ -309,7 +310,8 @@ class BaseAmoeba {
|
|||||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||||
const char *kname_multipole, const char *kname_udirect2b,
|
const char *kname_multipole, const char *kname_udirect2b,
|
||||||
const char *kname_umutual2b, const char *kname_polar,
|
const char *kname_umutual2b, const char *kname_polar,
|
||||||
const char *kname_short_nbor, const char* kname_special15);
|
const char *kname_fphi_uind, const char *kname_short_nbor,
|
||||||
|
const char* kname_special15);
|
||||||
|
|
||||||
virtual int multipole_real(const int eflag, const int vflag) = 0;
|
virtual int multipole_real(const int eflag, const int vflag) = 0;
|
||||||
virtual int udirect2b(const int eflag, const int vflag) = 0;
|
virtual int udirect2b(const int eflag, const int vflag) = 0;
|
||||||
|
|||||||
@ -67,7 +67,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
|
|||||||
cell_size,gpu_split,_screen,hippo,
|
cell_size,gpu_split,_screen,hippo,
|
||||||
"k_hippo_multipole", "k_hippo_udirect2b",
|
"k_hippo_multipole", "k_hippo_udirect2b",
|
||||||
"k_hippo_umutual2b", "k_hippo_polar",
|
"k_hippo_umutual2b", "k_hippo_polar",
|
||||||
"k_hippo_short_nbor", "k_hippo_special15");
|
"k_hippo_fphi_uind", "k_hippo_short_nbor",
|
||||||
|
"k_hippo_special15");
|
||||||
if (success!=0)
|
if (success!=0)
|
||||||
return success;
|
return success;
|
||||||
|
|
||||||
|
|||||||
@ -2045,6 +2045,307 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
|
|||||||
offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
|
offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
|
||||||
|
fphi_uind = induced potential from grid
|
||||||
|
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
||||||
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
__kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
|
||||||
|
const __global numtyp4 *restrict thetai2,
|
||||||
|
const __global numtyp4 *restrict thetai3,
|
||||||
|
const __global int *restrict igrid,
|
||||||
|
const __global numtyp *restrict grid,
|
||||||
|
__global numtyp *restrict fdip_phi1,
|
||||||
|
__global numtyp *restrict fdip_phi2,
|
||||||
|
__global numtyp *restrict fdip_sum_phi,
|
||||||
|
const int bsorder, const int inum,
|
||||||
|
const int nzlo_out, const int nylo_out,
|
||||||
|
const int nxlo_out, const int ngridxy,
|
||||||
|
const int ngridx)
|
||||||
|
{
|
||||||
|
//int tid, ii, offset, i, n_stride;
|
||||||
|
//atom_info(t_per_atom,ii,tid,offset);
|
||||||
|
|
||||||
|
|
||||||
|
int tid=THREAD_ID_X;
|
||||||
|
int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;
|
||||||
|
|
||||||
|
if (ii<inum) {
|
||||||
|
|
||||||
|
int nlpts = (bsorder-1) / 2;
|
||||||
|
|
||||||
|
int istart = fast_mul(ii,4);
|
||||||
|
int igridx = igrid[istart];
|
||||||
|
int igridy = igrid[istart+1];
|
||||||
|
int igridz = igrid[istart+2];
|
||||||
|
|
||||||
|
// now istart is used to index thetai1, thetai2 and thetai3
|
||||||
|
istart = fast_mul(ii,bsorder);
|
||||||
|
|
||||||
|
// extract the permanent multipole field at each site
|
||||||
|
|
||||||
|
numtyp tuv100_1 = (numtyp)0.0;
|
||||||
|
numtyp tuv010_1 = (numtyp)0.0;
|
||||||
|
numtyp tuv001_1 = (numtyp)0.0;
|
||||||
|
numtyp tuv200_1 = (numtyp)0.0;
|
||||||
|
numtyp tuv020_1 = (numtyp)0.0;
|
||||||
|
numtyp tuv002_1 = (numtyp)0.0;
|
||||||
|
numtyp tuv110_1 = (numtyp)0.0;
|
||||||
|
numtyp tuv101_1 = (numtyp)0.0;
|
||||||
|
numtyp tuv011_1 = (numtyp)0.0;
|
||||||
|
numtyp tuv100_2 = (numtyp)0.0;
|
||||||
|
numtyp tuv010_2 = (numtyp)0.0;
|
||||||
|
numtyp tuv001_2 = (numtyp)0.0;
|
||||||
|
numtyp tuv200_2 = (numtyp)0.0;
|
||||||
|
numtyp tuv020_2 = (numtyp)0.0;
|
||||||
|
numtyp tuv002_2 = (numtyp)0.0;
|
||||||
|
numtyp tuv110_2 = (numtyp)0.0;
|
||||||
|
numtyp tuv101_2 = (numtyp)0.0;
|
||||||
|
numtyp tuv011_2 = (numtyp)0.0;
|
||||||
|
numtyp tuv000 = (numtyp)0.0;
|
||||||
|
numtyp tuv001 = (numtyp)0.0;
|
||||||
|
numtyp tuv010 = (numtyp)0.0;
|
||||||
|
numtyp tuv100 = (numtyp)0.0;
|
||||||
|
numtyp tuv200 = (numtyp)0.0;
|
||||||
|
numtyp tuv020 = (numtyp)0.0;
|
||||||
|
numtyp tuv002 = (numtyp)0.0;
|
||||||
|
numtyp tuv110 = (numtyp)0.0;
|
||||||
|
numtyp tuv101 = (numtyp)0.0;
|
||||||
|
numtyp tuv011 = (numtyp)0.0;
|
||||||
|
numtyp tuv300 = (numtyp)0.0;
|
||||||
|
numtyp tuv030 = (numtyp)0.0;
|
||||||
|
numtyp tuv003 = (numtyp)0.0;
|
||||||
|
numtyp tuv210 = (numtyp)0.0;
|
||||||
|
numtyp tuv201 = (numtyp)0.0;
|
||||||
|
numtyp tuv120 = (numtyp)0.0;
|
||||||
|
numtyp tuv021 = (numtyp)0.0;
|
||||||
|
numtyp tuv102 = (numtyp)0.0;
|
||||||
|
numtyp tuv012 = (numtyp)0.0;
|
||||||
|
numtyp tuv111 = (numtyp)0.0;
|
||||||
|
|
||||||
|
int k = (igridz - nzlo_out) - nlpts;
|
||||||
|
for (int kb = 0; kb < bsorder; kb++) {
|
||||||
|
/*
|
||||||
|
v0 = thetai3[m][kb][0];
|
||||||
|
v1 = thetai3[m][kb][1];
|
||||||
|
v2 = thetai3[m][kb][2];
|
||||||
|
v3 = thetai3[m][kb][3];
|
||||||
|
*/
|
||||||
|
int i3 = istart + kb;
|
||||||
|
numtyp4 tha3 = thetai3[i3];
|
||||||
|
numtyp v0 = tha3.x;
|
||||||
|
numtyp v1 = tha3.y;
|
||||||
|
numtyp v2 = tha3.z;
|
||||||
|
numtyp v3 = tha3.w;
|
||||||
|
numtyp tu00_1 = (numtyp)0.0;
|
||||||
|
numtyp tu01_1 = (numtyp)0.0;
|
||||||
|
numtyp tu10_1 = (numtyp)0.0;
|
||||||
|
numtyp tu20_1 = (numtyp)0.0;
|
||||||
|
numtyp tu11_1 = (numtyp)0.0;
|
||||||
|
numtyp tu02_1 = (numtyp)0.0;
|
||||||
|
numtyp tu00_2 = (numtyp)0.0;
|
||||||
|
numtyp tu01_2 = (numtyp)0.0;
|
||||||
|
numtyp tu10_2 = (numtyp)0.0;
|
||||||
|
numtyp tu20_2 = (numtyp)0.0;
|
||||||
|
numtyp tu11_2 = (numtyp)0.0;
|
||||||
|
numtyp tu02_2 = (numtyp)0.0;
|
||||||
|
numtyp tu00 = (numtyp)0.0;
|
||||||
|
numtyp tu10 = (numtyp)0.0;
|
||||||
|
numtyp tu01 = (numtyp)0.0;
|
||||||
|
numtyp tu20 = (numtyp)0.0;
|
||||||
|
numtyp tu11 = (numtyp)0.0;
|
||||||
|
numtyp tu02 = (numtyp)0.0;
|
||||||
|
numtyp tu30 = (numtyp)0.0;
|
||||||
|
numtyp tu21 = (numtyp)0.0;
|
||||||
|
numtyp tu12 = (numtyp)0.0;
|
||||||
|
numtyp tu03 = (numtyp)0.0;
|
||||||
|
|
||||||
|
int j = (igridy - nylo_out) - nlpts;
|
||||||
|
for (int jb = 0; jb < bsorder; jb++) {
|
||||||
|
/*
|
||||||
|
u0 = thetai2[m][jb][0];
|
||||||
|
u1 = thetai2[m][jb][1];
|
||||||
|
u2 = thetai2[m][jb][2];
|
||||||
|
u3 = thetai2[m][jb][3];
|
||||||
|
*/
|
||||||
|
int i2 = istart + jb;
|
||||||
|
numtyp4 tha2 = thetai2[i2];
|
||||||
|
numtyp u0 = tha2.x;
|
||||||
|
numtyp u1 = tha2.y;
|
||||||
|
numtyp u2 = tha2.z;
|
||||||
|
numtyp u3 = tha2.w;
|
||||||
|
numtyp t0_1 = (numtyp)0.0;
|
||||||
|
numtyp t1_1 = (numtyp)0.0;
|
||||||
|
numtyp t2_1 = (numtyp)0.0;
|
||||||
|
numtyp t0_2 = (numtyp)0.0;
|
||||||
|
numtyp t1_2 = (numtyp)0.0;
|
||||||
|
numtyp t2_2 = (numtyp)0.0;
|
||||||
|
numtyp t3 = (numtyp)0.0;
|
||||||
|
|
||||||
|
int i = (igridx - nxlo_out) - nlpts;
|
||||||
|
for (int ib = 0; ib < bsorder; ib++) {
|
||||||
|
/*
|
||||||
|
tq_1 = grid[k][j][i][0];
|
||||||
|
tq_2 = grid[k][j][i][1];
|
||||||
|
t0_1 += tq_1*thetai1[m][ib][0];
|
||||||
|
t1_1 += tq_1*thetai1[m][ib][1];
|
||||||
|
t2_1 += tq_1*thetai1[m][ib][2];
|
||||||
|
t0_2 += tq_2*thetai1[m][ib][0];
|
||||||
|
t1_2 += tq_2*thetai1[m][ib][1];
|
||||||
|
t2_2 += tq_2*thetai1[m][ib][2];
|
||||||
|
t3 += (tq_1+tq_2)*thetai1[m][ib][3];
|
||||||
|
*/
|
||||||
|
int i1 = istart + ib;
|
||||||
|
numtyp4 tha1 = thetai1[i1];
|
||||||
|
numtyp w0 = tha1.x;
|
||||||
|
numtyp w1 = tha1.y;
|
||||||
|
numtyp w2 = tha1.z;
|
||||||
|
numtyp w3 = tha1.w;
|
||||||
|
int gidx = 2*(k*ngridxy + j*ngridx + i);
|
||||||
|
numtyp tq_1 = grid[gidx];
|
||||||
|
numtyp tq_2 = grid[gidx+1];
|
||||||
|
t0_1 += tq_1*w0;
|
||||||
|
t1_1 += tq_1*w1;
|
||||||
|
t2_1 += tq_1*w2;
|
||||||
|
t0_2 += tq_2*w0;
|
||||||
|
t1_2 += tq_2*w1;
|
||||||
|
t2_2 += tq_2*w2;
|
||||||
|
t3 += (tq_1+tq_2)*w3;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
tu00_1 += t0_1*u0;
|
||||||
|
tu10_1 += t1_1*u0;
|
||||||
|
tu01_1 += t0_1*u1;
|
||||||
|
tu20_1 += t2_1*u0;
|
||||||
|
tu11_1 += t1_1*u1;
|
||||||
|
tu02_1 += t0_1*u2;
|
||||||
|
tu00_2 += t0_2*u0;
|
||||||
|
tu10_2 += t1_2*u0;
|
||||||
|
tu01_2 += t0_2*u1;
|
||||||
|
tu20_2 += t2_2*u0;
|
||||||
|
tu11_2 += t1_2*u1;
|
||||||
|
tu02_2 += t0_2*u2;
|
||||||
|
numtyp t0 = t0_1 + t0_2;
|
||||||
|
numtyp t1 = t1_1 + t1_2;
|
||||||
|
numtyp t2 = t2_1 + t2_2;
|
||||||
|
tu00 += t0*u0;
|
||||||
|
tu10 += t1*u0;
|
||||||
|
tu01 += t0*u1;
|
||||||
|
tu20 += t2*u0;
|
||||||
|
tu11 += t1*u1;
|
||||||
|
tu02 += t0*u2;
|
||||||
|
tu30 += t3*u0;
|
||||||
|
tu21 += t2*u1;
|
||||||
|
tu12 += t1*u2;
|
||||||
|
tu03 += t0*u3;
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
|
||||||
|
tuv100_1 += tu10_1*v0;
|
||||||
|
tuv010_1 += tu01_1*v0;
|
||||||
|
tuv001_1 += tu00_1*v1;
|
||||||
|
tuv200_1 += tu20_1*v0;
|
||||||
|
tuv020_1 += tu02_1*v0;
|
||||||
|
tuv002_1 += tu00_1*v2;
|
||||||
|
tuv110_1 += tu11_1*v0;
|
||||||
|
tuv101_1 += tu10_1*v1;
|
||||||
|
tuv011_1 += tu01_1*v1;
|
||||||
|
tuv100_2 += tu10_2*v0;
|
||||||
|
tuv010_2 += tu01_2*v0;
|
||||||
|
tuv001_2 += tu00_2*v1;
|
||||||
|
tuv200_2 += tu20_2*v0;
|
||||||
|
tuv020_2 += tu02_2*v0;
|
||||||
|
tuv002_2 += tu00_2*v2;
|
||||||
|
tuv110_2 += tu11_2*v0;
|
||||||
|
tuv101_2 += tu10_2*v1;
|
||||||
|
tuv011_2 += tu01_2*v1;
|
||||||
|
tuv000 += tu00*v0;
|
||||||
|
tuv100 += tu10*v0;
|
||||||
|
tuv010 += tu01*v0;
|
||||||
|
tuv001 += tu00*v1;
|
||||||
|
tuv200 += tu20*v0;
|
||||||
|
tuv020 += tu02*v0;
|
||||||
|
tuv002 += tu00*v2;
|
||||||
|
tuv110 += tu11*v0;
|
||||||
|
tuv101 += tu10*v1;
|
||||||
|
tuv011 += tu01*v1;
|
||||||
|
tuv300 += tu30*v0;
|
||||||
|
tuv030 += tu03*v0;
|
||||||
|
tuv003 += tu00*v3;
|
||||||
|
tuv210 += tu21*v0;
|
||||||
|
tuv201 += tu20*v1;
|
||||||
|
tuv120 += tu12*v0;
|
||||||
|
tuv021 += tu02*v1;
|
||||||
|
tuv102 += tu10*v2;
|
||||||
|
tuv012 += tu01*v2;
|
||||||
|
tuv111 += tu11*v1;
|
||||||
|
k++;
|
||||||
|
}
|
||||||
|
|
||||||
|
int idx;
|
||||||
|
numtyp fdip_buf[20];
|
||||||
|
|
||||||
|
fdip_buf[0] = (numtyp)0.0;
|
||||||
|
fdip_buf[1] = tuv100_1;
|
||||||
|
fdip_buf[2] = tuv010_1;
|
||||||
|
fdip_buf[3] = tuv001_1;
|
||||||
|
fdip_buf[4] = tuv200_1;
|
||||||
|
fdip_buf[5] = tuv020_1;
|
||||||
|
fdip_buf[6] = tuv002_1;
|
||||||
|
fdip_buf[7] = tuv110_1;
|
||||||
|
fdip_buf[8] = tuv101_1;
|
||||||
|
fdip_buf[9] = tuv011_1;
|
||||||
|
idx = ii;
|
||||||
|
for (int m = 0; m < 10; m++) {
|
||||||
|
fdip_phi1[idx] = fdip_buf[m];
|
||||||
|
idx += inum;
|
||||||
|
}
|
||||||
|
|
||||||
|
fdip_buf[0] = (numtyp)0.0;
|
||||||
|
fdip_buf[1] = tuv100_2;
|
||||||
|
fdip_buf[2] = tuv010_2;
|
||||||
|
fdip_buf[3] = tuv001_2;
|
||||||
|
fdip_buf[4] = tuv200_2;
|
||||||
|
fdip_buf[5] = tuv020_2;
|
||||||
|
fdip_buf[6] = tuv002_2;
|
||||||
|
fdip_buf[7] = tuv110_2;
|
||||||
|
fdip_buf[8] = tuv101_2;
|
||||||
|
fdip_buf[9] = tuv011_2;
|
||||||
|
idx = ii;
|
||||||
|
for (int m = 0; m < 10; m++) {
|
||||||
|
fdip_phi2[idx] = fdip_buf[m];
|
||||||
|
idx += inum;
|
||||||
|
}
|
||||||
|
|
||||||
|
fdip_buf[0] = tuv000;
|
||||||
|
fdip_buf[1] = tuv100;
|
||||||
|
fdip_buf[2] = tuv010;
|
||||||
|
fdip_buf[3] = tuv001;
|
||||||
|
fdip_buf[4] = tuv200;
|
||||||
|
fdip_buf[5] = tuv020;
|
||||||
|
fdip_buf[6] = tuv002;
|
||||||
|
fdip_buf[7] = tuv110;
|
||||||
|
fdip_buf[8] = tuv101;
|
||||||
|
fdip_buf[9] = tuv011;
|
||||||
|
fdip_buf[10] = tuv300;
|
||||||
|
fdip_buf[11] = tuv030;
|
||||||
|
fdip_buf[12] = tuv003;
|
||||||
|
fdip_buf[13] = tuv210;
|
||||||
|
fdip_buf[14] = tuv201;
|
||||||
|
fdip_buf[15] = tuv120;
|
||||||
|
fdip_buf[16] = tuv021;
|
||||||
|
fdip_buf[17] = tuv102;
|
||||||
|
fdip_buf[18] = tuv012;
|
||||||
|
fdip_buf[19] = tuv111;
|
||||||
|
idx = ii;
|
||||||
|
for (int m = 0; m < 20; m++) {
|
||||||
|
fdip_sum_phi[idx] = fdip_buf[m];
|
||||||
|
idx += inum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
scan standard neighbor list and make it compatible with 1-5 neighbors
|
scan standard neighbor list and make it compatible with 1-5 neighbors
|
||||||
if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
|
if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
|
||||||
|
|||||||
@ -193,6 +193,20 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **
|
|||||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
|
||||||
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
|
double ***host_thetai3, int** igrid, double ****host_grid_brick,
|
||||||
|
void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
|
||||||
|
const int nzlo_out, const int nzhi_out,
|
||||||
|
const int nylo_out, const int nyhi_out,
|
||||||
|
const int nxlo_out, const int nxhi_out,
|
||||||
|
bool& first_iteration) {
|
||||||
|
HIPPOMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2,
|
||||||
|
host_thetai3, igrid, host_grid_brick, host_fdip_phi1,
|
||||||
|
host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out,
|
||||||
|
nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration);
|
||||||
|
}
|
||||||
|
|
||||||
double hippo_gpu_bytes() {
|
double hippo_gpu_bytes() {
|
||||||
return HIPPOMF.host_memory_usage();
|
return HIPPOMF.host_memory_usage();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -938,7 +938,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
// accumulate the field and fieldp values from the real space portion from umutual2b() on the GPU
|
// accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU
|
||||||
// field and fieldp may already have some nonzero values from kspace (umutual1 and self)
|
// field and fieldp may already have some nonzero values from kspace (umutual1 and self)
|
||||||
|
|
||||||
amoeba_gpu_update_fieldp(&fieldp_pinned);
|
amoeba_gpu_update_fieldp(&fieldp_pinned);
|
||||||
|
|||||||
@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
#include "pair_hippo_gpu.h"
|
#include "pair_hippo_gpu.h"
|
||||||
|
|
||||||
#include "amoeba_convolution.h"
|
#include "amoeba_convolution_gpu.h"
|
||||||
#include "atom.h"
|
#include "atom.h"
|
||||||
#include "comm.h"
|
#include "comm.h"
|
||||||
#include "domain.h"
|
#include "domain.h"
|
||||||
@ -46,6 +46,9 @@ enum{GEAR,ASPC,LSQR};
|
|||||||
enum{BUILD,APPLY};
|
enum{BUILD,APPLY};
|
||||||
enum{GORDON1,GORDON2};
|
enum{GORDON1,GORDON2};
|
||||||
|
|
||||||
|
// same as in pair_amoeba.cpp
|
||||||
|
enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC};
|
||||||
|
|
||||||
#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye
|
#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye
|
||||||
|
|
||||||
// External functions from cuda library for atom decomposition
|
// External functions from cuda library for atom decomposition
|
||||||
@ -102,6 +105,16 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
|
|||||||
|
|
||||||
void hippo_gpu_update_fieldp(void **fieldp_ptr);
|
void hippo_gpu_update_fieldp(void **fieldp_ptr);
|
||||||
|
|
||||||
|
void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
|
||||||
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
|
double ***host_thetai3, int** igrid,
|
||||||
|
double ****host_grid_brick, void **host_fdip_phi1,
|
||||||
|
void **host_fdip_phi2, void **host_fdip_sum_phi,
|
||||||
|
const int nzlo_out, const int nzhi_out,
|
||||||
|
const int nylo_out, const int nyhi_out,
|
||||||
|
const int nxlo_out, const int nxhi_out,
|
||||||
|
bool& first_iteration);
|
||||||
|
|
||||||
void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
|
void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
|
||||||
double **host_rpole, double **host_uind, double **host_uinp, double *host_pval,
|
double **host_rpole, double **host_uind, double **host_uinp, double *host_pval,
|
||||||
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
||||||
@ -129,8 +142,10 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
|
|||||||
gpu_dispersion_real_ready = true;
|
gpu_dispersion_real_ready = true;
|
||||||
gpu_multipole_real_ready = true;
|
gpu_multipole_real_ready = true;
|
||||||
gpu_udirect2b_ready = true;
|
gpu_udirect2b_ready = true;
|
||||||
|
gpu_umutual1_ready = true;
|
||||||
|
gpu_fphi_uind_ready = true;
|
||||||
gpu_umutual2b_ready = true;
|
gpu_umutual2b_ready = true;
|
||||||
gpu_polar_real_ready = true;
|
gpu_polar_real_ready = true; // need to be true for copying data from device back to host
|
||||||
|
|
||||||
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
||||||
}
|
}
|
||||||
@ -198,6 +213,16 @@ void PairHippoGPU::init_style()
|
|||||||
tq_single = false;
|
tq_single = false;
|
||||||
else
|
else
|
||||||
tq_single = true;
|
tq_single = true;
|
||||||
|
|
||||||
|
// replace with the gpu counterpart
|
||||||
|
|
||||||
|
if (gpu_umutual1_ready) {
|
||||||
|
if (use_ewald && ic_kspace) {
|
||||||
|
delete ic_kspace;
|
||||||
|
ic_kspace =
|
||||||
|
new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
@ -392,6 +417,8 @@ void PairHippoGPU::induce()
|
|||||||
|
|
||||||
int debug = 1;
|
int debug = 1;
|
||||||
|
|
||||||
|
first_induce_iteration = true;
|
||||||
|
|
||||||
// set cutoffs, taper coeffs, and PME params
|
// set cutoffs, taper coeffs, and PME params
|
||||||
// create qfac here, free at end of polar()
|
// create qfac here, free at end of polar()
|
||||||
|
|
||||||
@ -403,8 +430,6 @@ void PairHippoGPU::induce()
|
|||||||
|
|
||||||
// owned atoms
|
// owned atoms
|
||||||
|
|
||||||
double **x = atom->x;
|
|
||||||
double **f = atom->f;
|
|
||||||
int nlocal = atom->nlocal;
|
int nlocal = atom->nlocal;
|
||||||
|
|
||||||
// zero out the induced dipoles at each site
|
// zero out the induced dipoles at each site
|
||||||
@ -996,37 +1021,60 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp)
|
|||||||
int i,j;
|
int i,j;
|
||||||
double term;
|
double term;
|
||||||
|
|
||||||
|
double time0,time1,time2;
|
||||||
|
|
||||||
// zero field,fieldp for owned and ghost atoms
|
// zero field,fieldp for owned and ghost atoms
|
||||||
|
|
||||||
int nlocal = atom->nlocal;
|
int nlocal = atom->nlocal;
|
||||||
int nall = nlocal + atom->nghost;
|
int nall = nlocal + atom->nghost;
|
||||||
|
|
||||||
for (i = 0; i < nall; i++) {
|
memset(&field[0][0], 0, 3*nall *sizeof(double));
|
||||||
for (j = 0; j < 3; j++) {
|
memset(&fieldp[0][0], 0, 3*nall *sizeof(double));
|
||||||
|
|
||||||
|
/*
|
||||||
|
for (int i = 0; i < nall; i++) {
|
||||||
|
for (int j = 0; j < 3; j++) {
|
||||||
field[i][j] = 0.0;
|
field[i][j] = 0.0;
|
||||||
fieldp[i][j] = 0.0;
|
fieldp[i][j] = 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
// get the real space portion of the mutual field first
|
// get the real space portion of the mutual field first
|
||||||
|
|
||||||
|
MPI_Barrier(world);
|
||||||
|
time0 = MPI_Wtime();
|
||||||
|
|
||||||
if (polar_rspace_flag) umutual2b(field,fieldp);
|
if (polar_rspace_flag) umutual2b(field,fieldp);
|
||||||
|
time1 = MPI_Wtime();
|
||||||
|
|
||||||
// get the reciprocal space part of the mutual field
|
// get the reciprocal space part of the mutual field
|
||||||
|
|
||||||
if (polar_kspace_flag) umutual1(field,fieldp);
|
if (polar_kspace_flag) umutual1(field,fieldp);
|
||||||
|
time2 = MPI_Wtime();
|
||||||
|
|
||||||
// add the self-energy portion of the mutual field
|
// add the self-energy portion of the mutual field
|
||||||
|
|
||||||
term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS;
|
term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
field[i][0] += term*uind[i][0];
|
||||||
|
field[i][1] += term*uind[i][1];
|
||||||
|
field[i][2] += term*uind[i][2];
|
||||||
|
}
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
fieldp[i][0] += term*uinp[i][0];
|
||||||
|
fieldp[i][1] += term*uinp[i][1];
|
||||||
|
fieldp[i][2] += term*uinp[i][2];
|
||||||
|
}
|
||||||
|
/*
|
||||||
for (i = 0; i < nlocal; i++) {
|
for (i = 0; i < nlocal; i++) {
|
||||||
for (j = 0; j < 3; j++) {
|
for (j = 0; j < 3; j++) {
|
||||||
field[i][j] += term*uind[i][j];
|
field[i][j] += term*uind[i][j];
|
||||||
fieldp[i][j] += term*uinp[i][j];
|
fieldp[i][j] += term*uinp[i][j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
// accumulate the field and fieldp values from real-space portion from umutual2b() on the GPU
|
// accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU
|
||||||
// field and fieldp may already have some nonzero values from kspace (umutual1 and self)
|
// field and fieldp may already have some nonzero values from kspace (umutual1 and self)
|
||||||
|
|
||||||
hippo_gpu_update_fieldp(&fieldp_pinned);
|
hippo_gpu_update_fieldp(&fieldp_pinned);
|
||||||
@ -1049,6 +1097,228 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp)
|
|||||||
fieldp[i][1] += fieldp_ptr[idx+1];
|
fieldp[i][1] += fieldp_ptr[idx+1];
|
||||||
fieldp[i][2] += fieldp_ptr[idx+2];
|
fieldp[i][2] += fieldp_ptr[idx+2];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// accumulate timing information
|
||||||
|
|
||||||
|
time_mutual_rspace += time1 - time0;
|
||||||
|
time_mutual_kspace += time2 - time1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
|
||||||
|
umutual1 = Ewald recip mutual induced field
|
||||||
|
umutual1 computes the reciprocal space contribution of the
|
||||||
|
induced atomic dipole moments to the field
|
||||||
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
void PairHippoGPU::umutual1(double **field, double **fieldp)
|
||||||
|
{
|
||||||
|
int m,n;
|
||||||
|
int nxlo,nxhi,nylo,nyhi,nzlo,nzhi;
|
||||||
|
double term;
|
||||||
|
double a[3][3]; // indices not flipped vs Fortran
|
||||||
|
|
||||||
|
// return if the Ewald coefficient is zero
|
||||||
|
|
||||||
|
if (aewald < 1.0e-6) return;
|
||||||
|
|
||||||
|
// convert Cartesian dipoles to fractional coordinates
|
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) {
|
||||||
|
a[0][j] = nfft1 * recip[0][j];
|
||||||
|
a[1][j] = nfft2 * recip[1][j];
|
||||||
|
a[2][j] = nfft3 * recip[2][j];
|
||||||
|
}
|
||||||
|
|
||||||
|
int nlocal = atom->nlocal;
|
||||||
|
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2];
|
||||||
|
fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2];
|
||||||
|
fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2];
|
||||||
|
fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2];
|
||||||
|
fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2];
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
for (i = 0; i < nlocal; i++) {
|
||||||
|
for (j = 0; j < 3; j++) {
|
||||||
|
fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2];
|
||||||
|
fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
// gridpre = my portion of 4d grid in brick decomp w/ ghost values
|
||||||
|
|
||||||
|
double ****gridpre = (double ****) ic_kspace->zero();
|
||||||
|
|
||||||
|
// map 2 values to grid
|
||||||
|
|
||||||
|
grid_uind(fuind,fuinp,gridpre);
|
||||||
|
|
||||||
|
// pre-convolution operations including forward FFT
|
||||||
|
// gridfft = my portion of complex 3d grid in FFT decomposition
|
||||||
|
|
||||||
|
double *gridfft = ic_kspace->pre_convolution();
|
||||||
|
|
||||||
|
// ---------------------
|
||||||
|
// convolution operation
|
||||||
|
// ---------------------
|
||||||
|
|
||||||
|
nxlo = ic_kspace->nxlo_fft;
|
||||||
|
nxhi = ic_kspace->nxhi_fft;
|
||||||
|
nylo = ic_kspace->nylo_fft;
|
||||||
|
nyhi = ic_kspace->nyhi_fft;
|
||||||
|
nzlo = ic_kspace->nzlo_fft;
|
||||||
|
nzhi = ic_kspace->nzhi_fft;
|
||||||
|
|
||||||
|
// use qfac values stored in udirect1()
|
||||||
|
|
||||||
|
m = n = 0;
|
||||||
|
for (int k = nzlo; k <= nzhi; k++) {
|
||||||
|
for (int j = nylo; j <= nyhi; j++) {
|
||||||
|
for (int i = nxlo; i <= nxhi; i++) {
|
||||||
|
term = qfac[m++];
|
||||||
|
gridfft[n] *= term;
|
||||||
|
gridfft[n+1] *= term;
|
||||||
|
n += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// post-convolution operations including backward FFT
|
||||||
|
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
|
||||||
|
|
||||||
|
double ****gridpost = (double ****) ic_kspace->post_convolution();
|
||||||
|
|
||||||
|
// get potential
|
||||||
|
double time0, time1;
|
||||||
|
|
||||||
|
MPI_Barrier(world);
|
||||||
|
time0 = MPI_Wtime();
|
||||||
|
|
||||||
|
fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
|
||||||
|
|
||||||
|
time1 = MPI_Wtime();
|
||||||
|
time_fphi_uind += (time1 - time0);
|
||||||
|
|
||||||
|
// store fractional reciprocal potentials for OPT method
|
||||||
|
|
||||||
|
if (poltyp == OPT) {
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
for (int j = 0; j < 10; j++) {
|
||||||
|
fopt[i][optlevel][j] = fdip_phi1[i][j];
|
||||||
|
foptp[i][optlevel][j] = fdip_phi2[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// convert the dipole fields from fractional to Cartesian
|
||||||
|
|
||||||
|
for (int i = 0; i < 3; i++) {
|
||||||
|
a[0][i] = nfft1 * recip[0][i];
|
||||||
|
a[1][i] = nfft2 * recip[1][i];
|
||||||
|
a[2][i] = nfft3 * recip[2][i];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
double dfx = a[0][0]*fdip_phi1[i][1] +
|
||||||
|
a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3];
|
||||||
|
double dfy = a[1][0]*fdip_phi1[i][1] +
|
||||||
|
a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3];
|
||||||
|
double dfz = a[2][0]*fdip_phi1[i][1] +
|
||||||
|
a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3];
|
||||||
|
field[i][0] -= dfx;
|
||||||
|
field[i][1] -= dfy;
|
||||||
|
field[i][2] -= dfz;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
double dfx = a[0][0]*fdip_phi2[i][1] +
|
||||||
|
a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3];
|
||||||
|
double dfy = a[1][0]*fdip_phi2[i][1] +
|
||||||
|
a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3];
|
||||||
|
double dfz = a[2][0]*fdip_phi2[i][1] +
|
||||||
|
a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3];
|
||||||
|
fieldp[i][0] -= dfx;
|
||||||
|
fieldp[i][1] -= dfy;
|
||||||
|
fieldp[i][2] -= dfz;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
for (j = 0; j < 3; j++) {
|
||||||
|
dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] +
|
||||||
|
a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3];
|
||||||
|
dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] +
|
||||||
|
a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// increment the field at each multipole site
|
||||||
|
|
||||||
|
for (i = 0; i < nlocal; i++) {
|
||||||
|
for (j = 0; j < 3; j++) {
|
||||||
|
field[i][j] -= dipfield1[i][j];
|
||||||
|
fieldp[i][j] -= dipfield2[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
|
||||||
|
fphi_uind = induced potential from grid
|
||||||
|
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
||||||
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1,
|
||||||
|
double **fdip_phi2, double **fdip_sum_phi)
|
||||||
|
{
|
||||||
|
if (!gpu_fphi_uind_ready) {
|
||||||
|
PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
void* fdip_phi1_pinned = nullptr;
|
||||||
|
void* fdip_phi2_pinned = nullptr;
|
||||||
|
void* fdip_sum_phi_pinned = nullptr;
|
||||||
|
hippo_gpu_fphi_uind(atom->nlocal, bsorder, thetai1,
|
||||||
|
thetai2, thetai3, igrid, grid,
|
||||||
|
&fdip_phi1_pinned, &fdip_phi2_pinned,
|
||||||
|
&fdip_sum_phi_pinned,
|
||||||
|
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
|
||||||
|
ic_kspace->nylo_out, ic_kspace->nyhi_out,
|
||||||
|
ic_kspace->nxlo_out, ic_kspace->nxhi_out,
|
||||||
|
first_induce_iteration);
|
||||||
|
|
||||||
|
int nlocal = atom->nlocal;
|
||||||
|
double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int n = i;
|
||||||
|
for (int m = 0; m < 10; m++) {
|
||||||
|
fdip_phi1[i][m] = _fdip_phi1_ptr[n];
|
||||||
|
n += nlocal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int n = i;
|
||||||
|
for (int m = 0; m < 10; m++) {
|
||||||
|
fdip_phi2[i][m] = _fdip_phi2_ptr[n];
|
||||||
|
n += nlocal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int n = i;
|
||||||
|
for (int m = 0; m < 20; m++) {
|
||||||
|
fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n];
|
||||||
|
n += nlocal;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
@ -1089,29 +1359,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp)
|
|||||||
double *pval = atom->dvector[index_pval];
|
double *pval = atom->dvector[index_pval];
|
||||||
hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval,
|
hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval,
|
||||||
aewald, off2, &fieldp_pinned);
|
aewald, off2, &fieldp_pinned);
|
||||||
/*
|
|
||||||
// accumulate the field and fieldp values from the GPU lib
|
|
||||||
// field and fieldp may already have some nonzero values from kspace (umutual1)
|
|
||||||
|
|
||||||
int nlocal = atom->nlocal;
|
|
||||||
double *field_ptr = (double *)fieldp_pinned;
|
|
||||||
|
|
||||||
for (int i = 0; i < nlocal; i++) {
|
|
||||||
int idx = 4*i;
|
|
||||||
field[i][0] += field_ptr[idx];
|
|
||||||
field[i][1] += field_ptr[idx+1];
|
|
||||||
field[i][2] += field_ptr[idx+2];
|
|
||||||
}
|
|
||||||
|
|
||||||
double* fieldp_ptr = (double *)fieldp_pinned;
|
|
||||||
fieldp_ptr += 4*inum;
|
|
||||||
for (int i = 0; i < nlocal; i++) {
|
|
||||||
int idx = 4*i;
|
|
||||||
fieldp[i][0] += fieldp_ptr[idx];
|
|
||||||
fieldp[i][1] += fieldp_ptr[idx+1];
|
|
||||||
fieldp[i][2] += fieldp_ptr[idx+2];
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
|
|||||||
@ -39,6 +39,8 @@ class PairHippoGPU : public PairAmoeba {
|
|||||||
virtual void dispersion_real();
|
virtual void dispersion_real();
|
||||||
virtual void multipole_real();
|
virtual void multipole_real();
|
||||||
virtual void udirect2b(double **, double **);
|
virtual void udirect2b(double **, double **);
|
||||||
|
virtual void umutual1(double **, double **);
|
||||||
|
virtual void fphi_uind(double ****, double **, double **, double **);
|
||||||
virtual void umutual2b(double **, double **);
|
virtual void umutual2b(double **, double **);
|
||||||
virtual void ufield0c(double **, double **);
|
virtual void ufield0c(double **, double **);
|
||||||
virtual void polar_real();
|
virtual void polar_real();
|
||||||
@ -55,9 +57,13 @@ class PairHippoGPU : public PairAmoeba {
|
|||||||
bool gpu_dispersion_real_ready;
|
bool gpu_dispersion_real_ready;
|
||||||
bool gpu_multipole_real_ready;
|
bool gpu_multipole_real_ready;
|
||||||
bool gpu_udirect2b_ready;
|
bool gpu_udirect2b_ready;
|
||||||
|
bool gpu_umutual1_ready;
|
||||||
|
bool gpu_fphi_uind_ready;
|
||||||
bool gpu_umutual2b_ready;
|
bool gpu_umutual2b_ready;
|
||||||
bool gpu_polar_real_ready;
|
bool gpu_polar_real_ready;
|
||||||
|
|
||||||
|
bool first_induce_iteration;
|
||||||
|
|
||||||
void udirect2b_cpu();
|
void udirect2b_cpu();
|
||||||
|
|
||||||
template<class numtyp>
|
template<class numtyp>
|
||||||
|
|||||||
Reference in New Issue
Block a user