Attempted to remove some redundancy in data transfers in the amoeba kernels; keeping HIPPO independent of AMOEBA for now
This commit is contained in:
@ -62,9 +62,9 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||
cell_size,gpu_split,_screen,amoeba,
|
||||
"k_amoeba_multipole",
|
||||
"k_amoeba_udirect2b", "k_amoeba_umutual2b",
|
||||
"k_amoeba_polar", "k_amoeba_short_nbor");
|
||||
"k_amoeba_multipole", "k_amoeba_udirect2b",
|
||||
"k_amoeba_umutual2b", "k_amoeba_polar",
|
||||
"k_amoeba_short_nbor", "k_amoeba_special15");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -1637,12 +1637,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
else do nothing to IJ entry
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__kernel void k_special15(__global int * dev_nbor,
|
||||
__kernel void k_amoeba_special15(__global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global tagint *restrict tag,
|
||||
const __global int *restrict nspecial15,
|
||||
const __global tagint *restrict special15,
|
||||
const int inum, const int nall, const int nbor_pitch,
|
||||
const int inum, const int nall,
|
||||
const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
int tid, ii, offset, n_stride, i;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
@ -58,7 +58,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
const char *k_name_udirect2b,
|
||||
const char *k_name_umutual2b,
|
||||
const char *k_name_polar,
|
||||
const char *k_name_short_nbor) {
|
||||
const char *k_name_short_nbor,
|
||||
const char* k_name_special15) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
@ -91,7 +92,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
_block_size=device->pair_block_size();
|
||||
_block_bio_size=device->block_bio_pair();
|
||||
compile_kernels(*ucl_device,pair_program,k_name_multipole,
|
||||
k_name_udirect2b, k_name_umutual2b,k_name_polar,k_name_short_nbor);
|
||||
k_name_udirect2b, k_name_umutual2b,k_name_polar,
|
||||
k_name_short_nbor, k_name_special15);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
@ -399,24 +401,22 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
|
||||
if (!success)
|
||||
return nullptr;
|
||||
atom->cast_q_data(host_q);
|
||||
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
|
||||
//cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
|
||||
hd_balancer.start_timer();
|
||||
} else {
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
atom->cast_q_data(host_q);
|
||||
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
|
||||
//cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
atom->add_q_data();
|
||||
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
|
||||
atom->add_extra_data();
|
||||
|
||||
*ilist=nbor->host_ilist.begin();
|
||||
*jnum=nbor->host_acc.begin();
|
||||
|
||||
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
|
||||
boxlo, prd);
|
||||
|
||||
// re-allocate dev_short_nbor if necessary
|
||||
if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
@ -463,13 +463,8 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full,
|
||||
// reallocate per-atom arrays, transfer data from the host
|
||||
// and build the neighbor lists if needed
|
||||
// NOTE:
|
||||
// For now we invoke precompute() again here,
|
||||
// to be able to turn on/off the udirect2b kernel (which comes before this)
|
||||
// Once all the kernels are ready, precompute() is needed only once
|
||||
// in the first kernel in a time step.
|
||||
// We only need to cast uind and uinp from host to device here
|
||||
// if the neighbor lists are rebuilt and other per-atom arrays
|
||||
// (x, type, amtype, amgroup, rpole) are ready on the device.
|
||||
|
||||
int** firstneigh = nullptr;
|
||||
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
|
||||
@ -553,6 +548,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full,
|
||||
// and build the neighbor lists if needed
|
||||
|
||||
int** firstneigh = nullptr;
|
||||
/*
|
||||
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole,
|
||||
host_uind, host_uinp, nullptr, sublo, subhi, tag,
|
||||
@ -560,6 +556,9 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full,
|
||||
eflag_in, vflag_in, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time,
|
||||
success, host_q, boxlo, prd);
|
||||
*/
|
||||
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
|
||||
atom->add_extra_data();
|
||||
|
||||
// ------------------- Resize _fieldp array ------------------------
|
||||
|
||||
@ -627,6 +626,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full,
|
||||
// and build the neighbor lists if needed
|
||||
|
||||
int** firstneigh = nullptr;
|
||||
/*
|
||||
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole,
|
||||
host_uind, host_uinp, nullptr, sublo, subhi, tag,
|
||||
@ -634,6 +634,9 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full,
|
||||
eflag_in, vflag_in, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time,
|
||||
success, host_q, boxlo, prd);
|
||||
*/
|
||||
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
|
||||
atom->add_extra_data();
|
||||
|
||||
// ------------------- Resize _fieldp array ------------------------
|
||||
|
||||
@ -708,6 +711,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full,
|
||||
// (x, type, amtype, amgroup, rpole) are ready on the device.
|
||||
|
||||
int** firstneigh = nullptr;
|
||||
/*
|
||||
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole,
|
||||
host_uind, host_uinp, nullptr, sublo, subhi, tag,
|
||||
@ -715,6 +719,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full,
|
||||
eflag_in, vflag_in, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time,
|
||||
success, host_q, boxlo, prd);
|
||||
*/
|
||||
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
|
||||
atom->add_extra_data();
|
||||
|
||||
// ------------------- Resize _tep array ------------------------
|
||||
|
||||
@ -829,7 +836,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *kname_udirect2b,
|
||||
const char *kname_umutual2b,
|
||||
const char *kname_polar,
|
||||
const char *kname_short_nbor) {
|
||||
const char *kname_short_nbor,
|
||||
const char* kname_special15) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
||||
@ -843,7 +851,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
k_umutual2b.set_function(*pair_program,kname_umutual2b);
|
||||
k_polar.set_function(*pair_program,kname_polar);
|
||||
k_short_nbor.set_function(*pair_program,kname_short_nbor);
|
||||
k_special15.set_function(*pair_program,"k_special15");
|
||||
k_special15.set_function(*pair_program,kname_special15);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
q_tex.get_texture(*pair_program,"q_tex");
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ class BaseAmoeba {
|
||||
const double gpu_split, FILE *screen, const void *pair_program,
|
||||
const char *kname_multipole,
|
||||
const char *kname_udirect2b, const char *kname_umutual2b,
|
||||
const char *kname_polar, const char *kname_short_nbor);
|
||||
const char *kname_polar, const char *kname_short_nbor, const char* kname_special15);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead(const int add_kernels=0);
|
||||
@ -279,9 +279,9 @@ class BaseAmoeba {
|
||||
numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||
const char *kname_multipole,
|
||||
const char *kname_udirect2b, const char *kname_umutual2b,
|
||||
const char *kname_polar, const char *kname_short_nbor);
|
||||
const char *kname_multipole, const char *kname_udirect2b,
|
||||
const char *kname_umutual2b, const char *kname_polar,
|
||||
const char *kname_short_nbor, const char* kname_special15);
|
||||
|
||||
virtual int multipole_real(const int eflag, const int vflag) = 0;
|
||||
virtual int udirect2b(const int eflag, const int vflag) = 0;
|
||||
|
||||
@ -67,9 +67,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||
cell_size,gpu_split,_screen,hippo,
|
||||
"k_hippo_multipole",
|
||||
"k_hippo_udirect2b", "k_hippo_umutual2b",
|
||||
"k_hippo_polar", "k_hippo_short_nbor");
|
||||
"k_hippo_multipole", "k_hippo_udirect2b",
|
||||
"k_hippo_umutual2b", "k_hippo_polar",
|
||||
"k_hippo_short_nbor", "k_hippo_special15");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -2135,7 +2135,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
|
||||
else do nothing to IJ entry
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__kernel void k_special15(__global int * dev_nbor,
|
||||
__kernel void k_hippo_special15(__global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global tagint *restrict tag,
|
||||
const __global int *restrict nspecial15,
|
||||
|
||||
@ -7,12 +7,12 @@ SHELL = /bin/sh
|
||||
# specify flags and libraries needed for your compiler
|
||||
|
||||
CC = mpicxx
|
||||
CCFLAGS = -g -O3
|
||||
CCFLAGS = -g -O3 -fopenmp
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
LINK = mpicxx
|
||||
LINKFLAGS = -g -O3
|
||||
LINKFLAGS = -g -O3 -fopenmp
|
||||
LIB =
|
||||
SIZE = size
|
||||
|
||||
|
||||
Reference in New Issue
Block a user