Hybrid working, but not fully tested.
This commit is contained in:
@ -118,7 +118,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared
|
||||
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
|
||||
$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
|
||||
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H)
|
||||
$(CUDR) -o $@ -c pair_gpu_device.cpp
|
||||
|
||||
$(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
|
||||
|
||||
@ -68,7 +68,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared
|
||||
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
|
||||
$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
|
||||
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H)
|
||||
$(OCL) -o $@ -c pair_gpu_device.cpp
|
||||
|
||||
$(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
|
||||
|
||||
@ -23,11 +23,13 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
|
||||
template <class numtyp, class acctyp>
|
||||
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
|
||||
device=&pair_gpu_device;
|
||||
ans=new PairGPUAns<numtyp,acctyp>();
|
||||
nbor=new PairGPUNbor();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
AtomicGPUMemoryT::~AtomicGPUMemory() {
|
||||
delete ans;
|
||||
delete nbor;
|
||||
}
|
||||
|
||||
@ -54,7 +56,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
|
||||
if (host_nlocal>0)
|
||||
_gpu_host=1;
|
||||
|
||||
if (!device->init(false,false,nlocal,host_nlocal,nall,nbor,maxspecial,
|
||||
if (!device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,maxspecial,
|
||||
gpu_nbor,_gpu_host,max_nbors,cell_size,false))
|
||||
return false;
|
||||
ucl_device=device->gpu;
|
||||
@ -74,7 +76,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
|
||||
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
|
||||
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -84,7 +86,7 @@ void AtomicGPUMemoryT::clear_atomic() {
|
||||
// Output any timing information
|
||||
acc_timers();
|
||||
double avg_split=hd_balancer.all_avg_split();
|
||||
device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes,
|
||||
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
|
||||
screen);
|
||||
|
||||
if (_compiled) {
|
||||
@ -118,7 +120,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
|
||||
|
||||
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
|
||||
|
||||
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
|
||||
@ -149,7 +151,7 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
|
||||
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
|
||||
nspecial, special, success, mn);
|
||||
|
||||
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
}
|
||||
@ -173,7 +175,7 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time,nbor->gpu_nbor());
|
||||
atom->inum(inum);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
@ -187,7 +189,8 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
|
||||
atom->add_x_data(host_x,host_type);
|
||||
|
||||
loop(eflag,vflag);
|
||||
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
@ -210,7 +213,7 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
|
||||
|
||||
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
|
||||
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||
atom->inum(inum);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
@ -227,7 +230,8 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
|
||||
}
|
||||
|
||||
loop(eflag,vflag);
|
||||
atom->copy_answers(eflag,vflag,eatom,vatom);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
return nbor->host_nbor.begin();
|
||||
|
||||
@ -48,8 +48,9 @@ class AtomicGPUMemory {
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
if (atom->resize(inum, nall, success))
|
||||
if (atom->resize(nall, success))
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
ans->resize(inum,success);
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
@ -92,6 +93,7 @@ class AtomicGPUMemory {
|
||||
}
|
||||
time_pair.add_to_total();
|
||||
atom->acc_timers();
|
||||
ans->acc_timers();
|
||||
}
|
||||
|
||||
/// Zero timers
|
||||
@ -99,6 +101,7 @@ class AtomicGPUMemory {
|
||||
nbor_time_avail=false;
|
||||
time_pair.zero();
|
||||
atom->zero_timers();
|
||||
ans->zero_timers();
|
||||
}
|
||||
|
||||
/// Copy neighbor list from host
|
||||
@ -148,6 +151,9 @@ class AtomicGPUMemory {
|
||||
/// Atom Data
|
||||
PairGPUAtom<numtyp,acctyp> *atom;
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
|
||||
PairGPUAns<numtyp,acctyp> *ans;
|
||||
|
||||
// --------------------------- NBOR DATA ----------------------------
|
||||
|
||||
|
||||
@ -23,11 +23,13 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
|
||||
template <class numtyp, class acctyp>
|
||||
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
|
||||
device=&pair_gpu_device;
|
||||
ans=new PairGPUAns<numtyp,acctyp>();
|
||||
nbor=new PairGPUNbor();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
ChargeGPUMemoryT::~ChargeGPUMemory() {
|
||||
delete ans;
|
||||
delete nbor;
|
||||
}
|
||||
|
||||
@ -54,8 +56,8 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
|
||||
if (host_nlocal>0)
|
||||
_gpu_host=1;
|
||||
|
||||
if (!device->init(true,false,nlocal,host_nlocal,nall,nbor,maxspecial,gpu_nbor,
|
||||
_gpu_host,max_nbors,cell_size,false))
|
||||
if (!device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,maxspecial,
|
||||
gpu_nbor,_gpu_host,max_nbors,cell_size,false))
|
||||
return false;
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
@ -75,7 +77,7 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
q_tex.bind_float(atom->dev_q,1);
|
||||
|
||||
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
|
||||
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -85,7 +87,7 @@ void ChargeGPUMemoryT::clear_atomic() {
|
||||
// Output any timing information
|
||||
acc_timers();
|
||||
double avg_split=hd_balancer.all_avg_split();
|
||||
device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes,
|
||||
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
|
||||
screen);
|
||||
|
||||
if (_compiled) {
|
||||
@ -119,7 +121,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
|
||||
|
||||
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
|
||||
|
||||
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
|
||||
@ -150,7 +152,7 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
|
||||
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
|
||||
nspecial, special, success, mn);
|
||||
|
||||
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
}
|
||||
@ -175,7 +177,7 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time,
|
||||
nbor->gpu_nbor());
|
||||
atom->inum(inum);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
@ -188,10 +190,11 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
|
||||
atom->cast_q_data(host_q);
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
atom->add_other_data();
|
||||
atom->add_q_data();
|
||||
|
||||
loop(eflag,vflag);
|
||||
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
}
|
||||
|
||||
@ -215,7 +218,7 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full,
|
||||
|
||||
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
|
||||
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||
atom->inum(inum);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
@ -232,10 +235,11 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full,
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
atom->add_other_data();
|
||||
atom->add_q_data();
|
||||
|
||||
loop(eflag,vflag);
|
||||
atom->copy_answers(eflag,vflag,eatom,vatom);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
return nbor->host_nbor.begin();
|
||||
|
||||
@ -48,10 +48,11 @@ class ChargeGPUMemory {
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
if (atom->resize(inum, nall, success)) {
|
||||
if (atom->resize(nall, success)) {
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
q_tex.bind_float(atom->dev_q,1);
|
||||
}
|
||||
ans->resize(inum,success);
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
@ -94,6 +95,7 @@ class ChargeGPUMemory {
|
||||
}
|
||||
time_pair.add_to_total();
|
||||
atom->acc_timers();
|
||||
ans->acc_timers();
|
||||
}
|
||||
|
||||
/// Zero timers
|
||||
@ -101,6 +103,7 @@ class ChargeGPUMemory {
|
||||
nbor_time_avail=false;
|
||||
time_pair.zero();
|
||||
atom->zero_timers();
|
||||
ans->zero_timers();
|
||||
}
|
||||
|
||||
/// Copy neighbor list from host
|
||||
@ -151,6 +154,10 @@ class ChargeGPUMemory {
|
||||
PairGPUAtom<numtyp,acctyp> *atom;
|
||||
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
|
||||
PairGPUAns<numtyp,acctyp> *ans;
|
||||
|
||||
// --------------------------- NBOR DATA ----------------------------
|
||||
|
||||
/// Neighbor data
|
||||
|
||||
@ -122,9 +122,9 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->atom->inum();
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
@ -133,15 +133,15 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -133,9 +133,9 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->atom->inum();
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
@ -144,8 +144,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch,
|
||||
&this->atom->dev_q.begin(), &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald);
|
||||
@ -153,8 +153,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald);
|
||||
}
|
||||
|
||||
@ -133,9 +133,9 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->atom->inum();
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
@ -144,8 +144,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch,
|
||||
&this->atom->dev_q.begin(), &_cut_coulsq,
|
||||
&_qqrd2e, &_smooth);
|
||||
@ -153,8 +153,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&_cut_coulsq, &_qqrd2e, &_smooth);
|
||||
}
|
||||
|
||||
@ -137,9 +137,9 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->atom->inum();
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
@ -147,8 +147,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
|
||||
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch,
|
||||
&this->atom->dev_q.begin(), &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
|
||||
@ -157,8 +157,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
|
||||
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
|
||||
|
||||
@ -216,9 +216,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/BX));
|
||||
int stride=gbm.nbor->nbor_pitch();
|
||||
int ainum=gbm.atom->inum();
|
||||
int ainum=gbm.ans->inum();
|
||||
int anall=gbm.atom->nall();
|
||||
|
||||
if (gbm.multiple_forms) {
|
||||
@ -237,11 +237,11 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
|
||||
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
|
||||
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
|
||||
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
|
||||
&stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
|
||||
&stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
|
||||
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
|
||||
gbm.time_gayberne.stop();
|
||||
|
||||
if (gbm.last_ellipse==gbm.atom->inum()) {
|
||||
if (gbm.last_ellipse==gbm.ans->inum()) {
|
||||
gbm.time_kernel2.start();
|
||||
gbm.time_kernel2.stop();
|
||||
gbm.time_gayberne2.start();
|
||||
@ -254,9 +254,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
|
||||
// ------------ SPHERE_ELLIPSE ---------------
|
||||
|
||||
gbm.time_kernel2.start();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
|
||||
GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
|
||||
gbm.last_ellipse)/BX));
|
||||
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
|
||||
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
|
||||
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
|
||||
gbm.time_kernel2.stop();
|
||||
|
||||
@ -266,13 +266,13 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
|
||||
&gbm.shape.begin(), &gbm.well.begin(),
|
||||
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
|
||||
&gbm._lj_types, &gbm.lshape.begin(),
|
||||
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
|
||||
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
|
||||
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
|
||||
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
|
||||
&vflag, &gbm.last_ellipse, &ainum, &anall);
|
||||
gbm.time_gayberne2.stop();
|
||||
} else {
|
||||
gbm.atom->dev_ans.zero();
|
||||
gbm.atom->dev_engv.zero();
|
||||
gbm.ans->dev_ans.zero();
|
||||
gbm.ans->dev_engv.zero();
|
||||
gbm.time_kernel.stop();
|
||||
gbm.time_gayberne.start();
|
||||
gbm.time_gayberne.stop();
|
||||
@ -284,29 +284,29 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
|
||||
|
||||
// ------------ LJ ---------------
|
||||
gbm.time_pair.start();
|
||||
if (gbm.last_ellipse<gbm.atom->inum()) {
|
||||
if (gbm.last_ellipse<gbm.ans->inum()) {
|
||||
if (gbm.shared_types) {
|
||||
GBMF.k_lj_fast.set_size(GX,BX);
|
||||
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
|
||||
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
|
||||
&stride, &gbm.nbor->dev_packed.begin(),
|
||||
&gbm.atom->dev_ans.begin(),
|
||||
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
|
||||
&gbm.ans->dev_ans.begin(),
|
||||
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
|
||||
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
|
||||
} else {
|
||||
GBMF.k_lj.set_size(GX,BX);
|
||||
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
|
||||
&gbm.lj3.begin(), &gbm._lj_types,
|
||||
&gbm.gamma_upsilon_mu.begin(), &stride,
|
||||
&gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
|
||||
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
|
||||
&gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
|
||||
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
|
||||
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
|
||||
}
|
||||
}
|
||||
gbm.time_pair.stop();
|
||||
} else {
|
||||
gbm.time_kernel.start();
|
||||
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
|
||||
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
|
||||
ELLIPSE_ELLIPSE);
|
||||
gbm.time_kernel.stop();
|
||||
gbm.time_gayberne.start();
|
||||
@ -315,8 +315,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
|
||||
&gbm.shape.begin(), &gbm.well.begin(),
|
||||
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
|
||||
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
|
||||
&stride, &gbm.atom->dev_ans.begin(), &ainum,
|
||||
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
|
||||
&stride, &gbm.ans->dev_ans.begin(), &ainum,
|
||||
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
|
||||
&eflag, &vflag, &ainum, &anall);
|
||||
gbm.time_gayberne.stop();
|
||||
}
|
||||
@ -342,7 +342,7 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
|
||||
|
||||
gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
|
||||
int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
|
||||
gbm.atom->inum(inum);
|
||||
gbm.ans->inum(inum);
|
||||
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
|
||||
host_start=inum;
|
||||
|
||||
@ -361,10 +361,11 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
|
||||
gbm.atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
|
||||
gbm.atom->add_other_data();
|
||||
gbm.atom->add_quat_data();
|
||||
|
||||
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
|
||||
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
|
||||
gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
gbm.device->add_ans_object(gbm.ans);
|
||||
gbm.hd_balancer.stop_timer();
|
||||
return gbm.nbor->host_nbor.begin();
|
||||
}
|
||||
@ -399,7 +400,7 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
|
||||
|
||||
int ago=gbm.hd_balancer.ago_first(f_ago);
|
||||
int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time,gbm.nbor->gpu_nbor());
|
||||
gbm.atom->inum(inum);
|
||||
gbm.ans->inum(inum);
|
||||
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
|
||||
host_start=inum;
|
||||
|
||||
@ -419,10 +420,11 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
|
||||
gbm.atom->cast_quat_data(host_quat[0]);
|
||||
gbm.hd_balancer.start_timer();
|
||||
gbm.atom->add_x_data(host_x,host_type);
|
||||
gbm.atom->add_other_data();
|
||||
gbm.atom->add_quat_data();
|
||||
|
||||
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
|
||||
gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
|
||||
gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
|
||||
gbm.device->add_ans_object(gbm.ans);
|
||||
gbm.hd_balancer.stop_timer();
|
||||
return list;
|
||||
}
|
||||
|
||||
@ -32,12 +32,14 @@ template <class numtyp, class acctyp>
|
||||
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
|
||||
_max_bytes(0.0) {
|
||||
device=&pair_gpu_device;
|
||||
ans=new PairGPUAns<numtyp,acctyp>();
|
||||
nbor=new PairGPUNbor;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
GB_GPU_MemoryT::~GB_GPU_Memory() {
|
||||
clear();
|
||||
delete ans;
|
||||
delete nbor;
|
||||
}
|
||||
|
||||
@ -70,7 +72,7 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
|
||||
if (host_nlocal>0)
|
||||
_gpu_host=1;
|
||||
|
||||
if (!device->init(false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor,
|
||||
if (!device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor,
|
||||
_gpu_host,max_nbors,cell_size,true))
|
||||
return false;
|
||||
ucl_device=device->gpu;
|
||||
@ -187,9 +189,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
|
||||
}
|
||||
|
||||
if (multiple_forms)
|
||||
atom->dev_ans.zero();
|
||||
ans->dev_ans.zero();
|
||||
|
||||
_max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
|
||||
_max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
|
||||
// Memory for ilist ordered by particle type
|
||||
return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
|
||||
@ -212,7 +214,7 @@ void GB_GPU_MemoryT::clear() {
|
||||
acc_timers();
|
||||
double single[6], times[6];
|
||||
|
||||
single[0]=atom->transfer_time();
|
||||
single[0]=atom->transfer_time()+ans->transfer_time();
|
||||
single[1]=nbor->time_nbor.total_seconds();
|
||||
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
|
||||
nbor->time_kernel.total_seconds();
|
||||
@ -221,7 +223,7 @@ void GB_GPU_MemoryT::clear() {
|
||||
single[4]=time_pair.total_seconds();
|
||||
else
|
||||
single[4]=0;
|
||||
single[5]=atom->cast_time();
|
||||
single[5]=atom->cast_time()+ans->cast_time();
|
||||
|
||||
MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
|
||||
double avg_split=hd_balancer.all_avg_split();
|
||||
@ -229,7 +231,7 @@ void GB_GPU_MemoryT::clear() {
|
||||
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
|
||||
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
|
||||
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
|
||||
gamma_upsilon_mu.row_bytes();
|
||||
gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
|
||||
double mpi_max_bytes;
|
||||
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
|
||||
device->replica());
|
||||
|
||||
@ -49,9 +49,10 @@ class GB_GPU_Memory {
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
atom->resize(inum, nall, success);
|
||||
if (multiple_forms) atom->dev_ans.zero();
|
||||
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
|
||||
atom->resize(nall, success);
|
||||
ans->resize(inum, success);
|
||||
if (multiple_forms) ans->dev_ans.zero();
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_bytes)
|
||||
_max_bytes=bytes;
|
||||
}
|
||||
@ -74,7 +75,7 @@ class GB_GPU_Memory {
|
||||
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
|
||||
}
|
||||
nbor->resize(nlocal,host_inum,max_nbors,success);
|
||||
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_bytes)
|
||||
_max_bytes=bytes;
|
||||
}
|
||||
@ -104,6 +105,7 @@ class GB_GPU_Memory {
|
||||
time_pair.add_to_total();
|
||||
}
|
||||
atom->acc_timers();
|
||||
ans->acc_timers();
|
||||
}
|
||||
|
||||
/// Accumulate timers
|
||||
@ -117,6 +119,7 @@ class GB_GPU_Memory {
|
||||
time_pair.zero();
|
||||
}
|
||||
atom->zero_timers();
|
||||
ans->zero_timers();
|
||||
}
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
@ -168,6 +171,10 @@ class GB_GPU_Memory {
|
||||
|
||||
int last_ellipse, max_last_ellipse;
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
|
||||
PairGPUAns<numtyp,acctyp> *ans;
|
||||
|
||||
// --------------------------- NBOR DATA ----------------------------
|
||||
|
||||
/// Neighbor data
|
||||
|
||||
@ -25,6 +25,7 @@
|
||||
#define NVD_TIMER_H
|
||||
|
||||
#include "nvd_macros.h"
|
||||
#include "nvd_device.h"
|
||||
|
||||
namespace ucl_cudadr {
|
||||
|
||||
@ -66,12 +67,23 @@ class UCL_Timer {
|
||||
/// Stop timing on command queue
|
||||
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
|
||||
|
||||
/// Block until the start event has been reached on device
|
||||
inline void sync_start()
|
||||
{ CU_SAFE_CALL(cuEventSynchronize(start_event)); }
|
||||
|
||||
/// Block until the stop event has been reached on device
|
||||
inline void sync_stop()
|
||||
{ CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
|
||||
|
||||
/// Set the time elapsed to zero (not the total_time)
|
||||
inline void zero() {
|
||||
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
|
||||
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
|
||||
}
|
||||
|
||||
/// Set the total time to zero
|
||||
inline void zero_total() { _total_time=0.0; }
|
||||
|
||||
/// Add time from previous start and stop to total
|
||||
/** Forces synchronization **/
|
||||
inline double add_to_total()
|
||||
|
||||
@ -25,6 +25,7 @@
|
||||
#define OCL_TIMER_H
|
||||
|
||||
#include "ocl_macros.h"
|
||||
#include "ocl_device.h"
|
||||
|
||||
namespace ucl_opencl {
|
||||
|
||||
@ -67,10 +68,21 @@ class UCL_Timer {
|
||||
/// Stop timing on default command queue
|
||||
inline void stop() { clEnqueueMarker(_cq,&stop_event); }
|
||||
|
||||
/// Block until the start event has been reached on device
|
||||
inline void sync_start()
|
||||
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
|
||||
|
||||
/// Block until the stop event has been reached on device
|
||||
inline void sync_stop()
|
||||
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
|
||||
|
||||
/// Set the time elapsed to zero (not the total_time)
|
||||
inline void zero()
|
||||
{ clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
|
||||
|
||||
/// Set the total time to zero
|
||||
inline void zero_total() { _total_time=0.0; }
|
||||
|
||||
/// Add time from previous start and stop to total
|
||||
/** Forces synchronization **/
|
||||
inline double add_to_total()
|
||||
|
||||
@ -122,9 +122,9 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->atom->inum();
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
@ -133,15 +133,15 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -122,9 +122,9 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->atom->inum();
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
@ -133,15 +133,15 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -134,9 +134,9 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->atom->inum();
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
@ -145,8 +145,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch,
|
||||
&this->atom->dev_q.begin(), &cutsq.begin(),
|
||||
&_qqrd2e);
|
||||
@ -154,8 +154,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&cutsq.begin(), &_qqrd2e);
|
||||
}
|
||||
|
||||
@ -132,9 +132,9 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->atom->inum();
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
@ -143,8 +143,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch,
|
||||
&this->atom->dev_q.begin(), &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald);
|
||||
@ -152,8 +152,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->atom->dev_ans.begin(),
|
||||
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald);
|
||||
}
|
||||
|
||||
@ -138,11 +138,10 @@ void PairGPUAnsT::clear() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
|
||||
time_pos.clear();
|
||||
time_other.clear();
|
||||
time_answer.clear();
|
||||
clear_resize();
|
||||
_inum=0;
|
||||
_ilist=NULL;
|
||||
_eflag=false;
|
||||
_vflag=false;
|
||||
}
|
||||
@ -181,7 +180,6 @@ void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
|
||||
else
|
||||
ucl_copy(host_ans,dev_ans,_inum*4,true);
|
||||
time_answer.stop();
|
||||
dev->add_answer_object(this);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -199,7 +197,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
|
||||
return 0.0;
|
||||
|
||||
double evdwl=0.0;
|
||||
if (_gpu_nbor) {
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
if (_eflag) {
|
||||
@ -279,7 +277,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
|
||||
|
||||
double evdwl=0.0;
|
||||
double _ecoul=0.0;
|
||||
if (_gpu_nbor) {
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
if (_eflag) {
|
||||
@ -359,11 +357,8 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAnsT::get_answers(double **f, double **tor) {
|
||||
_x_avail=false;
|
||||
_q_avail=false;
|
||||
_quat_avail=false;
|
||||
acctyp *ap=host_ans.begin();
|
||||
if (_gpu_nbor) {
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
f[i][0]+=*ap;
|
||||
ap++;
|
||||
|
||||
@ -58,14 +58,12 @@ class PairGPUAns {
|
||||
bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
|
||||
|
||||
/// Check if we have enough device storage and realloc if not
|
||||
inline bool resize(const int inum, bool &success) {
|
||||
inline void resize(const int inum, bool &success) {
|
||||
_inum=inum;
|
||||
if (inum>_max_local) {
|
||||
clear_resize();
|
||||
success = success && alloc(inum);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// If already initialized by another LAMMPS style, add fields as necessary
|
||||
@ -123,6 +121,16 @@ class PairGPUAns {
|
||||
/// Add forces and torques from the GPU into a LAMMPS pointer
|
||||
void get_answers(double **f, double **tor);
|
||||
|
||||
inline double get_answers(double **f, double **tor, double *eatom,
|
||||
double **vatom, double *virial, double &ecoul) {
|
||||
time_answer.sync_stop();
|
||||
double ts=MPI_Wtime();
|
||||
double evdw=energy_virial(eatom,vatom,virial,ecoul);
|
||||
get_answers(f,tor);
|
||||
_time_cast+=MPI_Wtime()-ts;
|
||||
return evdw;
|
||||
}
|
||||
|
||||
// ------------------------------ DATA ----------------------------------
|
||||
|
||||
/// Force and possibly torque
|
||||
|
||||
@ -29,7 +29,8 @@ __win_sort _win_sort;
|
||||
#endif
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false) {
|
||||
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
|
||||
_max_gpu_bytes(0) {
|
||||
#ifndef USE_OPENCL
|
||||
sort_config.op = CUDPP_ADD;
|
||||
sort_config.datatype = CUDPP_UINT;
|
||||
@ -108,7 +109,7 @@ bool PairGPUAtomT::alloc(const int nall) {
|
||||
|
||||
|
||||
// --------------------------- Device allocations
|
||||
_gpu_bytes=0;
|
||||
int gpu_bytes=0;
|
||||
if (cpuview) {
|
||||
#ifdef GPU_CAST
|
||||
assert(0==1);
|
||||
@ -126,7 +127,7 @@ bool PairGPUAtomT::alloc(const int nall) {
|
||||
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
|
||||
success=success && (UCL_SUCCESS==
|
||||
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
|
||||
_gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
|
||||
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
|
||||
#else
|
||||
success=success && (UCL_SUCCESS==
|
||||
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
|
||||
@ -134,25 +135,27 @@ bool PairGPUAtomT::alloc(const int nall) {
|
||||
if (_charge) {
|
||||
success=success && (dev_q.alloc(_max_atoms,*dev,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_q.row_bytes();
|
||||
gpu_bytes+=dev_q.row_bytes();
|
||||
}
|
||||
if (_rot) {
|
||||
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_quat.row_bytes();
|
||||
gpu_bytes+=dev_quat.row_bytes();
|
||||
}
|
||||
}
|
||||
if (_gpu_nbor) {
|
||||
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
|
||||
gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
|
||||
if (_bonds) {
|
||||
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_tag.row_bytes();
|
||||
gpu_bytes+=dev_tag.row_bytes();
|
||||
}
|
||||
}
|
||||
|
||||
_gpu_bytes+=dev_x.row_bytes();
|
||||
gpu_bytes+=dev_x.row_bytes();
|
||||
if (gpu_bytes>_max_gpu_bytes)
|
||||
_max_gpu_bytes=gpu_bytes;
|
||||
|
||||
_allocated=true;
|
||||
return success;
|
||||
@ -168,6 +171,7 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
|
||||
_x_avail=false;
|
||||
_q_avail=false;
|
||||
_quat_avail=false;
|
||||
_resized=false;
|
||||
_gpu_nbor=gpu_nbor;
|
||||
_bonds=bonds;
|
||||
_charge=charge;
|
||||
@ -182,9 +186,11 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
|
||||
|
||||
// Initialize timers for the selected device
|
||||
time_pos.init(*dev);
|
||||
time_other.init(*dev);
|
||||
time_q.init(*dev);
|
||||
time_quat.init(*dev);
|
||||
time_pos.zero();
|
||||
time_other.zero();
|
||||
time_q.zero();
|
||||
time_quat.zero();
|
||||
_time_cast=0.0;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
@ -252,12 +258,13 @@ void PairGPUAtomT::clear_resize() {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::clear() {
|
||||
_gpu_bytes=0;
|
||||
_max_gpu_bytes=0;
|
||||
if (!_allocated)
|
||||
return;
|
||||
|
||||
time_pos.clear();
|
||||
time_other.clear();
|
||||
time_q.clear();
|
||||
time_quat.clear();
|
||||
clear_resize();
|
||||
|
||||
#ifdef GPU_CAST
|
||||
|
||||
@ -64,14 +64,15 @@ class PairGPUAtom {
|
||||
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
|
||||
|
||||
/// Check if we have enough device storage and realloc if not
|
||||
/** Returns true if resized with any call during this timestep **/
|
||||
inline bool resize(const int nall, bool &success) {
|
||||
_nall=nall;
|
||||
if (nall>_max_atoms) {
|
||||
clear_resize();
|
||||
success = success && alloc(nall);
|
||||
return true;
|
||||
_resized=true;
|
||||
}
|
||||
return false;
|
||||
return _resized;
|
||||
}
|
||||
|
||||
/// If already initialized by another LAMMPS style, add fields as necessary
|
||||
@ -94,26 +95,42 @@ class PairGPUAtom {
|
||||
/// Add copy times to timers
|
||||
inline void acc_timers() {
|
||||
time_pos.add_to_total();
|
||||
if (_other)
|
||||
time_other.add_to_total();
|
||||
if (_charge)
|
||||
time_q.add_to_total();
|
||||
if (_rot)
|
||||
time_quat.add_to_total();
|
||||
}
|
||||
|
||||
/// Add copy times to timers
|
||||
inline void zero_timers() {
|
||||
time_pos.zero();
|
||||
if (_other)
|
||||
time_other.zero();
|
||||
if (_charge)
|
||||
time_q.zero();
|
||||
if (_rot)
|
||||
time_quat.zero();
|
||||
}
|
||||
|
||||
/// Return the total time for host/device data transfer
|
||||
/** Zeros the total so that the atom times are only included once **/
|
||||
inline double transfer_time() {
|
||||
double total=time_pos.total_seconds();
|
||||
if (_other) total+=time_other.total_seconds();
|
||||
time_pos.zero_total();
|
||||
if (_charge) {
|
||||
total+=time_q.total_seconds();
|
||||
time_q.zero_total();
|
||||
}
|
||||
if (_rot) {
|
||||
total+=time_q.total_seconds();
|
||||
time_quat.zero_total();
|
||||
}
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
/// Return the total time for data cast/pack
|
||||
inline double cast_time() { return _time_cast; }
|
||||
/** Zeros the time so that atom times are only included once **/
|
||||
inline double cast_time()
|
||||
{ double t=_time_cast; _time_cast=0.0; return t; }
|
||||
|
||||
/// Pack LAMMPS atom type constants into matrix and copy to device
|
||||
template <class dev_typ, class t1>
|
||||
@ -210,7 +227,7 @@ class PairGPUAtom {
|
||||
|
||||
/// Signal that we need to transfer atom data for next timestep
|
||||
inline void data_unavail()
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; }
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
|
||||
|
||||
/// Cast positions and types to write buffer
|
||||
inline void cast_x_data(double **host_ptr, const int *host_type) {
|
||||
@ -322,20 +339,9 @@ class PairGPUAtom {
|
||||
}
|
||||
}
|
||||
|
||||
/// Copy data other than pos and type to device
|
||||
inline void add_other_data() {
|
||||
if (_other) {
|
||||
time_other.start();
|
||||
if (_charge)
|
||||
add_q_data();
|
||||
if (_rot)
|
||||
add_quat_data();
|
||||
time_other.stop();
|
||||
}
|
||||
}
|
||||
|
||||
/// Return number of bytes used on device
|
||||
inline double gpu_bytes() { return _gpu_bytes; }
|
||||
inline double max_gpu_bytes()
|
||||
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
|
||||
|
||||
// ------------------------------ DATA ----------------------------------
|
||||
|
||||
@ -368,7 +374,7 @@ class PairGPUAtom {
|
||||
UCL_D_Vec<int> dev_tag;
|
||||
|
||||
/// Device timers
|
||||
UCL_Timer time_pos, time_other;
|
||||
UCL_Timer time_pos, time_q, time_quat;
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *dev;
|
||||
@ -383,7 +389,7 @@ class PairGPUAtom {
|
||||
bool _compiled;
|
||||
|
||||
// True if data has been copied to device already
|
||||
bool _x_avail, _q_avail, _quat_avail;
|
||||
bool _x_avail, _q_avail, _quat_avail, _resized;
|
||||
|
||||
bool alloc(const int nall);
|
||||
|
||||
@ -392,7 +398,7 @@ class PairGPUAtom {
|
||||
bool _gpu_nbor, _bonds;
|
||||
double _time_cast;
|
||||
|
||||
double _gpu_bytes;
|
||||
double _max_gpu_bytes;
|
||||
|
||||
#ifndef USE_OPENCL
|
||||
CUDPPConfiguration sort_config;
|
||||
|
||||
@ -117,10 +117,11 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
|
||||
bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
|
||||
const bool rot, const int nlocal,
|
||||
const int host_nlocal, const int nall,
|
||||
PairGPUNbor *nbor, const int maxspecial,
|
||||
const bool gpu_nbor, const int gpu_host,
|
||||
PairGPUNbor *nbor, const int maxspecial,
|
||||
const bool gpu_nbor, const int gpu_host,
|
||||
const int max_nbors, const double cell_size,
|
||||
const bool pre_cut) {
|
||||
if (!_device_init)
|
||||
@ -133,12 +134,14 @@ bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
|
||||
|
||||
if (_init_count==0) {
|
||||
// Initialize atom and nbor data
|
||||
if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
|
||||
gpu_nbor && maxspecial>0))
|
||||
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
|
||||
return false;
|
||||
} else
|
||||
atom.add_fields(charge,rot);
|
||||
|
||||
if (!ans.init(ef_nlocal,charge,rot,*gpu))
|
||||
return false;
|
||||
|
||||
if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
|
||||
*gpu,gpu_nbor,gpu_host,pre_cut))
|
||||
return false;
|
||||
@ -189,20 +192,21 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUDeviceT::output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
|
||||
const double avg_split,
|
||||
void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
|
||||
PairGPUAns<numtyp,acctyp> &ans,
|
||||
PairGPUNbor &nbor, const double avg_split,
|
||||
const double max_bytes, FILE *screen) {
|
||||
double single[5], times[5];
|
||||
|
||||
single[0]=atom.transfer_time();
|
||||
single[0]=atom.transfer_time()+ans.transfer_time();
|
||||
single[1]=nbor.time_nbor.total_seconds();
|
||||
single[2]=nbor.time_kernel.total_seconds();
|
||||
single[3]=time_pair.total_seconds();
|
||||
single[4]=atom.cast_time();
|
||||
single[4]=atom.cast_time()+ans.cast_time();
|
||||
|
||||
MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
|
||||
|
||||
double my_max_bytes=max_bytes;
|
||||
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
|
||||
double mpi_max_bytes;
|
||||
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
|
||||
double max_mb=mpi_max_bytes/(1024.0*1024.0);
|
||||
@ -276,15 +280,5 @@ void lmp_clear_device() {
|
||||
|
||||
double lmp_gpu_forces(double **f, double **tor, double *eatom,
|
||||
double **vatom, double *virial, double &ecoul) {
|
||||
if (pair_gpu_device.init_count()) {
|
||||
pair_gpu_device.stop_host_timer();
|
||||
pair_gpu_device.gpu->sync();
|
||||
double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
|
||||
pair_gpu_device.atom.get_answers(f,tor);
|
||||
pair_gpu_device.atom.data_unavail();
|
||||
|
||||
return evdw;
|
||||
}
|
||||
return 0.0;
|
||||
return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
|
||||
}
|
||||
|
||||
|
||||
@ -53,18 +53,19 @@ class PairGPUDevice {
|
||||
* \param cell_size cutoff+skin
|
||||
* \param pre_cut True if cutoff test will be performed in separate kernel
|
||||
* than the force kernel **/
|
||||
bool init(const bool charge, const bool rot, const int nlocal,
|
||||
const int host_nlocal, const int nall, PairGPUNbor *nbor,
|
||||
const int maxspecial, const bool gpu_nbor, const int gpu_host,
|
||||
const int max_nbors, const double cell_size, const bool pre_cut);
|
||||
bool init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
|
||||
const int nlocal, const int host_nlocal, const int nall,
|
||||
PairGPUNbor *nbor, const int maxspecial, const bool gpu_nbor,
|
||||
const int gpu_host, const int max_nbors, const double cell_size,
|
||||
const bool pre_cut);
|
||||
|
||||
/// Output a message for pair_style acceleration with device stats
|
||||
void init_message(FILE *screen, const char *name,
|
||||
const int first_gpu, const int last_gpu);
|
||||
|
||||
/// Output a message with timing information
|
||||
void output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
|
||||
const double avg_split,
|
||||
void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
|
||||
PairGPUNbor &nbor, const double avg_split,
|
||||
const double max_bytes, FILE *screen);
|
||||
|
||||
/// Clear all memory on host and device associated with atom and nbor data
|
||||
@ -74,7 +75,24 @@ class PairGPUDevice {
|
||||
void clear_device();
|
||||
|
||||
/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
|
||||
inline void add_ans_object(PairGPUAns *ans) { ans_queue.push(ans); }
|
||||
inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
|
||||
{ ans_queue.push(ans); }
|
||||
|
||||
/// Add "answers" (force,energies,etc.) into LAMMPS structures
|
||||
inline double fix_gpu(double **f, double **tor, double *eatom,
|
||||
double **vatom, double *virial, double &ecoul) {
|
||||
atom.data_unavail();
|
||||
if (ans_queue.empty()==false) {
|
||||
stop_host_timer();
|
||||
double evdw=0.0;
|
||||
while (ans_queue.empty()==false) {
|
||||
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
|
||||
ans_queue.pop();
|
||||
}
|
||||
return evdw;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
/// Start timer on host
|
||||
inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
|
||||
@ -139,7 +157,7 @@ class PairGPUDevice {
|
||||
PairGPUNborShared _nbor_shared;
|
||||
|
||||
private:
|
||||
std::queue<PairGPUAns *> ans_queue;
|
||||
std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
|
||||
int _init_count;
|
||||
bool _device_init;
|
||||
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
|
||||
|
||||
Reference in New Issue
Block a user