Hybrid working, but not fully tested.

This commit is contained in:
W. Michael Brown
2011-01-27 17:52:08 -05:00
parent b9c7e2bb96
commit c1c9fd47de
25 changed files with 269 additions and 185 deletions

View File

@ -118,7 +118,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H)
$(CUDR) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp

View File

@ -68,7 +68,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H)
$(OCL) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp

View File

@ -23,11 +23,13 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
AtomicGPUMemoryT::~AtomicGPUMemory() {
delete ans;
delete nbor;
}
@ -54,7 +56,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,false,nlocal,host_nlocal,nall,nbor,maxspecial,
if (!device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,maxspecial,
gpu_nbor,_gpu_host,max_nbors,cell_size,false))
return false;
ucl_device=device->gpu;
@ -74,7 +76,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
pos_tex.bind_float(atom->dev_x,4);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return true;
}
@ -84,7 +86,7 @@ void AtomicGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes,
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
screen);
if (_compiled) {
@ -118,7 +120,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
@ -149,7 +151,7 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
@ -173,7 +175,7 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time,nbor->gpu_nbor());
atom->inum(inum);
ans->inum(inum);
host_start=inum;
if (ago==0) {
@ -187,7 +189,8 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
atom->add_x_data(host_x,host_type);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -210,7 +213,7 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(ago,inum_full);
atom->inum(inum);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
@ -227,7 +230,8 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
}
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_nbor.begin();

View File

@ -48,8 +48,9 @@ class AtomicGPUMemory {
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success))
if (atom->resize(nall, success))
pos_tex.bind_float(atom->dev_x,4);
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
@ -92,6 +93,7 @@ class AtomicGPUMemory {
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
/// Zero timers
@ -99,6 +101,7 @@ class AtomicGPUMemory {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
@ -148,6 +151,9 @@ class AtomicGPUMemory {
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------

View File

@ -23,11 +23,13 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
ChargeGPUMemoryT::~ChargeGPUMemory() {
delete ans;
delete nbor;
}
@ -54,8 +56,8 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(true,false,nlocal,host_nlocal,nall,nbor,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
if (!device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,maxspecial,
gpu_nbor,_gpu_host,max_nbors,cell_size,false))
return false;
ucl_device=device->gpu;
atom=&device->atom;
@ -75,7 +77,7 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return true;
}
@ -85,7 +87,7 @@ void ChargeGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes,
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
screen);
if (_compiled) {
@ -119,7 +121,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
@ -150,7 +152,7 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
@ -175,7 +177,7 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
ans->inum(inum);
host_start=inum;
if (ago==0) {
@ -188,10 +190,11 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_other_data();
atom->add_q_data();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -215,7 +218,7 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full,
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(ago,inum_full);
atom->inum(inum);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
@ -232,10 +235,11 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full,
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_other_data();
atom->add_q_data();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_nbor.begin();

View File

@ -48,10 +48,11 @@ class ChargeGPUMemory {
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success)) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
@ -94,6 +95,7 @@ class ChargeGPUMemory {
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
/// Zero timers
@ -101,6 +103,7 @@ class ChargeGPUMemory {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
@ -151,6 +154,10 @@ class ChargeGPUMemory {
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data

View File

@ -122,9 +122,9 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,15 +133,15 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();

View File

@ -133,9 +133,9 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -144,8 +144,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
@ -153,8 +153,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
}

View File

@ -133,9 +133,9 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -144,8 +144,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_smooth);
@ -153,8 +153,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_smooth);
}

View File

@ -137,9 +137,9 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -147,8 +147,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
@ -157,8 +157,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);

View File

@ -216,9 +216,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/BX));
int stride=gbm.nbor->nbor_pitch();
int ainum=gbm.atom->inum();
int ainum=gbm.ans->inum();
int anall=gbm.atom->nall();
if (gbm.multiple_forms) {
@ -237,11 +237,11 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
&stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.atom->inum()) {
if (gbm.last_ellipse==gbm.ans->inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
@ -254,9 +254,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
gbm.last_ellipse)/BX));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
gbm.time_kernel2.stop();
@ -266,13 +266,13 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(),
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall);
gbm.time_gayberne2.stop();
} else {
gbm.atom->dev_ans.zero();
gbm.atom->dev_engv.zero();
gbm.ans->dev_ans.zero();
gbm.ans->dev_engv.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
@ -284,29 +284,29 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.atom->inum()) {
if (gbm.last_ellipse<gbm.ans->inum()) {
if (gbm.shared_types) {
GBMF.k_lj_fast.set_size(GX,BX);
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
&stride, &gbm.nbor->dev_packed.begin(),
&gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
} else {
GBMF.k_lj.set_size(GX,BX);
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm._lj_types,
&gbm.gamma_upsilon_mu.begin(), &stride,
&gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
}
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
@ -315,8 +315,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(), &ainum,
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&stride, &gbm.ans->dev_ans.begin(), &ainum,
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall);
gbm.time_gayberne.stop();
}
@ -342,7 +342,7 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
gbm.atom->inum(inum);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
@ -361,10 +361,11 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
gbm.atom->add_x_data(host_x,host_type);
}
gbm.atom->add_other_data();
gbm.atom->add_quat_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return gbm.nbor->host_nbor.begin();
}
@ -399,7 +400,7 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
int ago=gbm.hd_balancer.ago_first(f_ago);
int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time,gbm.nbor->gpu_nbor());
gbm.atom->inum(inum);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
@ -419,10 +420,11 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
gbm.atom->add_other_data();
gbm.atom->add_quat_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return list;
}

View File

@ -32,12 +32,14 @@ template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
_max_bytes(0.0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor;
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
delete ans;
delete nbor;
}
@ -70,7 +72,7 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor,
if (!device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor,
_gpu_host,max_nbors,cell_size,true))
return false;
ucl_device=device->gpu;
@ -187,9 +189,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
}
if (multiple_forms)
atom->dev_ans.zero();
ans->dev_ans.zero();
_max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
// Memory for ilist ordered by particle type
return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
@ -212,7 +214,7 @@ void GB_GPU_MemoryT::clear() {
acc_timers();
double single[6], times[6];
single[0]=atom->transfer_time();
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
nbor->time_kernel.total_seconds();
@ -221,7 +223,7 @@ void GB_GPU_MemoryT::clear() {
single[4]=time_pair.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time();
single[5]=atom->cast_time()+ans->cast_time();
MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
@ -229,7 +231,7 @@ void GB_GPU_MemoryT::clear() {
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
gamma_upsilon_mu.row_bytes();
gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());

View File

@ -49,9 +49,10 @@ class GB_GPU_Memory {
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
atom->resize(inum, nall, success);
if (multiple_forms) atom->dev_ans.zero();
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
atom->resize(nall, success);
ans->resize(inum, success);
if (multiple_forms) ans->dev_ans.zero();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
@ -74,7 +75,7 @@ class GB_GPU_Memory {
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
@ -104,6 +105,7 @@ class GB_GPU_Memory {
time_pair.add_to_total();
}
atom->acc_timers();
ans->acc_timers();
}
/// Accumulate timers
@ -117,6 +119,7 @@ class GB_GPU_Memory {
time_pair.zero();
}
atom->zero_timers();
ans->zero_timers();
}
// -------------------------- DEVICE DATA -------------------------
@ -168,6 +171,10 @@ class GB_GPU_Memory {
int last_ellipse, max_last_ellipse;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data

View File

@ -25,6 +25,7 @@
#define NVD_TIMER_H
#include "nvd_macros.h"
#include "nvd_device.h"
namespace ucl_cudadr {
@ -66,12 +67,23 @@ class UCL_Timer {
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CU_SAFE_CALL(cuEventSynchronize(start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero() {
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
}
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()

View File

@ -25,6 +25,7 @@
#define OCL_TIMER_H
#include "ocl_macros.h"
#include "ocl_device.h"
namespace ucl_opencl {
@ -67,10 +68,21 @@ class UCL_Timer {
/// Stop timing on default command queue
inline void stop() { clEnqueueMarker(_cq,&stop_event); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()

View File

@ -122,9 +122,9 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,15 +133,15 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();

View File

@ -122,9 +122,9 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,15 +133,15 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();

View File

@ -134,9 +134,9 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -145,8 +145,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e);
@ -154,8 +154,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e);
}

View File

@ -132,9 +132,9 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -143,8 +143,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
@ -152,8 +152,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
}

View File

@ -138,11 +138,10 @@ void PairGPUAnsT::clear() {
if (!_allocated)
return;
time_pos.clear();
time_other.clear();
time_answer.clear();
clear_resize();
_inum=0;
_ilist=NULL;
_eflag=false;
_vflag=false;
}
@ -181,7 +180,6 @@ void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
time_answer.stop();
dev->add_answer_object(this);
}
template <class numtyp, class acctyp>
@ -199,7 +197,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
return 0.0;
double evdwl=0.0;
if (_gpu_nbor) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
@ -279,7 +277,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
double evdwl=0.0;
double _ecoul=0.0;
if (_gpu_nbor) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
@ -359,11 +357,8 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
template <class numtyp, class acctyp>
void PairGPUAnsT::get_answers(double **f, double **tor) {
_x_avail=false;
_q_avail=false;
_quat_avail=false;
acctyp *ap=host_ans.begin();
if (_gpu_nbor) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;

View File

@ -58,14 +58,12 @@ class PairGPUAns {
bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
/// Check if we have enough device storage and realloc if not
inline bool resize(const int inum, bool &success) {
inline void resize(const int inum, bool &success) {
_inum=inum;
if (inum>_max_local) {
clear_resize();
success = success && alloc(inum);
return true;
}
return false;
}
/// If already initialized by another LAMMPS style, add fields as necessary
@ -123,6 +121,16 @@ class PairGPUAns {
/// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor);
/// Copy device results into LAMMPS structures; returns accumulated evdwl
/** Stops (syncs) the asynchronous answer-copy timer, folds the
  * energy/virial terms into eatom/vatom/virial/ecoul via energy_virial(),
  * and adds forces (and torques, when present) into f/tor via
  * get_answers(f,tor).  Host-side unpack time is measured with
  * MPI_Wtime() and accumulated into _time_cast.
  * NOTE(review): assumes copy_answers() was issued earlier this timestep
  * so host_ans/host_engv hold valid device results -- confirm caller. **/
inline double get_answers(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
time_answer.sync_stop();
double ts=MPI_Wtime();
double evdw=energy_virial(eatom,vatom,virial,ecoul);
get_answers(f,tor);
_time_cast+=MPI_Wtime()-ts;
return evdw;
}
// ------------------------------ DATA ----------------------------------
/// Force and possibly torque

View File

@ -29,7 +29,8 @@ __win_sort _win_sort;
#endif
template <class numtyp, class acctyp>
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false) {
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
_max_gpu_bytes(0) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;
@ -108,7 +109,7 @@ bool PairGPUAtomT::alloc(const int nall) {
// --------------------------- Device allocations
_gpu_bytes=0;
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1);
@ -126,7 +127,7 @@ bool PairGPUAtomT::alloc(const int nall) {
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
_gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
@ -134,25 +135,27 @@ bool PairGPUAtomT::alloc(const int nall) {
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_q.row_bytes();
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_quat.row_bytes();
gpu_bytes+=dev_quat.row_bytes();
}
}
if (_gpu_nbor) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_tag.row_bytes();
gpu_bytes+=dev_tag.row_bytes();
}
}
_gpu_bytes+=dev_x.row_bytes();
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
@ -168,6 +171,7 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
_x_avail=false;
_q_avail=false;
_quat_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
@ -182,9 +186,11 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
// Initialize timers for the selected device
time_pos.init(*dev);
time_other.init(*dev);
time_q.init(*dev);
time_quat.init(*dev);
time_pos.zero();
time_other.zero();
time_q.zero();
time_quat.zero();
_time_cast=0.0;
#ifdef GPU_CAST
@ -252,12 +258,13 @@ void PairGPUAtomT::clear_resize() {
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
_gpu_bytes=0;
_max_gpu_bytes=0;
if (!_allocated)
return;
time_pos.clear();
time_other.clear();
time_q.clear();
time_quat.clear();
clear_resize();
#ifdef GPU_CAST

View File

@ -64,14 +64,15 @@ class PairGPUAtom {
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) {
_nall=nall;
if (nall>_max_atoms) {
clear_resize();
success = success && alloc(nall);
return true;
_resized=true;
}
return false;
return _resized;
}
/// If already initialized by another LAMMPS style, add fields as necessary
@ -94,26 +95,42 @@ class PairGPUAtom {
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
if (_other)
time_other.add_to_total();
if (_charge)
time_q.add_to_total();
if (_rot)
time_quat.add_to_total();
}
/// Add copy times to timers
inline void zero_timers() {
time_pos.zero();
if (_other)
time_other.zero();
if (_charge)
time_q.zero();
if (_rot)
time_quat.zero();
}
/// Return the total time for host/device data transfer
/** Zeros each timer's running total so that the atom transfer times are
  * only included once per output interval. **/
inline double transfer_time() {
  double total=time_pos.total_seconds();
  time_pos.zero_total();
  if (_charge) {
    total+=time_q.total_seconds();
    time_q.zero_total();
  }
  if (_rot) {
    // Fix: previously added time_q.total_seconds() here, which
    // double-counted the charge transfer and never reported the
    // quaternion transfer time.
    total+=time_quat.total_seconds();
    time_quat.zero_total();
  }
  return total;
}
/// Return the total time for data cast/pack
inline double cast_time() { return _time_cast; }
/** Zeros the time so that atom times are only included once **/
inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; }
/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>
@ -210,7 +227,7 @@ class PairGPUAtom {
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; }
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
/// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) {
@ -322,20 +339,9 @@ class PairGPUAtom {
}
}
/// Copy data other than pos and type to device
inline void add_other_data() {
if (_other) {
time_other.start();
if (_charge)
add_q_data();
if (_rot)
add_quat_data();
time_other.stop();
}
}
/// Return number of bytes used on device
inline double gpu_bytes() { return _gpu_bytes; }
/// Return the high-water mark of device bytes used and reset it
/** Zeros the stored maximum so each call reports the peak allocation
  * since the previous call (consumed by the timing/statistics output). **/
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
// ------------------------------ DATA ----------------------------------
@ -368,7 +374,7 @@ class PairGPUAtom {
UCL_D_Vec<int> dev_tag;
/// Device timers
UCL_Timer time_pos, time_other;
UCL_Timer time_pos, time_q, time_quat;
/// Geryon device
UCL_Device *dev;
@ -383,7 +389,7 @@ class PairGPUAtom {
bool _compiled;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail;
bool _x_avail, _q_avail, _quat_avail, _resized;
bool alloc(const int nall);
@ -392,7 +398,7 @@ class PairGPUAtom {
bool _gpu_nbor, _bonds;
double _time_cast;
double _gpu_bytes;
double _max_gpu_bytes;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;

View File

@ -117,10 +117,11 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
}
template <class numtyp, class acctyp>
bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial,
const bool gpu_nbor, const int gpu_host,
PairGPUNbor *nbor, const int maxspecial,
const bool gpu_nbor, const int gpu_host,
const int max_nbors, const double cell_size,
const bool pre_cut) {
if (!_device_init)
@ -133,12 +134,14 @@ bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
gpu_nbor && maxspecial>0))
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
return false;
} else
atom.add_fields(charge,rot);
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return false;
if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut))
return false;
@ -189,20 +192,21 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
const double avg_split,
void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes, FILE *screen) {
double single[5], times[5];
single[0]=atom.transfer_time();
single[0]=atom.transfer_time()+ans.transfer_time();
single[1]=nbor.time_nbor.total_seconds();
single[2]=nbor.time_kernel.total_seconds();
single[3]=time_pair.total_seconds();
single[4]=atom.cast_time();
single[4]=atom.cast_time()+ans.cast_time();
MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes;
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
@ -276,15 +280,5 @@ void lmp_clear_device() {
double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
if (pair_gpu_device.init_count()) {
pair_gpu_device.stop_host_timer();
pair_gpu_device.gpu->sync();
double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
pair_gpu_device.atom.get_answers(f,tor);
pair_gpu_device.atom.data_unavail();
return evdw;
}
return 0.0;
return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}

View File

@ -53,18 +53,19 @@ class PairGPUDevice {
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel **/
bool init(const bool charge, const bool rot, const int nlocal,
const int host_nlocal, const int nall, PairGPUNbor *nbor,
const int maxspecial, const bool gpu_nbor, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut);
bool init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial, const bool gpu_nbor,
const int gpu_host, const int max_nbors, const double cell_size,
const bool pre_cut);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);
/// Output a message with timing information
void output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
const double avg_split,
void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes, FILE *screen);
/// Clear all memory on host and device associated with atom and nbor data
@ -74,7 +75,24 @@ class PairGPUDevice {
void clear_device();
/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
inline void add_ans_object(PairGPUAns *ans) { ans_queue.push(ans); }
inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
{ ans_queue.push(ans); }
/// Add "answers" (force,energies,etc.) into LAMMPS structures
/** Drains every queued answer object, accumulating the van der Waals
  * energy returned by each; atom data is always marked stale for the
  * next timestep, and the host timer is stopped only when answers are
  * actually pending. **/
inline double fix_gpu(double **f, double **tor, double *eatom,
                      double **vatom, double *virial, double &ecoul) {
  atom.data_unavail();
  if (ans_queue.empty())
    return 0.0;
  stop_host_timer();
  double energy=0.0;
  do {
    energy+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
    ans_queue.pop();
  } while (!ans_queue.empty());
  return energy;
}
/// Start timer on host
inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
@ -139,7 +157,7 @@ class PairGPUDevice {
PairGPUNborShared _nbor_shared;
private:
std::queue<PairGPUAns *> ans_queue;
std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
int _init_count;
bool _device_init;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;