From c1c9fd47dee5a226fc22ea0edc35feddcb38835e Mon Sep 17 00:00:00 2001 From: "W. Michael Brown" Date: Thu, 27 Jan 2011 17:52:08 -0500 Subject: [PATCH] Hybrid working, but not fully tested. --- lib/gpu/Nvidia.makefile | 2 +- lib/gpu/Opencl.makefile | 2 +- lib/gpu/atomic_gpu_memory.cpp | 22 ++++++++----- lib/gpu/atomic_gpu_memory.h | 8 ++++- lib/gpu/charge_gpu_memory.cpp | 28 +++++++++------- lib/gpu/charge_gpu_memory.h | 9 ++++- lib/gpu/cmm_cut_gpu_memory.cpp | 12 +++---- lib/gpu/cmmc_long_gpu_memory.cpp | 12 +++---- lib/gpu/cmmc_msm_gpu_memory.cpp | 12 +++---- lib/gpu/crml_gpu_memory.cpp | 12 +++---- lib/gpu/gb_gpu.cpp | 50 ++++++++++++++-------------- lib/gpu/gb_gpu_memory.cpp | 14 ++++---- lib/gpu/gb_gpu_memory.h | 15 ++++++--- lib/gpu/geryon/nvd_timer.h | 12 +++++++ lib/gpu/geryon/ocl_timer.h | 12 +++++++ lib/gpu/lj96_cut_gpu_memory.cpp | 12 +++---- lib/gpu/lj_cut_gpu_memory.cpp | 12 +++---- lib/gpu/ljc_cut_gpu_memory.cpp | 12 +++---- lib/gpu/ljcl_cut_gpu_memory.cpp | 12 +++---- lib/gpu/pair_gpu_ans.cpp | 13 +++----- lib/gpu/pair_gpu_ans.h | 14 ++++++-- lib/gpu/pair_gpu_atom.cpp | 31 +++++++++++------- lib/gpu/pair_gpu_atom.h | 56 ++++++++++++++++++-------------- lib/gpu/pair_gpu_device.cpp | 36 +++++++++----------- lib/gpu/pair_gpu_device.h | 34 ++++++++++++++----- 25 files changed, 269 insertions(+), 185 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 3c859ffdd4..967fbba76f 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -118,7 +118,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H) $(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H) +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(CUDR) -o $@ -c pair_gpu_device.cpp $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index e488a56bc0..b0b4857770 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -68,7 +68,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h $(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H) +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OCL) -o $@ -c pair_gpu_device.cpp $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp index bea44c4ced..af7ed795a5 100644 --- a/lib/gpu/atomic_gpu_memory.cpp +++ b/lib/gpu/atomic_gpu_memory.cpp @@ -23,11 +23,13 @@ extern PairGPUDevice pair_gpu_device; template AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) { device=&pair_gpu_device; + ans=new PairGPUAns(); nbor=new PairGPUNbor(); } template AtomicGPUMemoryT::~AtomicGPUMemory() { + delete ans; delete nbor; } @@ -54,7 +56,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; - if (!device->init(false,false,nlocal,host_nlocal,nall,nbor,maxspecial, + if (!device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,maxspecial, gpu_nbor,_gpu_host,max_nbors,cell_size,false)) return false; ucl_device=device->gpu; @@ -74,7 +76,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, pos_tex.bind_float(atom->dev_x,4); - _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); return true; } @@ -84,7 +86,7 @@ void AtomicGPUMemoryT::clear_atomic() { // Output any timing information acc_timers(); double avg_split=hd_balancer.all_avg_split(); - device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes, + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, screen); if (_compiled) { @@ -118,7 +120,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, nbor->get_host(inum,ilist,numj,firstneigh,block_size()); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -149,7 +151,7 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum, nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag, nspecial, special, success, mn); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } @@ -173,7 +175,7 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full, int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time,nbor->gpu_nbor()); - atom->inum(inum); + ans->inum(inum); host_start=inum; if (ago==0) { @@ -187,7 +189,8 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full, atom->add_x_data(host_x,host_type); loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom,ilist); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -210,7 +213,7 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full, hd_balancer.balance(cpu_time,nbor->gpu_nbor()); int inum=hd_balancer.get_gpu_count(ago,inum_full); - atom->inum(inum); + ans->inum(inum); host_start=inum; // Build neighbor list on GPU if necessary @@ -227,7 +230,8 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full, } loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); hd_balancer.stop_timer(); return nbor->host_nbor.begin(); diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h index 642ff56afd..11cda990af 100644 --- a/lib/gpu/atomic_gpu_memory.h +++ b/lib/gpu/atomic_gpu_memory.h @@ -48,8 +48,9 @@ class AtomicGPUMemory { /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - if (atom->resize(inum, nall, success)) + if (atom->resize(nall, success)) pos_tex.bind_float(atom->dev_x,4); + ans->resize(inum,success); } /// Check if there is enough storage for neighbors and realloc if not @@ -92,6 +93,7 @@ class AtomicGPUMemory { } time_pair.add_to_total(); atom->acc_timers(); + ans->acc_timers(); } /// Zero timers @@ -99,6 +101,7 @@ class AtomicGPUMemory { nbor_time_avail=false; time_pair.zero(); atom->zero_timers(); + ans->zero_timers(); } /// Copy neighbor list from host @@ -148,6 +151,9 @@ class AtomicGPUMemory { /// Atom Data PairGPUAtom *atom; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; // --------------------------- NBOR DATA ---------------------------- diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp index 71a2f1622b..4460077ff4 100644 --- a/lib/gpu/charge_gpu_memory.cpp +++ b/lib/gpu/charge_gpu_memory.cpp @@ -23,11 +23,13 @@ extern PairGPUDevice pair_gpu_device; template ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) { device=&pair_gpu_device; + ans=new PairGPUAns(); nbor=new PairGPUNbor(); } template ChargeGPUMemoryT::~ChargeGPUMemory() { + delete ans; delete nbor; } @@ -54,8 +56,8 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; - if (!device->init(true,false,nlocal,host_nlocal,nall,nbor,maxspecial,gpu_nbor, - _gpu_host,max_nbors,cell_size,false)) + if (!device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,maxspecial, + gpu_nbor,_gpu_host,max_nbors,cell_size,false)) return false; ucl_device=device->gpu; atom=&device->atom; @@ -75,7 +77,7 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, pos_tex.bind_float(atom->dev_x,4); q_tex.bind_float(atom->dev_q,1); - _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); return true; } @@ -85,7 +87,7 @@ void ChargeGPUMemoryT::clear_atomic() { // Output any timing information acc_timers(); double avg_split=hd_balancer.all_avg_split(); - device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes, + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, screen); if (_compiled) { @@ -119,7 +121,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, nbor->get_host(inum,ilist,numj,firstneigh,block_size()); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -150,7 +152,7 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum, nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag, nspecial, special, success, mn); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } @@ -175,7 +177,7 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full, int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time, nbor->gpu_nbor()); - atom->inum(inum); + ans->inum(inum); host_start=inum; if (ago==0) { @@ -188,10 +190,11 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full, atom->cast_q_data(host_q); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); - atom->add_other_data(); + atom->add_q_data(); loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom,ilist); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -215,7 +218,7 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full, hd_balancer.balance(cpu_time,nbor->gpu_nbor()); int inum=hd_balancer.get_gpu_count(ago,inum_full); - atom->inum(inum); + ans->inum(inum); host_start=inum; // Build neighbor list on GPU if necessary @@ -232,10 +235,11 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full, hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } - atom->add_other_data(); + atom->add_q_data(); loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); hd_balancer.stop_timer(); return nbor->host_nbor.begin(); diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h index 42e7307f47..f35b14a8c2 100644 --- a/lib/gpu/charge_gpu_memory.h +++ b/lib/gpu/charge_gpu_memory.h @@ -48,10 +48,11 @@ class ChargeGPUMemory { /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - if (atom->resize(inum, nall, success)) { + if (atom->resize(nall, success)) { pos_tex.bind_float(atom->dev_x,4); q_tex.bind_float(atom->dev_q,1); } + ans->resize(inum,success); } /// Check if there is enough storage for neighbors and realloc if not @@ -94,6 +95,7 @@ class ChargeGPUMemory { } time_pair.add_to_total(); atom->acc_timers(); + ans->acc_timers(); } /// Zero timers @@ -101,6 +103,7 @@ class ChargeGPUMemory { nbor_time_avail=false; time_pair.zero(); atom->zero_timers(); + ans->zero_timers(); } /// Copy neighbor list from host @@ -151,6 +154,10 @@ class ChargeGPUMemory { PairGPUAtom *atom; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; + // --------------------------- NBOR DATA ---------------------------- /// Neighbor data diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp b/lib/gpu/cmm_cut_gpu_memory.cpp index e5a83e5872..53f219274c 100644 --- a/lib/gpu/cmm_cut_gpu_memory.cpp +++ b/lib/gpu/cmm_cut_gpu_memory.cpp @@ -122,9 +122,9 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,15 +133,15 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch); } this->time_pair.stop(); diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp index 9a63bc5628..2091122832 100644 --- a/lib/gpu/cmmc_long_gpu_memory.cpp +++ b/lib/gpu/cmmc_long_gpu_memory.cpp @@ -133,9 +133,9 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -144,8 +144,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald); @@ -153,8 +153,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald); } diff --git a/lib/gpu/cmmc_msm_gpu_memory.cpp b/lib/gpu/cmmc_msm_gpu_memory.cpp index 6fced81d86..7675005348 100644 --- a/lib/gpu/cmmc_msm_gpu_memory.cpp +++ b/lib/gpu/cmmc_msm_gpu_memory.cpp @@ -133,9 +133,9 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -144,8 +144,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_smooth); @@ -153,8 +153,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_smooth); } diff --git a/lib/gpu/crml_gpu_memory.cpp b/lib/gpu/crml_gpu_memory.cpp index e877503e87..f051a349f9 100644 --- a/lib/gpu/crml_gpu_memory.cpp +++ b/lib/gpu/crml_gpu_memory.cpp @@ -137,9 +137,9 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -147,8 +147,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq, @@ -157,8 +157,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq); diff --git a/lib/gpu/gb_gpu.cpp b/lib/gpu/gb_gpu.cpp index c1b403fe24..9212e43cb6 100644 --- a/lib/gpu/gb_gpu.cpp +++ b/lib/gpu/gb_gpu.cpp @@ -216,9 +216,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(gbm.atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(gbm.ans->inum())/BX)); int stride=gbm.nbor->nbor_pitch(); - int ainum=gbm.atom->inum(); + int ainum=gbm.ans->inum(); int anall=gbm.atom->nall(); if (gbm.multiple_forms) { @@ -237,11 +237,11 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), - &stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(), + &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall); gbm.time_gayberne.stop(); - if (gbm.last_ellipse==gbm.atom->inum()) { + if (gbm.last_ellipse==gbm.ans->inum()) { gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); @@ -254,9 +254,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // ------------ SPHERE_ELLIPSE --------------- gbm.time_kernel2.start(); - GX=static_cast(ceil(static_cast(gbm.atom->inum()- + GX=static_cast(ceil(static_cast(gbm.ans->inum()- gbm.last_ellipse)/BX)); - gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(), + gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(), SPHERE_ELLIPSE,SPHERE_ELLIPSE); gbm.time_kernel2.stop(); @@ -266,13 +266,13 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), - &gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, + &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); gbm.time_gayberne2.stop(); } else { - gbm.atom->dev_ans.zero(); - gbm.atom->dev_engv.zero(); + gbm.ans->dev_ans.zero(); + gbm.ans->dev_engv.zero(); gbm.time_kernel.stop(); gbm.time_gayberne.start(); gbm.time_gayberne.stop(); @@ -284,29 +284,29 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // ------------ LJ --------------- gbm.time_pair.start(); - if (gbm.last_ellipseinum()) { + if (gbm.last_ellipseinum()) { if (gbm.shared_types) { GBMF.k_lj_fast.set_size(GX,BX); GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), &gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(), &stride, &gbm.nbor->dev_packed.begin(), - &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), + &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); } else { GBMF.k_lj.set_size(GX,BX); GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), &gbm.lj3.begin(), &gbm._lj_types, &gbm.gamma_upsilon_mu.begin(), &stride, - &gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), + &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); } } gbm.time_pair.stop(); } else { gbm.time_kernel.start(); - gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE, + gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); @@ -315,8 +315,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), - &stride, &gbm.atom->dev_ans.begin(), &ainum, - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), + &stride, &gbm.ans->dev_ans.begin(), &ainum, + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &ainum, &anall); gbm.time_gayberne.stop(); } @@ -342,7 +342,7 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago, gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor()); int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full); - gbm.atom->inum(inum); + gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; @@ -361,10 +361,11 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago, gbm.atom->add_x_data(host_x,host_type); } - gbm.atom->add_other_data(); + gbm.atom->add_quat_data(); _gb_gpu_gayberne(gbm,eflag,vflag); - gbm.atom->copy_answers(eflag,vflag,eatom,vatom); + gbm.ans->copy_answers(eflag,vflag,eatom,vatom); + gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); return gbm.nbor->host_nbor.begin(); } @@ -399,7 +400,7 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full, int ago=gbm.hd_balancer.ago_first(f_ago); int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time,gbm.nbor->gpu_nbor()); - gbm.atom->inum(inum); + gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; @@ -419,10 +420,11 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full, gbm.atom->cast_quat_data(host_quat[0]); gbm.hd_balancer.start_timer(); gbm.atom->add_x_data(host_x,host_type); - gbm.atom->add_other_data(); + gbm.atom->add_quat_data(); _gb_gpu_gayberne(gbm,eflag,vflag); - gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list); + gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list); + gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); return list; } diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp index 622bdd73fc..ac2923aa9c 100644 --- a/lib/gpu/gb_gpu_memory.cpp +++ b/lib/gpu/gb_gpu_memory.cpp @@ -32,12 +32,14 @@ template GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false), _max_bytes(0.0) { device=&pair_gpu_device; + ans=new PairGPUAns(); nbor=new PairGPUNbor; } template GB_GPU_MemoryT::~GB_GPU_Memory() { clear(); + delete ans; delete nbor; } @@ -70,7 +72,7 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, if (host_nlocal>0) _gpu_host=1; - if (!device->init(false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor, + if (!device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor, _gpu_host,max_nbors,cell_size,true)) return false; ucl_device=device->gpu; @@ -187,9 +189,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, } if (multiple_forms) - atom->dev_ans.zero(); + ans->dev_ans.zero(); - _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); // Memory for ilist ordered by particle type return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS); @@ -212,7 +214,7 @@ void GB_GPU_MemoryT::clear() { acc_timers(); double single[6], times[6]; - single[0]=atom->transfer_time(); + single[0]=atom->transfer_time()+ans->transfer_time(); single[1]=nbor->time_nbor.total_seconds(); single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+ nbor->time_kernel.total_seconds(); @@ -221,7 +223,7 @@ void GB_GPU_MemoryT::clear() { single[4]=time_pair.total_seconds(); else single[4]=0; - single[5]=atom->cast_time(); + single[5]=atom->cast_time()+ans->cast_time(); MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica()); double avg_split=hd_balancer.all_avg_split(); @@ -229,7 +231,7 @@ void GB_GPU_MemoryT::clear() { _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+ sigma_epsilon.row_bytes()+cut_form.row_bytes()+ shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+ - gamma_upsilon_mu.row_bytes(); + gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h index 2cfc805cd8..b9c0ce8c53 100644 --- a/lib/gpu/gb_gpu_memory.h +++ b/lib/gpu/gb_gpu_memory.h @@ -49,9 +49,10 @@ class GB_GPU_Memory { /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - atom->resize(inum, nall, success); - if (multiple_forms) atom->dev_ans.zero(); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + atom->resize(nall, success); + ans->resize(inum, success); + if (multiple_forms) ans->dev_ans.zero(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_bytes) _max_bytes=bytes; } @@ -74,7 +75,7 @@ class GB_GPU_Memory { success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); } nbor->resize(nlocal,host_inum,max_nbors,success); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_bytes) _max_bytes=bytes; } @@ -104,6 +105,7 @@ class GB_GPU_Memory { time_pair.add_to_total(); } atom->acc_timers(); + ans->acc_timers(); } /// Accumulate timers @@ -117,6 +119,7 @@ class GB_GPU_Memory { time_pair.zero(); } atom->zero_timers(); + ans->zero_timers(); } // -------------------------- DEVICE DATA ------------------------- @@ -168,6 +171,10 @@ class GB_GPU_Memory { int last_ellipse, max_last_ellipse; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; + // --------------------------- NBOR DATA ---------------------------- /// Neighbor data diff --git a/lib/gpu/geryon/nvd_timer.h b/lib/gpu/geryon/nvd_timer.h index 59001c03fd..1f39abb971 100644 --- a/lib/gpu/geryon/nvd_timer.h +++ b/lib/gpu/geryon/nvd_timer.h @@ -25,6 +25,7 @@ #define NVD_TIMER_H #include "nvd_macros.h" +#include "nvd_device.h" namespace ucl_cudadr { @@ -66,12 +67,23 @@ class UCL_Timer { /// Stop timing on command queue inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); } + /// Block until the start event has been reached on device + inline void sync_start() + { CU_SAFE_CALL(cuEventSynchronize(start_event)); } + + /// Block until the stop event has been reached on device + inline void sync_stop() + { CU_SAFE_CALL(cuEventSynchronize(stop_event)); } + /// Set the time elapsed to zero (not the total_time) inline void zero() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); } + /// Set the total time to zero + inline void zero_total() { _total_time=0.0; } + /// Add time from previous start and stop to total /** Forces synchronization **/ inline double add_to_total() diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index aafb0aac4b..a268dca256 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -25,6 +25,7 @@ #define OCL_TIMER_H #include "ocl_macros.h" +#include "ocl_device.h" namespace ucl_opencl { @@ -67,10 +68,21 @@ class UCL_Timer { /// Stop timing on default command queue inline void stop() { clEnqueueMarker(_cq,&stop_event); } + /// Block until the start event has been reached on device + inline void sync_start() + { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); } + + /// Block until the stop event has been reached on device + inline void sync_stop() + { CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); } + /// Set the time elapsed to zero (not the total_time) inline void zero() { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); } + /// Set the total time to zero + inline void zero_total() { _total_time=0.0; } + /// Add time from previous start and stop to total /** Forces synchronization **/ inline double add_to_total() diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp index d365d71044..c30c5d8caf 100644 --- a/lib/gpu/lj96_cut_gpu_memory.cpp +++ b/lib/gpu/lj96_cut_gpu_memory.cpp @@ -122,9 +122,9 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,15 +133,15 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch); } this->time_pair.stop(); diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp index 23b2fcf6d0..27a759fec1 100644 --- a/lib/gpu/lj_cut_gpu_memory.cpp +++ b/lib/gpu/lj_cut_gpu_memory.cpp @@ -122,9 +122,9 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,15 +133,15 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch); } this->time_pair.stop(); diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp index d63ed6e5d9..b662b653fe 100644 --- a/lib/gpu/ljc_cut_gpu_memory.cpp +++ b/lib/gpu/ljc_cut_gpu_memory.cpp @@ -134,9 +134,9 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -145,8 +145,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), &_qqrd2e); @@ -154,8 +154,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), &_qqrd2e); } diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp index a126309a92..1f638c030c 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.cpp +++ b/lib/gpu/ljcl_cut_gpu_memory.cpp @@ -132,9 +132,9 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -143,8 +143,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald); @@ -152,8 +152,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald); } diff --git a/lib/gpu/pair_gpu_ans.cpp b/lib/gpu/pair_gpu_ans.cpp index e6982e6eba..a06ba9e394 100644 --- a/lib/gpu/pair_gpu_ans.cpp +++ b/lib/gpu/pair_gpu_ans.cpp @@ -138,11 +138,10 @@ void PairGPUAnsT::clear() { if (!_allocated) return; - time_pos.clear(); - time_other.clear(); time_answer.clear(); clear_resize(); _inum=0; + _ilist=NULL; _eflag=false; _vflag=false; } @@ -181,7 +180,6 @@ void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag, else ucl_copy(host_ans,dev_ans,_inum*4,true); time_answer.stop(); - dev->add_answer_object(this); } template @@ -199,7 +197,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom, return 0.0; double evdwl=0.0; - if (_gpu_nbor) { + if (_ilist==NULL) { for (int i=0; i<_inum; i++) { acctyp *ap=host_engv.begin()+i; if (_eflag) { @@ -279,7 +277,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom, double evdwl=0.0; double _ecoul=0.0; - if (_gpu_nbor) { + if (_ilist==NULL) { for (int i=0; i<_inum; i++) { acctyp *ap=host_engv.begin()+i; if (_eflag) { @@ -359,11 +357,8 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom, template void PairGPUAnsT::get_answers(double **f, double **tor) { - _x_avail=false; - _q_avail=false; - _quat_avail=false; acctyp *ap=host_ans.begin(); - if (_gpu_nbor) { + if (_ilist==NULL) { for (int i=0; i<_inum; i++) { f[i][0]+=*ap; ap++; diff --git a/lib/gpu/pair_gpu_ans.h b/lib/gpu/pair_gpu_ans.h index a93ed6fcd5..7c876ee246 100644 --- a/lib/gpu/pair_gpu_ans.h +++ b/lib/gpu/pair_gpu_ans.h @@ -58,14 +58,12 @@ class PairGPUAns { bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev); /// Check if we have enough device storage and realloc if not - inline bool resize(const int inum, bool &success) { + inline void resize(const int inum, bool &success) { _inum=inum; if (inum>_max_local) { clear_resize(); success = success && alloc(inum); - return true; } - return false; } /// If already initialized by another LAMMPS style, add fields as necessary @@ -123,6 +121,16 @@ class PairGPUAns { /// Add forces and torques from the GPU into a LAMMPS pointer void get_answers(double **f, double **tor); + inline double get_answers(double **f, double **tor, double *eatom, + double **vatom, double *virial, double &ecoul) { + time_answer.sync_stop(); + double ts=MPI_Wtime(); + double evdw=energy_virial(eatom,vatom,virial,ecoul); + get_answers(f,tor); + _time_cast+=MPI_Wtime()-ts; + return evdw; + } + // ------------------------------ DATA ---------------------------------- /// Force and possibly torque diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp index 812a7c82d6..6342d2c3fc 100644 --- a/lib/gpu/pair_gpu_atom.cpp +++ b/lib/gpu/pair_gpu_atom.cpp @@ -29,7 +29,8 @@ __win_sort _win_sort; #endif template -PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false) { +PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false), + _max_gpu_bytes(0) { #ifndef USE_OPENCL sort_config.op = CUDPP_ADD; sort_config.datatype = CUDPP_UINT; @@ -108,7 +109,7 @@ bool PairGPUAtomT::alloc(const int nall) { // --------------------------- Device allocations - _gpu_bytes=0; + int gpu_bytes=0; if (cpuview) { #ifdef GPU_CAST assert(0==1); @@ -126,7 +127,7 @@ bool PairGPUAtomT::alloc(const int nall) { dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)); success=success && (UCL_SUCCESS== dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)); - _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); + gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); #else success=success && (UCL_SUCCESS== dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY)); @@ -134,25 +135,27 @@ bool PairGPUAtomT::alloc(const int nall) { if (_charge) { success=success && (dev_q.alloc(_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - _gpu_bytes+=dev_q.row_bytes(); + gpu_bytes+=dev_q.row_bytes(); } if (_rot) { success=success && (dev_quat.alloc(_max_atoms*4,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - _gpu_bytes+=dev_quat.row_bytes(); + gpu_bytes+=dev_quat.row_bytes(); } } if (_gpu_nbor) { success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); - _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); + gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); if (_bonds) { success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); - _gpu_bytes+=dev_tag.row_bytes(); + gpu_bytes+=dev_tag.row_bytes(); } } - _gpu_bytes+=dev_x.row_bytes(); + gpu_bytes+=dev_x.row_bytes(); + if (gpu_bytes>_max_gpu_bytes) + _max_gpu_bytes=gpu_bytes; _allocated=true; return success; @@ -168,6 +171,7 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot, _x_avail=false; _q_avail=false; _quat_avail=false; + _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; @@ -182,9 +186,11 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot, // Initialize timers for the selected device time_pos.init(*dev); - time_other.init(*dev); + time_q.init(*dev); + time_quat.init(*dev); time_pos.zero(); - time_other.zero(); + time_q.zero(); + time_quat.zero(); _time_cast=0.0; #ifdef GPU_CAST @@ -252,12 +258,13 @@ void PairGPUAtomT::clear_resize() { template void PairGPUAtomT::clear() { - _gpu_bytes=0; + _max_gpu_bytes=0; if (!_allocated) return; time_pos.clear(); - time_other.clear(); + time_q.clear(); + time_quat.clear(); clear_resize(); #ifdef GPU_CAST diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index c4c6b1586f..45e29753b8 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -64,14 +64,15 @@ class PairGPUAtom { UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false); /// Check if we have enough device storage and realloc if not + /** Returns true if resized with any call during this timestep **/ inline bool resize(const int nall, bool &success) { _nall=nall; if (nall>_max_atoms) { clear_resize(); success = success && alloc(nall); - return true; + _resized=true; } - return false; + return _resized; } /// If already initialized by another LAMMPS style, add fields as necessary @@ -94,26 +95,42 @@ class PairGPUAtom { /// Add copy times to timers inline void acc_timers() { time_pos.add_to_total(); - if (_other) - time_other.add_to_total(); + if (_charge) + time_q.add_to_total(); + if (_rot) + time_quat.add_to_total(); } /// Add copy times to timers inline void zero_timers() { time_pos.zero(); - if (_other) - time_other.zero(); + if (_charge) + time_q.zero(); + if (_rot) + time_quat.zero(); } /// Return the total time for host/device data transfer + /** Zeros the total so that the atom times are only included once **/ inline double transfer_time() { double total=time_pos.total_seconds(); - if (_other) total+=time_other.total_seconds(); + time_pos.zero_total(); + if (_charge) { + total+=time_q.total_seconds(); + time_q.zero_total(); + } + if (_rot) { + total+=time_q.total_seconds(); + time_quat.zero_total(); + } + return total; } /// Return the total time for data cast/pack - inline double cast_time() { return _time_cast; } + /** Zeros the time so that atom times are only included once **/ + inline double cast_time() + { double t=_time_cast; _time_cast=0.0; return t; } /// Pack LAMMPS atom type constants into matrix and copy to device template @@ -210,7 +227,7 @@ class PairGPUAtom { /// Signal that we need to transfer atom data for next timestep inline void data_unavail() - { _x_avail=false; _q_avail=false; _quat_avail=false; } + { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; } /// Cast positions and types to write buffer inline void cast_x_data(double **host_ptr, const int *host_type) { @@ -322,20 +339,9 @@ class PairGPUAtom { } } - /// Copy data other than pos and type to device - inline void add_other_data() { - if (_other) { - time_other.start(); - if (_charge) - add_q_data(); - if (_rot) - add_quat_data(); - time_other.stop(); - } - } - /// Return number of bytes used on device - inline double gpu_bytes() { return _gpu_bytes; } + inline double max_gpu_bytes() + { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } // ------------------------------ DATA ---------------------------------- @@ -368,7 +374,7 @@ class PairGPUAtom { UCL_D_Vec dev_tag; /// Device timers - UCL_Timer time_pos, time_other; + UCL_Timer time_pos, time_q, time_quat; /// Geryon device UCL_Device *dev; @@ -383,7 +389,7 @@ class PairGPUAtom { bool _compiled; // True if data has been copied to device already - bool _x_avail, _q_avail, _quat_avail; + bool _x_avail, _q_avail, _quat_avail, _resized; bool alloc(const int nall); @@ -392,7 +398,7 @@ class PairGPUAtom { bool _gpu_nbor, _bonds; double _time_cast; - double _gpu_bytes; + double _max_gpu_bytes; #ifndef USE_OPENCL CUDPPConfiguration sort_config; diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index 718e9d9ddb..c98c72e04f 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -117,10 +117,11 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, } template -bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, +bool PairGPUDeviceT::init(PairGPUAns &ans, const bool charge, + const bool rot, const int nlocal, const int host_nlocal, const int nall, - PairGPUNbor *nbor, const int maxspecial, - const bool gpu_nbor, const int gpu_host, + PairGPUNbor *nbor, const int maxspecial, + const bool gpu_nbor, const int gpu_host, const int max_nbors, const double cell_size, const bool pre_cut) { if (!_device_init) @@ -133,12 +134,14 @@ bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor, - gpu_nbor && maxspecial>0)) + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0)) return false; } else atom.add_fields(charge,rot); + if (!ans.init(ef_nlocal,charge,rot,*gpu)) + return false; + if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut)) return false; @@ -189,20 +192,21 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, } template -void PairGPUDeviceT::output_times(UCL_Timer &time_pair, PairGPUNbor &nbor, - const double avg_split, +void PairGPUDeviceT::output_times(UCL_Timer &time_pair, + PairGPUAns &ans, + PairGPUNbor &nbor, const double avg_split, const double max_bytes, FILE *screen) { double single[5], times[5]; - single[0]=atom.transfer_time(); + single[0]=atom.transfer_time()+ans.transfer_time(); single[1]=nbor.time_nbor.total_seconds(); single[2]=nbor.time_kernel.total_seconds(); single[3]=time_pair.total_seconds(); - single[4]=atom.cast_time(); + single[4]=atom.cast_time()+ans.cast_time(); MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica); - double my_max_bytes=max_bytes; + double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); @@ -276,15 +280,5 @@ void lmp_clear_device() { double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { - if (pair_gpu_device.init_count()) { - pair_gpu_device.stop_host_timer(); - pair_gpu_device.gpu->sync(); - double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul); - pair_gpu_device.atom.get_answers(f,tor); - pair_gpu_device.atom.data_unavail(); - - return evdw; - } - return 0.0; + return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); } - diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index 1ef85c78ad..fde23ce2c3 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -53,18 +53,19 @@ class PairGPUDevice { * \param cell_size cutoff+skin * \param pre_cut True if cutoff test will be performed in separate kernel * than the force kernel **/ - bool init(const bool charge, const bool rot, const int nlocal, - const int host_nlocal, const int nall, PairGPUNbor *nbor, - const int maxspecial, const bool gpu_nbor, const int gpu_host, - const int max_nbors, const double cell_size, const bool pre_cut); + bool init(PairGPUAns &a, const bool charge, const bool rot, + const int nlocal, const int host_nlocal, const int nall, + PairGPUNbor *nbor, const int maxspecial, const bool gpu_nbor, + const int gpu_host, const int max_nbors, const double cell_size, + const bool pre_cut); /// Output a message for pair_style acceleration with device stats void init_message(FILE *screen, const char *name, const int first_gpu, const int last_gpu); /// Output a message with timing information - void output_times(UCL_Timer &time_pair, PairGPUNbor &nbor, - const double avg_split, + void output_times(UCL_Timer &time_pair, PairGPUAns &ans, + PairGPUNbor &nbor, const double avg_split, const double max_bytes, FILE *screen); /// Clear all memory on host and device associated with atom and nbor data @@ -74,7 +75,24 @@ class PairGPUDevice { void clear_device(); /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS - inline void add_ans_object(PairGPUAns *ans) { ans_queue.push(ans); } + inline void add_ans_object(PairGPUAns *ans) + { ans_queue.push(ans); } + + /// Add "answers" (force,energies,etc.) into LAMMPS structures + inline double fix_gpu(double **f, double **tor, double *eatom, + double **vatom, double *virial, double &ecoul) { + atom.data_unavail(); + if (ans_queue.empty()==false) { + stop_host_timer(); + double evdw=0.0; + while (ans_queue.empty()==false) { + evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul); + ans_queue.pop(); + } + return evdw; + } + return 0.0; + } /// Start timer on host inline void start_host_timer() { _cpu_full=MPI_Wtime(); } @@ -139,7 +157,7 @@ class PairGPUDevice { PairGPUNborShared _nbor_shared; private: - std::queue ans_queue; + std::queue *> ans_queue; int _init_count; bool _device_init; MPI_Comm _comm_world, _comm_replica, _comm_gpu;