diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 2d2a751f85..129bdbbdef 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -260,6 +260,9 @@ class UCL_Device { /// List all devices along with all properties inline void print_all(std::ostream &out); + /// Select the platform that has accelerators (for compatibility with OpenCL) + inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } + private: int _device, _num_devices; std::vector _properties; diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index f6a0b109f2..57880d7ca9 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -322,10 +322,12 @@ class Atom { // Copy charges to device asynchronously inline void add_q_data() { + time_q.start(); if (_q_avail==false) { q.update_device(_nall,true); _q_avail=true; } + time_q.stop(); } // Cast quaternions to write buffer @@ -347,10 +349,12 @@ class Atom { // Copy quaternions to device /** Copies nall()*4 elements **/ inline void add_quat_data() { + time_quat.start(); if (_quat_avail==false) { quat.update_device(_nall*4,true); _quat_avail=true; } + time_quat.stop(); } /// Cast velocities and tags to write buffer diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 7f54432a74..6b4d0ab2a5 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -130,8 +130,16 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, // Time on the device only if 1 proc per gpu _time_device=true; + +#if 0 + // XXX: the following setting triggers a memory leak with OpenCL and MPI + // setting _time_device=true for all processes doesn't seem to be a + // problem with either (no segfault, no (large) memory leak. + // thus keeping this disabled for now. may need to review later. + // 2018-07-23 if (_procs_per_gpu>1) _time_device=false; +#endif // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 04e08c3e9c..3e128bcf57 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -127,10 +127,10 @@ void Neighbor::alloc(bool &success) { dev_packed.clear(); success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev, _packed_permissions)==UCL_SUCCESS); - dev_acc.clear(); - success=success && (dev_acc.alloc(_max_atoms,*dev, + dev_ilist.clear(); + success=success && (dev_ilist.alloc(_max_atoms,*dev, UCL_READ_WRITE)==UCL_SUCCESS); - _c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes(); + _c_bytes+=dev_packed.row_bytes()+dev_ilist.row_bytes(); } if (_max_host>0) { nbor_host.clear(); @@ -197,7 +197,7 @@ void Neighbor::clear() { host_packed.clear(); host_acc.clear(); - dev_acc.clear(); + dev_ilist.clear(); dev_nbor.clear(); nbor_host.clear(); dev_packed.clear(); @@ -281,7 +281,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, } UCL_D_Vec acc_view; acc_view.view_offset(inum,dev_nbor,inum*2); - ucl_copy(acc_view,host_acc,true); + ucl_copy(acc_view,host_acc,inum*2,true); UCL_H_Vec host_view; host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE); @@ -289,7 +289,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, int i=ilist[ii]; host_view[i] = ii; } - ucl_copy(dev_acc,host_view,true); + ucl_copy(dev_ilist,host_view,true); time_nbor.stop(); @@ -364,7 +364,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, } UCL_D_Vec acc_view; acc_view.view_offset(inum,dev_nbor,inum*2); - ucl_copy(acc_view,host_acc,true); + ucl_copy(acc_view,host_acc,inum*2,true); time_nbor.stop(); if (_use_packing==false) { diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 05168834c6..996deaff6d 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -110,7 +110,7 @@ class Neighbor { } if (_time_device) { time_nbor.add_to_total(); - time_kernel.add_to_total(); + if (_use_packing==false) time_kernel.add_to_total(); if (_gpu_nbor==2) { time_hybrid1.add_to_total(); time_hybrid2.add_to_total(); @@ -200,7 +200,7 @@ class Neighbor { /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2) UCL_H_Vec host_acc; /// Device storage for accessing atom indices from the neighbor list (3-body) - UCL_D_Vec dev_acc; + UCL_D_Vec dev_ilist; // ----------------- Data for GPU Neighbor Calculation --------------- diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 24984e4878..46b6382a60 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -243,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -252,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 517de70691..3b6de5a683 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -544,7 +544,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, const int nelements, const __global int * dev_nbor, const __global int * dev_packed, - const __global int * dev_acc, + const __global int * dev_ilist, const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -614,13 +614,13 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, int nbor_k,numk; if (dev_nbor==dev_packed) { if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_acc[j]+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { - nbor_k=dev_acc[j]+nbor_pitch; + nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k]; @@ -698,7 +698,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, const int nelements, const __global int * dev_nbor, const __global int * dev_packed, - const __global int * dev_acc, + const __global int * dev_ilist, const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -768,13 +768,13 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, int nbor_k,numk; if (dev_nbor==dev_packed) { if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_acc[j]+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { - nbor_k=dev_acc[j]+nbor_pitch; + nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k]; diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index a63d286d9c..ef55b98a2d 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -272,7 +272,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, - &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); ainum=this->ans->inum(); nbor_pitch=this->nbor->nbor_pitch(); @@ -311,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -320,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index cec0ccc443..836f05660d 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -696,7 +696,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, - const __global int * dev_acc, + const __global int * dev_ilist, const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -777,13 +777,13 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, int nbor_k,numk; if (dev_nbor==dev_packed) { if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_acc[j]+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { - nbor_k=dev_acc[j]+nbor_pitch; + nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k]; @@ -941,7 +941,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, - const __global int * dev_acc, + const __global int * dev_ilist, const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -1022,13 +1022,13 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, int nbor_k,numk; if (dev_nbor==dev_packed) { if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_acc[j]+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { - nbor_k=dev_acc[j]+nbor_pitch; + nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k]; diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index c37c07f1a1..3cbb488cab 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -272,7 +272,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, - &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); ainum=this->ans->inum(); nbor_pitch=this->nbor->nbor_pitch(); @@ -311,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -320,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index 576359b514..dfb94c4145 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -272,7 +272,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, if (iinbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, - &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); ainum=this->ans->inum(); nbor_pitch=this->nbor->nbor_pitch(); @@ -337,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -346,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index e8bb017f59..73ff51c704 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -278,7 +278,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, if (iik_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { @@ -286,7 +286,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, &this->dev_short_nbor, + &this->nbor->dev_ilist, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index d2e8bb1496..0da46c3b53 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -554,7 +554,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_, const int nelements, const __global int * dev_nbor, const __global int * dev_packed, - const __global int * dev_acc, + const __global int * dev_ilist, const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -623,13 +623,13 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_, int nbor_k,numk; if (dev_nbor==dev_packed) { if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_acc[j]+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { - nbor_k=dev_acc[j]+nbor_pitch; + nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k]; @@ -709,7 +709,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_, const int nelements, const __global int * dev_nbor, const __global int * dev_packed, - const __global int * dev_acc, + const __global int * dev_ilist, const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -778,13 +778,13 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_, int nbor_k,numk; if (dev_nbor==dev_packed) { if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_acc[j]+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { - nbor_k=dev_acc[j]+nbor_pitch; + nbor_k=dev_ilist[j]+nbor_pitch; numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k];