diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index cc74d2ebd6..3c859ffdd4 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -30,8 +30,9 @@ UCL_H  = $(wildcard ./geryon/ucl*.h)
 NVC_H  = $(wildcard ./geryon/nvc*.h) $(UCL_H)
 NVD_H  = $(wildcard ./geryon/nvd*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor_shared.h pair_gpu_nbor.h \
-          pair_gpu_precision.h pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h
 
 ALL_H = $(NVD_H) $(PAIR_H)
 
@@ -39,8 +40,9 @@ EXECS = $(BIN_DIR)/nvc_get_devices
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
         $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
         $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_device.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
@@ -95,6 +97,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
 
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 829add7350..e488a56bc0 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -23,14 +23,16 @@ OCL_LIB = $(LIB_DIR)/libgpu.a
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 OCL_H  = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor_shared.h pair_gpu_nbor.h \
-          pair_gpu_precision.h pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h
 
 ALL_H = $(OCL_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/ocl_get_devices
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
-       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_device.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
@@ -46,7 +48,7 @@ KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
        $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
        $(OBJ_DIR)/crml_gpu_cl.h \
        $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
- 
+
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
 
 all: $(OCL_LIB) $(EXECS)
 
diff --git a/lib/gpu/pair_gpu_ans.cpp b/lib/gpu/pair_gpu_ans.cpp
new file mode 100644
index 0000000000..e6982e6eba
--- /dev/null
+++ b/lib/gpu/pair_gpu_ans.cpp
@@ -0,0 +1,409 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#include "pair_gpu_ans.h"
+
+#define PairGPUAnsT PairGPUAns<numtyp,acctyp>
+
+template <class numtyp, class acctyp>
+PairGPUAnsT::PairGPUAns() : _allocated(false),_eflag(false),_vflag(false),
+                            _inum(0),_ilist(NULL),_newton(false) {
+}
+
+template <class numtyp, class acctyp>
+int PairGPUAnsT::bytes_per_atom() const {
+  int bytes=11*sizeof(acctyp);
+  if (_rot)
+    bytes+=4*sizeof(acctyp);
+  if (_charge)
+    bytes+=sizeof(acctyp);
+  return bytes;
+}
+
+template <class numtyp, class acctyp>
+bool PairGPUAnsT::alloc(const int inum) {
+  _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
+
+  bool success=true;
+
+  int ans_elements=4;
+  if (_rot)
+    ans_elements+=4;
+
+  // Ignore host/device transfers?
+  bool cpuview=false;
+  if (dev->device_type()==UCL_CPU)
+    cpuview=true;
+
+  // -------------------------- Host allocations
+  success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
+  success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
+
+  // --------------------------- Device allocations
+  if (cpuview) {
+    dev_engv.view(host_engv);
+    dev_ans.view(host_ans);
+  } else {
+    success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
+                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
+    success=success && (dev_ans.alloc(ans_elements*_max_local,
+                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
+  }
+  _gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
+
+  _allocated=true;
+  return success;
+}
+
+template <class numtyp, class acctyp>
+bool PairGPUAnsT::init(const int inum, const bool charge, const bool rot,
+                       UCL_Device &devi) {
+  clear();
+
+  bool success=true;
+  _charge=charge;
+  _rot=rot;
+  _other=_charge || _rot;
+  dev=&devi;
+
+  _e_fields=1;
+  if (_charge)
+    _e_fields++;
+  _ev_fields=6+_e_fields;
+
+  // Initialize atom and nbor data
+  int ef_inum=inum;
+  if (ef_inum==0)
+    ef_inum=1000;
+
+  // Initialize timers for the selected device
+  time_answer.init(*dev);
+  time_answer.zero();
+  _time_cast=0.0;
+
+  return success && alloc(ef_inum);
+}
+
+template <class numtyp, class acctyp>
+bool PairGPUAnsT::add_fields(const bool charge, const bool rot) {
+  bool realloc=false;
+  if (charge && _charge==false) {
+    _charge=true;
+    _e_fields++;
+    _ev_fields++;
+    realloc=true;
+  }
+  if (rot && _rot==false) {
+    _rot=true;
+    realloc=true;
+  }
+  if (realloc) {
+    _other=_charge || _rot;
+    int inum=_max_local;
+    clear_resize();
+    return alloc(inum);
+  }
+  return true;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::clear_resize() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  dev_ans.clear();
+  dev_engv.clear();
+  host_ans.clear();
+  host_engv.clear();
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::clear() {
+  _gpu_bytes=0;
+  if (!_allocated)
+    return;
+
+  time_answer.clear();
+  clear_resize();
+  _inum=0;
+  _eflag=false;
+  _vflag=false;
+}
+
+template <class numtyp, class acctyp>
+double PairGPUAnsT::host_memory_usage() const {
+  int atom_bytes=4;
+  if (_charge)
+    atom_bytes+=1;
+  if (_rot)
+    atom_bytes+=4;
+  int ans_bytes=atom_bytes+_ev_fields;
+  return ans_bytes*(_max_local)*sizeof(acctyp)+
+         sizeof(PairGPUAns<numtyp,acctyp>);
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom) {
+  time_answer.start();
+  _eflag=eflag;
+  _vflag=vflag;
+  _ef_atom=ef_atom;
+  _vf_atom=vf_atom;
+
+  int csize=_ev_fields;
+  if (!eflag)
+    csize-=_e_fields;
+  if (!vflag)
+    csize-=6;
+
+  if (csize>0)
+    ucl_copy(host_engv,dev_engv,_inum*csize,true);
+  if (_rot)
+    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
+  else
+    ucl_copy(host_ans,dev_ans,_inum*4,true);
+  time_answer.stop();
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom,
+                               int *ilist) {
+  _ilist=ilist;
+  copy_answers(eflag,vflag,ef_atom,vf_atom);
+}
+
+template <class numtyp, class acctyp>
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
+                                  double *virial) {
+  if (_eflag==false && _vflag==false)
+    return 0.0;
+
+  double evdwl=0.0;
+  if (_gpu_nbor) {
+    for (int i=0; i<_inum; i++) {
+      acctyp *ap=host_engv.begin()+i;
+      if (_eflag) {
+        if (_ef_atom) {
+          evdwl+=*ap;
+          eatom[i]+=*ap*0.5;
+          ap+=_inum;
+        } else {
+          evdwl+=*ap;
+          ap+=_inum;
+        }
+      }
+      if (_vflag) {
+        if (_vf_atom) {
+          for (int j=0; j<6; j++) {
+            vatom[i][j]+=*ap*0.5;
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        } else {
+          for (int j=0; j<6; j++) {
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        }
+      }
+    }
+    for (int j=0; j<6; j++)
+      virial[j]*=0.5;
+  } else {
+    for (int i=0; i<_inum; i++) {
+      acctyp *ap=host_engv.begin()+i;
+      int ii=_ilist[i];
+      if (_eflag) {
+        if (_ef_atom) {
+          evdwl+=*ap;
+          eatom[ii]+=*ap*0.5;
+          ap+=_inum;
+        } else {
+          evdwl+=*ap;
+          ap+=_inum;
+        }
+      }
+      if (_vflag) {
+        if (_vf_atom) {
+          for (int j=0; j<6; j++) {
+            vatom[ii][j]+=*ap*0.5;
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        } else {
+          for (int j=0; j<6; j++) {
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        }
+      }
+    }
+    for (int j=0; j<6; j++)
+      virial[j]*=0.5;
+  }
+
+  evdwl*=0.5;
+  return evdwl;
+}
+
+template <class numtyp, class acctyp>
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
+                                  double *virial, double &ecoul) {
+  if (_eflag==false && _vflag==false) {
+    ecoul=0.0;
+    return 0.0;
+  }
+
+  if (_charge==false)
+    return energy_virial(eatom,vatom,virial);
+
+  double evdwl=0.0;
+  double _ecoul=0.0;
+  if (_gpu_nbor) {
+    for (int i=0; i<_inum; i++) {
+      acctyp *ap=host_engv.begin()+i;
+      if (_eflag) {
+        if (_ef_atom) {
+          evdwl+=*ap;
+          eatom[i]+=*ap*0.5;
+          ap+=_inum;
+          _ecoul+=*ap;
+          eatom[i]+=*ap*0.5;
+          ap+=_inum;
+        } else {
+          evdwl+=*ap;
+          ap+=_inum;
+          _ecoul+=*ap;
+          ap+=_inum;
+        }
+      }
+      if (_vflag) {
+        if (_vf_atom) {
+          for (int j=0; j<6; j++) {
+            vatom[i][j]+=*ap*0.5;
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        } else {
+          for (int j=0; j<6; j++) {
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        }
+      }
+    }
+    for (int j=0; j<6; j++)
+      virial[j]*=0.5;
+  } else {
+    for (int i=0; i<_inum; i++) {
+      acctyp *ap=host_engv.begin()+i;
+      int ii=_ilist[i];
+      if (_eflag) {
+        if (_ef_atom) {
+          evdwl+=*ap;
+          eatom[ii]+=*ap*0.5;
+          ap+=_inum;
+          _ecoul+=*ap;
+          eatom[ii]+=*ap*0.5;
+          ap+=_inum;
+        } else {
+          evdwl+=*ap;
+          ap+=_inum;
+          _ecoul+=*ap;
+          ap+=_inum;
+        }
+      }
+      if (_vflag) {
+        if (_vf_atom) {
+          for (int j=0; j<6; j++) {
+            vatom[ii][j]+=*ap*0.5;
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        } else {
+          for (int j=0; j<6; j++) {
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        }
+      }
+    }
+    for (int j=0; j<6; j++)
+      virial[j]*=0.5;
+  }
+
+  evdwl*=0.5;
+  ecoul+=_ecoul*0.5;
+  return evdwl;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::get_answers(double **f, double **tor) {
+  acctyp *ap=host_ans.begin();
+  if (_gpu_nbor) {
+    for (int i=0; i<_inum; i++) {
+      f[i][0]+=*ap;
+      ap++;
+      f[i][1]+=*ap;
+      ap++;
+      f[i][2]+=*ap;
+      ap+=2;
+    }
+    if (_rot) {
+      for (int i=0; i<_inum; i++) {
+        tor[i][0]+=*ap;
+        ap++;
+        tor[i][1]+=*ap;
+        ap++;
+        tor[i][2]+=*ap;
+        ap+=2;
+      }
+    }
+  } else {
+    for (int i=0; i<_inum; i++) {
+      int ii=_ilist[i];
+      f[ii][0]+=*ap;
+      ap++;
+      f[ii][1]+=*ap;
+      ap++;
+      f[ii][2]+=*ap;
+      ap+=2;
+    }
+    if (_rot) {
+      for (int i=0; i<_inum; i++) {
+        int ii=_ilist[i];
+        tor[ii][0]+=*ap;
+        ap++;
+        tor[ii][1]+=*ap;
+        ap++;
+        tor[ii][2]+=*ap;
+        ap+=2;
+      }
+    }
+  }
+}
+
+template class PairGPUAns<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/pair_gpu_ans.h b/lib/gpu/pair_gpu_ans.h
new file mode 100644
index 0000000000..a93ed6fcd5
--- /dev/null
+++ b/lib/gpu/pair_gpu_ans.h
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef PAIR_GPU_ANS_H
+#define PAIR_GPU_ANS_H
+
+#include <math.h>
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+
+#include "geryon/ocl_timer.h"
+#include "geryon/ocl_mat.h"
+using namespace ucl_opencl;
+
+#else
+
+#include "cudpp.h"
+#include "geryon/nvd_timer.h"
+#include "geryon/nvd_mat.h"
+using namespace ucl_cudadr;
+
+#endif
+
+#include "pair_gpu_precision.h"
+
+template <class numtyp, class acctyp>
+class PairGPUAns {
+ public:
+  PairGPUAns();
+  ~PairGPUAns() { clear(); }
+
+  /// Current number of local atoms stored
+  inline int inum() const { return _inum; }
+
+  /// Set number of local atoms for future copy operations
+  inline void inum(const int n) { _inum=n; }
+
+  /// Memory usage per atom in this class
+  int bytes_per_atom() const;
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param charge True if atom storage needs charges
+    * \param rot True if atom storage needs quaternions **/
+  bool init(const int inum, const bool charge, const bool rot,
+            UCL_Device &dev);
+
+  /// Check if we have enough device storage and realloc if not
+  inline bool resize(const int inum, bool &success) {
+    _inum=inum;
+    if (inum>_max_local) {
+      clear_resize();
+      success = success && alloc(inum);
+      return true;
+    }
+    return false;
+  }
+
+  /// If already initialized by another LAMMPS style, add fields as necessary
+  /** \param charge True if atom storage needs charges
+    * \param rot True if atom storage needs quaternions **/
+  bool add_fields(const bool charge, const bool rot);
+
+  /// Free all host and device memory that must be reallocated for more atoms
+  void clear_resize();
+
+  /// Free all memory on host and device
+  void clear();
+
+  /// Return the total amount of host memory used by class in bytes
+  double host_memory_usage() const;
+
+  /// Add copy times to timers
+  inline void acc_timers() {
+    time_answer.add_to_total();
+  }
+
+  /// Zero timers
+  inline void zero_timers() {
+    time_answer.zero();
+  }
+
+  /// Return the total time for host/device data transfer
+  inline double transfer_time() {
+    return time_answer.total_seconds();
+  }
+
+  /// Return the total time for data cast/pack
+  inline double cast_time() { return _time_cast; }
+
+  /// Return number of bytes used on device
+  inline double gpu_bytes() { return _gpu_bytes; }
+
+  // -------------------------COPY FROM GPU  -------------------------------
+
+  /// Copy answers from device into read buffer asynchronously
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom);
+
+  /// Copy answers from device into read buffer asynchronously
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom, int *ilist);
+
+  /// Copy energy and virial data into LAMMPS memory
+  double energy_virial(double *eatom, double **vatom, double *virial);
+
+  /// Copy energy and virial data into LAMMPS memory
+  double energy_virial(double *eatom, double **vatom, double *virial,
+                       double &ecoul);
+
+  /// Add forces and torques from the GPU into a LAMMPS pointer
+  void get_answers(double **f, double **tor);
+
+  // ------------------------------ DATA  ----------------------------------
+
+  /// Force and possibly torque
+  UCL_D_Vec<acctyp> dev_ans;
+  /// Energy and virial per-atom storage
+  UCL_D_Vec<acctyp> dev_engv;
+
+  /// Force and possibly torque data on host
+  UCL_H_Vec<acctyp> host_ans;
+  /// Energy/virial data on host
+  UCL_H_Vec<acctyp> host_engv;
+
+  /// Device timers
+  UCL_Timer time_answer;
+
+  /// Geryon device
+  UCL_Device *dev;
+
+ private:
+  bool alloc(const int inum);
+
+  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
+  bool _gpu_nbor;
+  int _max_local, _inum, _e_fields, _ev_fields;
+  int *_ilist;
+  double _time_cast;
+
+  double _gpu_bytes;
+
+  bool _newton;
+};
+
+#endif
+
diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp
index 46c9066e56..812a7c82d6 100644
--- a/lib/gpu/pair_gpu_atom.cpp
+++ b/lib/gpu/pair_gpu_atom.cpp
@@ -29,9 +29,7 @@ __win_sort _win_sort;
 #endif
 
 template <class numtyp, class acctyp>
-PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
-                              _vflag(false),_inum(0),_ilist(NULL),
-                              _newton(false) {
+PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false) {
 #ifndef USE_OPENCL
   sort_config.op = CUDPP_ADD;
   sort_config.datatype = CUDPP_UINT;
@@ -56,28 +54,20 @@ int PairGPUAtomT::bytes_per_atom() const {
   int id_space=0;
   if (_gpu_nbor)
     id_space=2;
-  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
+  int bytes=4*sizeof(numtyp)+id_space;
   if (_rot)
-    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
+    bytes+=4*sizeof(numtyp);
   if (_charge)
     bytes+=sizeof(numtyp);
   return bytes;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUAtomT::alloc(const int inum, const int nall) {
+bool PairGPUAtomT::alloc(const int nall) {
   _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
-  if (_newton)
-    _max_local=_max_atoms;
-  else
-    _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
 
   bool success=true;
 
-  int ans_elements=4;
-  if (_rot)
-    ans_elements+=4;
-
   // Ignore host/device transfers?
   bool cpuview=false;
   if (dev->device_type()==UCL_CPU)
@@ -107,8 +97,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
     success=success && (host_x.alloc(_max_atoms*4,*dev,
                                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   #endif
-  success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
-  success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_charge)
     success=success && (host_q.alloc(_max_atoms,*dev,
                                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
@@ -127,8 +115,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
     #else
     dev_x.view(host_x);
     #endif
-    dev_engv.view(host_engv);
-    dev_ans.view(host_ans);
     if (_rot)
      dev_quat.view(host_quat);
     if (_charge)
@@ -145,10 +131,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
     success=success && (UCL_SUCCESS==
                         dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
     #endif
-    success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
-    success=success && (dev_ans.alloc(ans_elements*_max_local,
-                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
     if (_charge) {
       success=success && (dev_q.alloc(_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
@@ -170,15 +152,15 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
     }
   }
 
-  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
+  _gpu_bytes+=dev_x.row_bytes();
 
   _allocated=true;
   return success;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
-                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
+bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
+                        UCL_Device &devi, const bool gpu_nbor,
                         const bool bonds) {
   clear();
 
@@ -193,33 +175,23 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
   _other=_charge || _rot;
   dev=&devi;
 
-  _e_fields=1;
-  if (_charge)
-    _e_fields++;
-  _ev_fields=6+_e_fields;
-
   // Initialize atom and nbor data
-  int ef_inum=inum;
-  if (ef_inum==0)
-    ef_inum=1000;
   int ef_nall=nall;
-  if (ef_nall<=ef_inum)
-    ef_nall=ef_inum*2;
+  if (ef_nall==0)
+    ef_nall=2000;
 
   // Initialize timers for the selected device
   time_pos.init(*dev);
   time_other.init(*dev);
-  time_answer.init(*dev);
   time_pos.zero();
   time_other.zero();
-  time_answer.zero();
   _time_cast=0.0;
 
 #ifdef GPU_CAST
   compile_kernels(*dev);
 #endif
 
-  return success && alloc(ef_inum,ef_nall);
+  return success && alloc(ef_nall);
 }
 
 template <class numtyp, class acctyp>
@@ -227,7 +199,6 @@ bool PairGPUAtomT::add_fields(const bool charge, const bool rot) {
   bool realloc=false;
   if (charge && _charge==false) {
     _charge=true;
-    _e_fields++;
     realloc=true;
   }
   if (rot && _rot==false) {
@@ -236,10 +207,9 @@ bool PairGPUAtomT::add_fields(const bool charge, const bool rot) {
   }
   if (realloc) {
     _other=_charge || _rot;
-    int inum=_max_local;
-    int nall=_max_atoms;
+    int max_atoms=_max_atoms;
     clear_resize();
-    return alloc(inum,nall);
+    return alloc(max_atoms);
   }
   return true;
 }
@@ -259,16 +229,12 @@ void PairGPUAtomT::clear_resize() {
     dev_quat.clear();
     host_quat.clear();
   }
-  dev_ans.clear();
-  dev_engv.clear();
 #ifndef GPU_CAST
   host_x.clear();
 #else
   host_x_cast.clear();
   host_type_cast.clear();
 #endif
-  host_ans.clear();
-  host_engv.clear();
 
   dev_cell_id.clear();
   dev_particle_id.clear();
   dev_tag.clear();
@@ -292,11 +258,7 @@ void PairGPUAtomT::clear() {
 
   time_pos.clear();
   time_other.clear();
-  time_answer.clear();
   clear_resize();
-  _inum=0;
-  _eflag=false;
-  _vflag=false;
 
 #ifdef GPU_CAST
   if (_compiled) {
@@ -314,258 +276,10 @@ double PairGPUAtomT::host_memory_usage() const {
     atom_bytes+=1;
   if (_rot)
     atom_bytes+=4;
-  int ans_bytes=atom_bytes+_ev_fields;
   return _max_atoms*atom_bytes*sizeof(numtyp)+
-         ans_bytes*(_max_local)*sizeof(acctyp)+
          sizeof(PairGPUAtom<numtyp,acctyp>);
 }
 
-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom) {
-  time_answer.start();
-  _eflag=eflag;
-  _vflag=vflag;
-  _ef_atom=ef_atom;
-  _vf_atom=vf_atom;
-
-  int csize=_ev_fields;
-  if (!eflag)
-    csize-=_e_fields;
-  if (!vflag)
-    csize-=6;
-
-  if (csize>0)
-    ucl_copy(host_engv,dev_engv,_inum*csize,true);
-  if (_rot)
-    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
-  else
-    ucl_copy(host_ans,dev_ans,_inum*4,true);
-  time_answer.stop();
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom,
-                                int *ilist) {
-  _ilist=ilist;
-  copy_answers(eflag,vflag,ef_atom,vf_atom);
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial) {
-  if (_eflag==false && _vflag==false)
-    return 0.0;
-
-  double evdwl=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial, double &ecoul) {
-  if (_eflag==false && _vflag==false) {
-    ecoul=0.0;
-    return 0.0;
-  }
-
-  if (_charge==false)
-    return energy_virial(eatom,vatom,virial);
-
-  double evdwl=0.0;
-  double _ecoul=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  ecoul+=_ecoul*0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::get_answers(double **f, double **tor) {
-  _x_avail=false;
-  _q_avail=false;
-  _quat_avail=false;
-  acctyp *ap=host_ans.begin();
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      f[i][0]+=*ap;
-      ap++;
-      f[i][1]+=*ap;
-      ap++;
-      f[i][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        tor[i][0]+=*ap;
-        ap++;
-        tor[i][1]+=*ap;
-        ap++;
-        tor[i][2]+=*ap;
-        ap+=2;
-      }
-    }
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int ii=_ilist[i];
-      f[ii][0]+=*ap;
-      ap++;
-      f[ii][1]+=*ap;
-      ap++;
-      f[ii][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        int ii=_ilist[i];
-        tor[ii][0]+=*ap;
-        ap++;
-        tor[ii][1]+=*ap;
-        ap++;
-        tor[ii][2]+=*ap;
-        ap+=2;
-      }
-    }
-  }
-}
-
 // Sort arrays for neighbor list calculation
 template <class numtyp, class acctyp>
 void PairGPUAtomT::sort_neighbor(const int num_atoms) {
diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h
index 562ca0846d..c4c6b1586f 100644
--- a/lib/gpu/pair_gpu_atom.h
+++ b/lib/gpu/pair_gpu_atom.h
@@ -23,7 +23,6 @@
 
 #ifdef USE_OPENCL
 
-#include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
 #include "geryon/ocl_kernel.h"
@@ -32,7 +31,6 @@ using namespace ucl_opencl;
 #else
 
 #include "cudpp.h"
-#include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 #include "geryon/nvd_kernel.h"
@@ -40,10 +38,6 @@ using namespace ucl_cudadr;
 
 #endif
 
-#ifndef int2
-struct int2 { int x; int y; };
-#endif
-
 #include "pair_gpu_precision.h"
 
 template <class numtyp, class acctyp>
@@ -56,13 +50,9 @@ class PairGPUAtom {
   inline int max_atoms() const { return _max_atoms; }
   /// Current number of local+ghost atoms stored
   inline int nall() const { return _nall; }
-  /// Current number of local atoms stored
-  inline int inum() const { return _inum; }
 
   /// Set number of local+ghost atoms for future copy operations
   inline void nall(const int n) { _nall=n; }
-  /// Set number of local atoms for future copy operations
-  inline void inum(const int n) { _inum=n; }
 
   /// Memory usage per atom in this class
   int bytes_per_atom() const;
@@ -70,16 +60,15 @@ class PairGPUAtom {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param rot True if atom storage needs quaternions
     * \param gpu_nbor True if neighboring will be performed on device **/
-  bool init(const int inum, const int nall, const bool charge, const bool rot,
+  bool init(const int nall, const bool charge, const bool rot,
             UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
 
   /// Check if we have enough device storage and realloc if not
-  inline bool resize(const int inum, const int nall, bool &success) {
-    _inum=inum;
+  inline bool resize(const int nall, bool &success) {
     _nall=nall;
-    if (inum>_max_local || nall>_max_atoms) {
+    if (nall>_max_atoms) {
       clear_resize();
-      success = success && alloc(inum,nall);
+      success = success && alloc(nall);
       return true;
     }
     return false;
@@ -90,9 +79,6 @@ class PairGPUAtom {
     * \param gpu_nbor True if neighboring will be performed on device **/
   bool add_fields(const bool charge, const bool rot);
 
-  /// True if charge data is available for kernels
-  bool charge_avail() const { return _charge; }
-
   /// Only free matrices of length inum or nall for resizing
   void clear_resize();
 
@@ -108,7 +94,6 @@ class PairGPUAtom {
   /// Add copy times to timers
   inline void acc_timers() {
     time_pos.add_to_total();
-    time_answer.add_to_total();
     if (_other)
       time_other.add_to_total();
   }
@@ -116,14 +101,13 @@ class PairGPUAtom {
   /// Add copy times to timers
   inline void zero_timers() {
     time_pos.zero();
-    time_answer.zero();
     if (_other)
       time_other.zero();
   }
 
   /// Return the total time for host/device data transfer
   inline double transfer_time() {
-    double total=time_pos.total_seconds()+time_answer.total_seconds();
+    double total=time_pos.total_seconds();
     if (_other) total+=time_other.total_seconds();
     return total;
   }
@@ -224,6 +208,10 @@
 
   // -------------------------COPY TO GPU  ----------------------------------
 
+  /// Signal that we need to transfer atom data for next timestep
+  inline void data_unavail()
+    { _x_avail=false; _q_avail=false; _quat_avail=false; }
+
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
     if (_x_avail==false) {
@@ -349,26 +337,6 @@ class PairGPUAtom {
   /// Return number of bytes used on device
   inline double gpu_bytes() { return _gpu_bytes; }
 
-  // -------------------------COPY FROM GPU  -------------------------------
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom);
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom, int *ilist);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial,
-                       double &ecoul);
-
-  /// Add forces and torques from the GPU into a LAMMPS pointer
-  void get_answers(double **f, double **tor);
-
   // ------------------------------ DATA  ----------------------------------
 
   /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type
@@ -377,10 +345,6 @@
   UCL_D_Vec<numtyp> dev_q;
   /// Quaternions
   UCL_D_Vec<numtyp> dev_quat;
-  /// Force and possibly torque
-  UCL_D_Vec<acctyp> dev_ans;
-  /// Energy and virial per-atom storage
-  UCL_D_Vec<acctyp> dev_engv;
 
 #ifdef GPU_CAST
   UCL_D_Vec dev_x_cast;
@@ -395,10 +359,6 @@
   UCL_H_Vec<numtyp> host_q;
   /// Buffer for moving quat data to GPU
   UCL_H_Vec<numtyp> host_quat;
-  /// Force and possibly torque data on host
-  UCL_H_Vec<acctyp> host_ans;
-  /// Energy/virial data on host
-  UCL_H_Vec<acctyp> host_engv;
 
   /// Cell list identifiers for device nbor builds
   UCL_D_Vec dev_cell_id;
@@ -408,7 +368,7 @@
   UCL_D_Vec dev_tag;
 
   /// Device timers
-  UCL_Timer time_pos, time_other, time_answer;
+  UCL_Timer time_pos, time_other;
 
   /// Geryon device
   UCL_Device *dev;
@@ -423,20 +383,17 @@
   bool _compiled;
 
   // True if data has been copied to device already
-  int _x_avail, _q_avail, _quat_avail;
+  bool _x_avail, _q_avail, _quat_avail;
 
-  bool alloc(const int inum, const int nall);
+  bool alloc(const int nall);
 
-  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
-  int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
+  bool _allocated, _rot, _charge, _other;
+  int _max_atoms, _nall;
   bool _gpu_nbor, _bonds;
-  int *_ilist;
   double _time_cast;
 
   double _gpu_bytes;
 
-  bool _newton;
-
 #ifndef USE_OPENCL
   CUDPPConfiguration sort_config;
   CUDPPHandle sort_plan;
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 30206d72e6..718e9d9ddb 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -281,6 +281,7 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom,
     pair_gpu_device.gpu->sync();
     double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
     pair_gpu_device.atom.get_answers(f,tor);
+    pair_gpu_device.atom.data_unavail();
     return evdw;
   }
 
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 8f24e0231b..1ef85c78ad 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -19,11 +19,13 @@
 #define PAIR_GPU_DEVICE_H
 
 #include "pair_gpu_atom.h"
+#include "pair_gpu_ans.h"
 #include "pair_gpu_nbor.h"
 #include "mpi.h"
 #include <sstream>
 #include "stdio.h"
 #include <string>
+#include <queue>
 
 template <class numtyp, class acctyp>
 class PairGPUDevice {
@@ -71,6 +73,9 @@ class PairGPUDevice {
   /// Clear all memory on host and device
   void clear_device();
 
+  /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
+  inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
+    { ans_queue.push(ans); }
+
   /// Start timer on host
   inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
 
@@ -134,6 +139,7 @@ class PairGPUDevice {
   PairGPUNborShared _nbor_shared;
 
  private:
+  std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
   int _init_count;
   bool _device_init;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
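Note on usage: the split moves the per-atom force/torque and energy/virial
buffers out of PairGPUAtom (whose storage scales with nall, the local+ghost
count) into the new PairGPUAns class (whose storage scales with inum, the
local count), so the two can be resized independently and one atom object can
feed several answer objects. Below is a minimal sketch of the per-timestep
flow a pair style would follow with the split classes; the compute_sketch()
wrapper and its argument list are illustrative only and not part of this
patch -- only the atom.*/ans.* calls it makes are the ones added or kept
above.

// Sketch only: drives the split classes through one force computation,
// assuming atom and ans were set up earlier with atom.init()/ans.init().
template <class numtyp, class acctyp>
double compute_sketch(PairGPUAtom<numtyp,acctyp> &atom,
                      PairGPUAns<numtyp,acctyp> &ans,
                      const int inum, const int nall, int *ilist,
                      double **host_x, int *host_type, double **f,
                      double *eatom, double **vatom, double *virial,
                      const bool eflag, const bool vflag) {
  bool success=true;
  atom.resize(nall,success);          // nall-sized buffers: positions, types
  ans.resize(inum,success);           // inum-sized buffers: forces, engv
  if (!success) return 0.0;

  atom.cast_x_data(host_x,host_type); // pack positions into the write buffer
  // ... transfer positions to the device and launch the pair kernel here ...

  ans.copy_answers(eflag,vflag,false,false,ilist); // async device->host copy
  // ... sync the device queue here, as lmp_gpu_forces() does above ...
  double evdwl=ans.energy_virial(eatom,vatom,virial);
  ans.get_answers(f,NULL);            // NULL torques: no quaternion storage
  atom.data_unavail();                // force a fresh position copy next step
  return evdwl;
}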
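The energy_virial() implementations depend on host_engv being laid out
field-major: each of the _ev_fields per-atom fields occupies a contiguous
block of _inum values, which is why the pointer jumps by _inum between
successive fields of the same atom (the ap+=_inum steps). A toy reader for
that layout, with a plain double array standing in for the acctyp host
vector and the van der Waals and Coulomb energies lumped together for
brevity:

// Field-major answer buffer: engv[field*inum + i] holds field "field" of
// atom i.  Mirrors the stride-inum walk in PairGPUAns::energy_virial().
double accumulate_sketch(const double *engv, const int inum,
                         const bool has_charge, double *virial /* [6] */) {
  const int e_fields = has_charge ? 2 : 1;
  double evdwl=0.0;
  for (int i=0; i<inum; i++) {
    const double *ap=engv+i;          // start of atom i's column
    for (int e=0; e<e_fields; e++) {  // energy field(s)
      evdwl+=*ap;
      ap+=inum;
    }
    for (int j=0; j<6; j++) {         // six virial fields
      virial[j]+=*ap;
      ap+=inum;
    }
  }
  for (int j=0; j<6; j++)
    virial[j]*=0.5;  // full neighbor lists on the GPU count each pair twice
  return evdwl*0.5;  // same double counting for the energy
}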
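The ans_queue added to PairGPUDevice is a FIFO of answer objects whose
asynchronous copies are in flight; this patch only adds the enqueue side
(add_ans_object), so the following drain loop is a guess at the intended
consumption pattern, not code from this patch:

// Hypothetical drain loop (assumes pair_gpu_ans.h and <queue> are included):
// pop each queued answer object and accumulate its copy timer once its
// asynchronous device->host transfer has completed.
template <class numtyp, class acctyp>
void drain_sketch(std::queue<PairGPUAns<numtyp,acctyp> *> &ans_queue) {
  while (!ans_queue.empty()) {
    PairGPUAns<numtyp,acctyp> *ans=ans_queue.front();
    ans_queue.pop();
    ans->acc_timers();   // add_to_total() on the answer-copy timer
  }
}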