/***************************************************************************
                                   atom.h
                             -------------------
                            W. Michael Brown (ORNL)

  Class for particle data management

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H

#include <math.h>
#include "mpi.h"

#ifdef USE_OPENCL

#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;

#else

#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;

#endif

#ifdef USE_CUDPP
#include "cudpp.h"
#endif

#include "lal_precision.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class Atom {
 public:
  Atom();
  ~Atom() { clear(); }

  /// Maximum number of atoms that can be stored with current allocation
  inline int max_atoms() const { return _max_atoms; }
  /// Current number of local+ghost atoms stored
  inline int nall() const { return _nall; }
  /// Set number of local+ghost atoms for future copy operations
  inline void nall(const int n) { _nall=n; }

  /// Memory usage per atom in this class
  int bytes_per_atom() const;

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool init(const int nall, const bool charge, const bool rot,
            UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false);

  /// Check if we have enough device storage and realloc if not
  /** Returns true if resized with any call during this timestep **/
  inline bool resize(const int nall, bool &success) {
    _nall=nall;
    if (nall>_max_atoms) {
      clear_resize();
      success = success && alloc(nall);
      _resized=true;
    }
    return _resized;
  }

  /// If already initialized by another LAMMPS style, add fields as necessary
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
                  const bool bonds);

  /// Returns true if GPU is using charges
  bool charge() { return _charge; }

  /// Returns true if GPU is using quaternions
  bool quat() { return _rot; }

  /// Only free matrices of length inum or nall for resizing
  void clear_resize();

  /// Free all memory on host and device
  void clear();

  /// Return the total amount of host memory used by class in bytes
  double host_memory_usage() const;

  /// Sort arrays for neighbor list calculation on device
  void sort_neighbor(const int num_atoms);

  /// Add copy times to timers
  inline void acc_timers() {
    time_pos.add_to_total();
    if (_charge) time_q.add_to_total();
    if (_rot) time_quat.add_to_total();
  }

  /// Zero the copy timers
  inline void zero_timers() {
    time_pos.zero();
    if (_charge) time_q.zero();
    if (_rot) time_quat.zero();
  }

  /// Return the total time for host/device data transfer
  /** Zeros the total so that the atom times are only included once **/
  inline double transfer_time() {
    double total=time_pos.total_seconds();
    time_pos.zero_total();
    if (_charge) {
      total+=time_q.total_seconds();
      time_q.zero_total();
    }
    if (_rot) {
      total+=time_quat.total_seconds();
      time_quat.zero_total();
    }
    return total+_time_transfer/1000.0;
  }

  /// Return the total time for data cast/pack
  /** Zeros the time so that atom times are only included once **/
  inline double cast_time() {
    double t=_time_cast;
    _time_cast=0.0;
    return t;
  }

  /// Pack LAMMPS atom type constants into matrix and copy to device
  template <class dev_typ, class t1>
  inline void type_pack1(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii]=static_cast<numtyp>(one[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants into 2 vectors and copy to device
  template <class dev_typ, class t1, class t2>
  inline void type_pack2(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*2]=static_cast<numtyp>(one[i][j]);
        buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
  template <class dev_typ, class t1, class t2, class t3>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
  template <class dev_typ, class t1, class t2, class t3, class t4>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three, t4 **four) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
        buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
  template <class dev_typ, class t1, class t2>
  inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
                         UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
    for (int i=0; i<n; i++) {
      buffer[i*2]=static_cast<numtyp>(one[i][i]);
      buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),n,*dev);
    ucl_copy(dev_v,view,false);
  }

  // -------------------------COPY TO GPU ----------------------------------

  /// Signal that we need to transfer atom data for next timestep
  inline void data_unavail() {
    _x_avail=false;
    _q_avail=false;
    _quat_avail=false;
    _resized=false;
  }

  /// Cast positions and types to write buffer
  inline void cast_x_data(double **host_ptr, const int *host_type) {
    if (_x_avail==false) {
      double t=MPI_Wtime();
      #ifdef GPU_CAST
      memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
      memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
      #else
      numtyp *_write_loc=host_x.begin();
      for (int i=0; i<_nall; i++) {
        *_write_loc=host_ptr[i][0];
        _write_loc++;
        *_write_loc=host_ptr[i][1];
        _write_loc++;
        *_write_loc=host_ptr[i][2];
        _write_loc++;
        *_write_loc=host_type[i];
        _write_loc++;
      }
      #endif
      _time_cast+=MPI_Wtime()-t;
    }
  }

  /// Copy positions and types to device asynchronously
  /** Copies nall() elements **/
  inline void add_x_data(double **host_ptr, int *host_type) {
    time_pos.start();
    if (_x_avail==false) {
      #ifdef GPU_CAST
      ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
      ucl_copy(dev_type_cast,host_type_cast,_nall,true);
      int block_size=64;
      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
      k_cast_x.set_size(GX,block_size);
      k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
                   &_nall);
      #else
      ucl_copy(dev_x,host_x,_nall*4,true);
      #endif
      _x_avail=true;
    }
    time_pos.stop();
  }

  /// Calls cast_x_data and add_x_data and times the routines
  inline void cast_copy_x(double **host_ptr, int *host_type) {
    cast_x_data(host_ptr,host_type);
    add_x_data(host_ptr,host_type);
  }

  // Cast charges to write buffer
  template<class cpytyp>
  inline void cast_q_data(cpytyp *host_ptr) {
    if (_q_avail==false) {
      double t=MPI_Wtime();
      if (dev->device_type()==UCL_CPU) {
        if (sizeof(numtyp)==sizeof(double)) {
          host_q.view((numtyp*)host_ptr,_nall,*dev);
          dev_q.view(host_q);
        } else
          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
      } else {
        if (sizeof(numtyp)==sizeof(double))
          memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
        else
          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
      }
      _time_cast+=MPI_Wtime()-t;
    }
  }

  // Copy charges to device asynchronously
  inline void add_q_data() {
    if (_q_avail==false) {
      ucl_copy(dev_q,host_q,_nall,true);
      _q_avail=true;
    }
  }

  // Cast quaternions to write buffer
  template<class cpytyp>
  inline void cast_quat_data(cpytyp *host_ptr) {
    if (_quat_avail==false) {
      double t=MPI_Wtime();
      if (dev->device_type()==UCL_CPU) {
        if (sizeof(numtyp)==sizeof(double)) {
          host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
          dev_quat.view(host_quat);
        } else
          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
      } else {
        if (sizeof(numtyp)==sizeof(double))
          memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
        else
          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
      }
      _time_cast+=MPI_Wtime()-t;
    }
  }

  // Copy quaternions to device
  /** Copies nall()*4 elements **/
  inline void add_quat_data() {
    if (_quat_avail==false) {
      ucl_copy(dev_quat,host_quat,_nall*4,true);
      _quat_avail=true;
    }
  }

  /// Add in casting time from additional data (seconds)
  inline void add_cast_time(double t) { _time_cast+=t; }

  /// Add in transfer time from additional data (ms)
  inline void add_transfer_time(double t) { _time_transfer+=t; }

  /// Return number of bytes used on device
  inline double max_gpu_bytes() {
    double m=_max_gpu_bytes;
    _max_gpu_bytes=0.0;
    return m;
  }

  // ------------------------------ DATA ----------------------------------

  /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
  UCL_D_Vec<numtyp> dev_x;
  /// Charges
  UCL_D_Vec<numtyp> dev_q;
  /// Quaternions
  UCL_D_Vec<numtyp> dev_quat;

  #ifdef GPU_CAST
  UCL_D_Vec<double> dev_x_cast;
  UCL_D_Vec<int> dev_type_cast;
  UCL_H_Vec<double> host_x_cast;
  UCL_H_Vec<int> host_type_cast;
  #endif

  /// Buffer for moving positions to device
  UCL_H_Vec<numtyp> host_x;
  /// Buffer for moving charge data to GPU
  UCL_H_Vec<numtyp> host_q;
  /// Buffer for moving quat data to GPU
  UCL_H_Vec<numtyp> host_quat;

  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<unsigned> dev_cell_id;
  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<int> dev_particle_id;
  /// Atom tag information for device nbor builds
  UCL_D_Vec<int> dev_tag;

  /// Cell list identifiers for hybrid nbor builds
  UCL_H_Vec<int> host_cell_id;
  /// Cell list identifiers for hybrid nbor builds
  UCL_H_Vec<int> host_particle_id;

  /// Device timers
  UCL_Timer time_pos, time_q, time_quat;

  /// Geryon device
  UCL_Device *dev;

 private:
  #ifdef GPU_CAST
  UCL_Program *atom_program;
  UCL_Kernel k_cast_x;
  void compile_kernels(UCL_Device &dev);
  #endif

  bool _compiled;

  // True if data has been copied to device already
  bool _x_avail, _q_avail, _quat_avail, _resized;

  bool alloc(const int nall);

  bool _allocated, _rot, _charge, _other;
  int _max_atoms, _nall, _gpu_nbor;
  bool _bonds;

  double _time_cast, _time_transfer;

  double _max_gpu_bytes;

  #ifdef USE_CUDPP
  CUDPPConfiguration sort_config;
  CUDPPHandle sort_plan;
  #endif
};

}

#endif
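
/* -------------------------------------------------------------------------
   Illustrative usage sketch (kept as a comment so it is not compiled).  It
   only exercises the public interface declared above; the variable names
   device, nall, lmp_x, lmp_type, and lmp_q are hypothetical placeholders for
   data normally supplied by LAMMPS, and PRECISION/ACC_PRECISION are the
   precision typedefs from lal_precision.h.

     LAMMPS_AL::Atom<PRECISION,ACC_PRECISION> atom;
     atom.init(nall,true,false,device,0,false);  // charges, no quaternions

     // Each timestep, after the host updates positions:
     atom.data_unavail();                 // mark host-side data as changed
     bool success=true;
     atom.resize(nall,success);           // grow device storage if needed
     atom.cast_copy_x(lmp_x,lmp_type);    // pack and async-copy x and type
     atom.cast_q_data(lmp_q);             // pack charges into write buffer
     atom.add_q_data();                   // async copy of charges to device
     // ... launch force kernels ...
     atom.acc_timers();                   // accumulate data transfer timers
   ------------------------------------------------------------------------- */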