diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index cda4d383b5..618ffb0106 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const { bytes+=sizeof(numtyp); if (_vel) bytes+=4*sizeof(numtyp); + if (_extra_fields>0) + bytes+=_extra_fields*sizeof(numtyp); return bytes; } @@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } + if (_extra_fields>0 && _host_view==false) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } if (_gpu_nbor>0) { if (_bonds) { @@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) { template bool AtomT::add_fields(const bool charge, const bool rot, - const int gpu_nbor, const bool bonds, const bool vel) { + const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { bool success=true; // Ignore host/device transfers? int gpu_bytes=0; @@ -191,6 +199,16 @@ bool AtomT::add_fields(const bool charge, const bool rot, } } + if (extra_fields > 0 && _extra_fields==0) { + _extra_fields=extra_fields; + _other=true; + if (_host_view==false) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } + } + if (bonds && _bonds==false) { _bonds=true; if (_bonds && _gpu_nbor>0) { @@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot, template bool AtomT::init(const int nall, const bool charge, const bool rot, - UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) { + UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { clear(); bool success=true; @@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, _q_avail=false; _quat_avail=false; _v_avail=false; + _extra_avail=false; _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; _rot=rot; _vel=vel; - _other=_charge || _rot || _vel; + _extra_fields=extra_fields; + _other=_charge || _rot || _vel || (extra_fields>0); dev=&devi; _time_transfer=0; @@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_q.init(*dev); time_quat.init(*dev); time_vel.init(*dev); + time_extra.init(*dev); + time_pos.zero(); time_q.zero(); time_quat.zero(); time_vel.zero(); + time_extra.zero(); + _time_cast=0.0; #ifdef GPU_CAST @@ -308,6 +333,8 @@ void AtomT::clear_resize() { quat.clear(); if (_vel) v.clear(); + if (_extra_fields>0) + extra.clear(); dev_cell_id.clear(); dev_particle_id.clear(); @@ -350,6 +377,7 @@ void AtomT::clear() { time_q.clear(); time_quat.clear(); time_vel.clear(); + time_extra.clear(); clear_resize(); #ifdef GPU_CAST @@ -370,6 +398,8 @@ double AtomT::host_memory_usage() const { atom_bytes+=4; if (_vel) atom_bytes+=4; + if (_extra_fields>0) + atom_bytes+=_extra_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 3cf97d94a0..ff335fffa9 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -76,7 +76,7 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ @@ -96,7 +96,7 @@ class Atom { * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, - const bool bonds, const bool vel=false); + const bool bonds, const bool vel=false, const int extra_fields=0); /// Returns true if GPU is using charges bool charge() { return _charge; } @@ -107,6 +107,9 @@ class Atom { /// Returns true if GPU is using velocities bool velocity() { return _vel; } + /// Returns true if GPU is using extra fields + bool using_extra() { return _extra_fields; } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -450,6 +453,38 @@ class Atom { add_v_data(host_ptr,host_tag); } + // Cast extras to write buffer + template + inline void cast_extra_data(cpytyp *host_ptr) { + if (_extra_avail==false) { + double t=MPI_Wtime(); + if (_host_view) { + extra.host.view((numtyp*)host_ptr,_nall*_extra_fields,*dev); + extra.device.view(extra.host); + } else if (sizeof(numtyp)==sizeof(double)) + memcpy(extra.host.begin(),host_ptr,_nall*_extra_fields*sizeof(numtyp)); + else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i=0; i<_nall*_extra_fields; i++) extra[i]=host_ptr[i]; + _time_cast+=MPI_Wtime()-t; + } + } + + // Copy extras to device + /** Copies nall()*_extra elements **/ + inline void add_extra_data() { + time_extra.start(); + if (_extra_avail==false) { + extra.update_device(_nall*_extra_fields,true); + _extra_avail=true; + } + time_extra.stop(); + } + /// Add in casting time from additional data (seconds) inline void add_cast_time(double t) { _time_cast+=t; } @@ -473,6 +508,8 @@ class Atom { UCL_Vector quat; /// Velocities UCL_Vector v; + /// Extras + UCL_Vector extra; #ifdef GPU_CAST UCL_Vector x_cast; @@ -493,7 +530,7 @@ class Atom { UCL_H_Vec host_particle_id; /// Device timers - UCL_Timer time_pos, time_q, time_quat, time_vel; + UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra; /// Geryon device UCL_Device *dev; @@ -508,11 +545,12 @@ class Atom { bool _compiled; // True if data has been copied to device already - bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; + bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized; bool alloc(const int nall); bool _allocated, _rot, _charge, _bonds, _vel, _other; + int _extra_fields; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index e2b5b9cdb5..8908f3aff7 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -424,7 +424,7 @@ template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel) { + const bool vel, const int extra_fields) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) @@ -453,7 +453,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields)) return -3; _data_in_estimate++; @@ -463,6 +463,9 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (vel) _data_in_estimate++; + if (extra_fields>0) + _data_in_estimate++; + } else { if (atom.charge()==false && charge) _data_in_estimate++; @@ -470,7 +473,9 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (atom.velocity()==false && vel) _data_in_estimate++; - if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) + if (atom.using_extra()==false && extra_fields>0) + _data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields)) return -3; } diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 1db6ae3127..01d3d64627 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -61,6 +61,7 @@ class Device { * \param nall Total number of local+ghost particles * \param maxspecial Maximum mumber of special bonded atoms per atom * \param vel True if velocities need to be stored + * \param extra_fields Nonzero if extra fields need to be stored * * Returns: * - 0 if successful @@ -70,7 +71,7 @@ class Device { * - -5 Double precision is not supported on card **/ int init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Initialize the device for Atom storage only /** \param nlocal Total number of local particles to allocate memory for diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 12cf6345c2..2ef8af0911 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -330,6 +330,10 @@ #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; +#define SBBITS15 29 +#define NEIGHMASK15 0x1FFFFFFF +ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; }; + // default to 32-bit smallint and other ints, 64-bit bigint: // same as defined in src/lmptype.h #if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \