Added changes to Atom and Device classes for allocation of extra fields and SBBITS15 and NEIGHMASK15
This commit is contained in:
@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const {
|
|||||||
bytes+=sizeof(numtyp);
|
bytes+=sizeof(numtyp);
|
||||||
if (_vel)
|
if (_vel)
|
||||||
bytes+=4*sizeof(numtyp);
|
bytes+=4*sizeof(numtyp);
|
||||||
|
if (_extra_fields>0)
|
||||||
|
bytes+=_extra_fields*sizeof(numtyp);
|
||||||
return bytes;
|
return bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) {
|
|||||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||||
gpu_bytes+=v.device.row_bytes();
|
gpu_bytes+=v.device.row_bytes();
|
||||||
}
|
}
|
||||||
|
if (_extra_fields>0 && _host_view==false) {
|
||||||
|
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
|
||||||
|
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||||
|
gpu_bytes+=extra.device.row_bytes();
|
||||||
|
}
|
||||||
|
|
||||||
if (_gpu_nbor>0) {
|
if (_gpu_nbor>0) {
|
||||||
if (_bonds) {
|
if (_bonds) {
|
||||||
@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) {
|
|||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
bool AtomT::add_fields(const bool charge, const bool rot,
|
bool AtomT::add_fields(const bool charge, const bool rot,
|
||||||
const int gpu_nbor, const bool bonds, const bool vel) {
|
const int gpu_nbor, const bool bonds, const bool vel,
|
||||||
|
const int extra_fields) {
|
||||||
bool success=true;
|
bool success=true;
|
||||||
// Ignore host/device transfers?
|
// Ignore host/device transfers?
|
||||||
int gpu_bytes=0;
|
int gpu_bytes=0;
|
||||||
@ -191,6 +199,16 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (extra_fields > 0 && _extra_fields==0) {
|
||||||
|
_extra_fields=extra_fields;
|
||||||
|
_other=true;
|
||||||
|
if (_host_view==false) {
|
||||||
|
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
|
||||||
|
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||||
|
gpu_bytes+=extra.device.row_bytes();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (bonds && _bonds==false) {
|
if (bonds && _bonds==false) {
|
||||||
_bonds=true;
|
_bonds=true;
|
||||||
if (_bonds && _gpu_nbor>0) {
|
if (_bonds && _gpu_nbor>0) {
|
||||||
@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
|||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
bool AtomT::init(const int nall, const bool charge, const bool rot,
|
bool AtomT::init(const int nall, const bool charge, const bool rot,
|
||||||
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
|
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel,
|
||||||
|
const int extra_fields) {
|
||||||
clear();
|
clear();
|
||||||
|
|
||||||
bool success=true;
|
bool success=true;
|
||||||
@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
|
|||||||
_q_avail=false;
|
_q_avail=false;
|
||||||
_quat_avail=false;
|
_quat_avail=false;
|
||||||
_v_avail=false;
|
_v_avail=false;
|
||||||
|
_extra_avail=false;
|
||||||
_resized=false;
|
_resized=false;
|
||||||
_gpu_nbor=gpu_nbor;
|
_gpu_nbor=gpu_nbor;
|
||||||
_bonds=bonds;
|
_bonds=bonds;
|
||||||
_charge=charge;
|
_charge=charge;
|
||||||
_rot=rot;
|
_rot=rot;
|
||||||
_vel=vel;
|
_vel=vel;
|
||||||
_other=_charge || _rot || _vel;
|
_extra_fields=extra_fields;
|
||||||
|
_other=_charge || _rot || _vel || (extra_fields>0);
|
||||||
dev=&devi;
|
dev=&devi;
|
||||||
_time_transfer=0;
|
_time_transfer=0;
|
||||||
|
|
||||||
@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
|
|||||||
time_q.init(*dev);
|
time_q.init(*dev);
|
||||||
time_quat.init(*dev);
|
time_quat.init(*dev);
|
||||||
time_vel.init(*dev);
|
time_vel.init(*dev);
|
||||||
|
time_extra.init(*dev);
|
||||||
|
|
||||||
time_pos.zero();
|
time_pos.zero();
|
||||||
time_q.zero();
|
time_q.zero();
|
||||||
time_quat.zero();
|
time_quat.zero();
|
||||||
time_vel.zero();
|
time_vel.zero();
|
||||||
|
time_extra.zero();
|
||||||
|
|
||||||
_time_cast=0.0;
|
_time_cast=0.0;
|
||||||
|
|
||||||
#ifdef GPU_CAST
|
#ifdef GPU_CAST
|
||||||
@ -308,6 +333,8 @@ void AtomT::clear_resize() {
|
|||||||
quat.clear();
|
quat.clear();
|
||||||
if (_vel)
|
if (_vel)
|
||||||
v.clear();
|
v.clear();
|
||||||
|
if (_extra_fields>0)
|
||||||
|
extra.clear();
|
||||||
|
|
||||||
dev_cell_id.clear();
|
dev_cell_id.clear();
|
||||||
dev_particle_id.clear();
|
dev_particle_id.clear();
|
||||||
@ -350,6 +377,7 @@ void AtomT::clear() {
|
|||||||
time_q.clear();
|
time_q.clear();
|
||||||
time_quat.clear();
|
time_quat.clear();
|
||||||
time_vel.clear();
|
time_vel.clear();
|
||||||
|
time_extra.clear();
|
||||||
clear_resize();
|
clear_resize();
|
||||||
|
|
||||||
#ifdef GPU_CAST
|
#ifdef GPU_CAST
|
||||||
@ -370,6 +398,8 @@ double AtomT::host_memory_usage() const {
|
|||||||
atom_bytes+=4;
|
atom_bytes+=4;
|
||||||
if (_vel)
|
if (_vel)
|
||||||
atom_bytes+=4;
|
atom_bytes+=4;
|
||||||
|
if (_extra_fields>0)
|
||||||
|
atom_bytes+=_extra_fields;
|
||||||
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
|
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -76,7 +76,7 @@ class Atom {
|
|||||||
* gpu_nbor 2 if binning on host and neighboring on device **/
|
* gpu_nbor 2 if binning on host and neighboring on device **/
|
||||||
bool init(const int nall, const bool charge, const bool rot,
|
bool init(const int nall, const bool charge, const bool rot,
|
||||||
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
|
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
|
||||||
const bool vel=false);
|
const bool vel=false, const int extra_fields=0);
|
||||||
|
|
||||||
/// Check if we have enough device storage and realloc if not
|
/// Check if we have enough device storage and realloc if not
|
||||||
/** Returns true if resized with any call during this timestep **/
|
/** Returns true if resized with any call during this timestep **/
|
||||||
@ -96,7 +96,7 @@ class Atom {
|
|||||||
* gpu_nbor 1 if neighboring will be performed on device
|
* gpu_nbor 1 if neighboring will be performed on device
|
||||||
* gpu_nbor 2 if binning on host and neighboring on device **/
|
* gpu_nbor 2 if binning on host and neighboring on device **/
|
||||||
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
|
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
|
||||||
const bool bonds, const bool vel=false);
|
const bool bonds, const bool vel=false, const int extra_fields=0);
|
||||||
|
|
||||||
/// Returns true if GPU is using charges
|
/// Returns true if GPU is using charges
|
||||||
bool charge() { return _charge; }
|
bool charge() { return _charge; }
|
||||||
@ -107,6 +107,9 @@ class Atom {
|
|||||||
/// Returns true if GPU is using velocities
|
/// Returns true if GPU is using velocities
|
||||||
bool velocity() { return _vel; }
|
bool velocity() { return _vel; }
|
||||||
|
|
||||||
|
/// Returns true if GPU is using extra fields
|
||||||
|
bool using_extra() { return _extra_fields; }
|
||||||
|
|
||||||
/// Only free matrices of length inum or nall for resizing
|
/// Only free matrices of length inum or nall for resizing
|
||||||
void clear_resize();
|
void clear_resize();
|
||||||
|
|
||||||
@ -450,6 +453,38 @@ class Atom {
|
|||||||
add_v_data(host_ptr,host_tag);
|
add_v_data(host_ptr,host_tag);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cast extras to write buffer
|
||||||
|
template<class cpytyp>
|
||||||
|
inline void cast_extra_data(cpytyp *host_ptr) {
|
||||||
|
if (_extra_avail==false) {
|
||||||
|
double t=MPI_Wtime();
|
||||||
|
if (_host_view) {
|
||||||
|
extra.host.view((numtyp*)host_ptr,_nall*_extra_fields,*dev);
|
||||||
|
extra.device.view(extra.host);
|
||||||
|
} else if (sizeof(numtyp)==sizeof(double))
|
||||||
|
memcpy(extra.host.begin(),host_ptr,_nall*_extra_fields*sizeof(numtyp));
|
||||||
|
else
|
||||||
|
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
|
||||||
|
#pragma omp parallel for simd schedule(static)
|
||||||
|
#elif (LAL_USE_OMP_SIMD == 1)
|
||||||
|
#pragma omp simd
|
||||||
|
#endif
|
||||||
|
for (int i=0; i<_nall*_extra_fields; i++) extra[i]=host_ptr[i];
|
||||||
|
_time_cast+=MPI_Wtime()-t;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy extras to device
|
||||||
|
/** Copies nall()*_extra elements **/
|
||||||
|
inline void add_extra_data() {
|
||||||
|
time_extra.start();
|
||||||
|
if (_extra_avail==false) {
|
||||||
|
extra.update_device(_nall*_extra_fields,true);
|
||||||
|
_extra_avail=true;
|
||||||
|
}
|
||||||
|
time_extra.stop();
|
||||||
|
}
|
||||||
|
|
||||||
/// Add in casting time from additional data (seconds)
|
/// Add in casting time from additional data (seconds)
|
||||||
inline void add_cast_time(double t) { _time_cast+=t; }
|
inline void add_cast_time(double t) { _time_cast+=t; }
|
||||||
|
|
||||||
@ -473,6 +508,8 @@ class Atom {
|
|||||||
UCL_Vector<numtyp,numtyp> quat;
|
UCL_Vector<numtyp,numtyp> quat;
|
||||||
/// Velocities
|
/// Velocities
|
||||||
UCL_Vector<numtyp,numtyp> v;
|
UCL_Vector<numtyp,numtyp> v;
|
||||||
|
/// Extras
|
||||||
|
UCL_Vector<numtyp,numtyp> extra;
|
||||||
|
|
||||||
#ifdef GPU_CAST
|
#ifdef GPU_CAST
|
||||||
UCL_Vector<double,double> x_cast;
|
UCL_Vector<double,double> x_cast;
|
||||||
@ -493,7 +530,7 @@ class Atom {
|
|||||||
UCL_H_Vec<int> host_particle_id;
|
UCL_H_Vec<int> host_particle_id;
|
||||||
|
|
||||||
/// Device timers
|
/// Device timers
|
||||||
UCL_Timer time_pos, time_q, time_quat, time_vel;
|
UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra;
|
||||||
|
|
||||||
/// Geryon device
|
/// Geryon device
|
||||||
UCL_Device *dev;
|
UCL_Device *dev;
|
||||||
@ -508,11 +545,12 @@ class Atom {
|
|||||||
bool _compiled;
|
bool _compiled;
|
||||||
|
|
||||||
// True if data has been copied to device already
|
// True if data has been copied to device already
|
||||||
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
|
bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized;
|
||||||
|
|
||||||
bool alloc(const int nall);
|
bool alloc(const int nall);
|
||||||
|
|
||||||
bool _allocated, _rot, _charge, _bonds, _vel, _other;
|
bool _allocated, _rot, _charge, _bonds, _vel, _other;
|
||||||
|
int _extra_fields;
|
||||||
int _max_atoms, _nall, _gpu_nbor;
|
int _max_atoms, _nall, _gpu_nbor;
|
||||||
bool _host_view;
|
bool _host_view;
|
||||||
double _time_cast, _time_transfer;
|
double _time_cast, _time_transfer;
|
||||||
|
|||||||
@ -424,7 +424,7 @@ template <class numtyp, class acctyp>
|
|||||||
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||||
const bool rot, const int nlocal,
|
const bool rot, const int nlocal,
|
||||||
const int nall, const int maxspecial,
|
const int nall, const int maxspecial,
|
||||||
const bool vel) {
|
const bool vel, const int extra_fields) {
|
||||||
if (!_device_init)
|
if (!_device_init)
|
||||||
return -1;
|
return -1;
|
||||||
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
|
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
|
||||||
@ -453,7 +453,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
|||||||
|
|
||||||
if (_init_count==0) {
|
if (_init_count==0) {
|
||||||
// Initialize atom and nbor data
|
// Initialize atom and nbor data
|
||||||
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
|
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields))
|
||||||
return -3;
|
return -3;
|
||||||
|
|
||||||
_data_in_estimate++;
|
_data_in_estimate++;
|
||||||
@ -463,6 +463,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
|||||||
_data_in_estimate++;
|
_data_in_estimate++;
|
||||||
if (vel)
|
if (vel)
|
||||||
_data_in_estimate++;
|
_data_in_estimate++;
|
||||||
|
if (extra_fields>0)
|
||||||
|
_data_in_estimate++;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
if (atom.charge()==false && charge)
|
if (atom.charge()==false && charge)
|
||||||
_data_in_estimate++;
|
_data_in_estimate++;
|
||||||
@ -470,7 +473,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
|||||||
_data_in_estimate++;
|
_data_in_estimate++;
|
||||||
if (atom.velocity()==false && vel)
|
if (atom.velocity()==false && vel)
|
||||||
_data_in_estimate++;
|
_data_in_estimate++;
|
||||||
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
|
if (atom.using_extra()==false && extra_fields>0)
|
||||||
|
_data_in_estimate++;
|
||||||
|
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields))
|
||||||
return -3;
|
return -3;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -61,6 +61,7 @@ class Device {
|
|||||||
* \param nall Total number of local+ghost particles
|
* \param nall Total number of local+ghost particles
|
||||||
* \param maxspecial Maximum mumber of special bonded atoms per atom
|
* \param maxspecial Maximum mumber of special bonded atoms per atom
|
||||||
* \param vel True if velocities need to be stored
|
* \param vel True if velocities need to be stored
|
||||||
|
* \param extra_fields Nonzero if extra fields need to be stored
|
||||||
*
|
*
|
||||||
* Returns:
|
* Returns:
|
||||||
* - 0 if successful
|
* - 0 if successful
|
||||||
@ -70,7 +71,7 @@ class Device {
|
|||||||
* - -5 Double precision is not supported on card **/
|
* - -5 Double precision is not supported on card **/
|
||||||
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
|
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
|
||||||
const int nlocal, const int nall, const int maxspecial,
|
const int nlocal, const int nall, const int maxspecial,
|
||||||
const bool vel=false);
|
const bool vel=false, const int extra_fields=0);
|
||||||
|
|
||||||
/// Initialize the device for Atom storage only
|
/// Initialize the device for Atom storage only
|
||||||
/** \param nlocal Total number of local particles to allocate memory for
|
/** \param nlocal Total number of local particles to allocate memory for
|
||||||
|
|||||||
@ -330,6 +330,10 @@
|
|||||||
#define NEIGHMASK 0x3FFFFFFF
|
#define NEIGHMASK 0x3FFFFFFF
|
||||||
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
|
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
|
||||||
|
|
||||||
|
#define SBBITS15 29
|
||||||
|
#define NEIGHMASK15 0x1FFFFFFF
|
||||||
|
ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; };
|
||||||
|
|
||||||
// default to 32-bit smallint and other ints, 64-bit bigint:
|
// default to 32-bit smallint and other ints, 64-bit bigint:
|
||||||
// same as defined in src/lmptype.h
|
// same as defined in src/lmptype.h
|
||||||
#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \
|
#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \
|
||||||
|
|||||||
Reference in New Issue
Block a user