git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -23,7 +23,6 @@
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
|
||||
#include "geryon/ocl_device.h"
|
||||
#include "geryon/ocl_timer.h"
|
||||
#include "geryon/ocl_mat.h"
|
||||
#include "geryon/ocl_kernel.h"
|
||||
@ -32,7 +31,6 @@ using namespace ucl_opencl;
|
||||
#else
|
||||
|
||||
#include "cudpp.h"
|
||||
#include "geryon/nvd_device.h"
|
||||
#include "geryon/nvd_timer.h"
|
||||
#include "geryon/nvd_mat.h"
|
||||
#include "geryon/nvd_kernel.h"
|
||||
@ -40,10 +38,6 @@ using namespace ucl_cudadr;
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef int2
|
||||
struct int2 { int x; int y; };
|
||||
#endif
|
||||
|
||||
#include "pair_gpu_precision.h"
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -56,13 +50,9 @@ class PairGPUAtom {
|
||||
inline int max_atoms() const { return _max_atoms; }
|
||||
/// Current number of local+ghost atoms stored
|
||||
inline int nall() const { return _nall; }
|
||||
/// Current number of local atoms stored
|
||||
inline int inum() const { return _inum; }
|
||||
|
||||
/// Set number of local+ghost atoms for future copy operations
|
||||
inline void nall(const int n) { _nall=n; }
|
||||
/// Set number of local atoms for future copy operations
|
||||
inline void inum(const int n) { _inum=n; }
|
||||
|
||||
/// Memory usage per atom in this class
|
||||
int bytes_per_atom() const;
|
||||
@ -70,21 +60,33 @@ class PairGPUAtom {
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param rot True if atom storage needs quaternions
|
||||
* \param gpu_nbor True if neighboring will be performed on device **/
|
||||
bool init(const int inum, const int nall, const bool charge, const bool rot,
|
||||
bool init(const int nall, const bool charge, const bool rot,
|
||||
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
|
||||
|
||||
/// Check if we have enough device storage and realloc if not
|
||||
inline bool resize(const int inum, const int nall, bool &success) {
|
||||
_inum=inum;
|
||||
/** Returns true if resized with any call during this timestep **/
|
||||
inline bool resize(const int nall, bool &success) {
|
||||
_nall=nall;
|
||||
if (inum>_max_local || nall>_max_atoms) {
|
||||
if (nall>_max_atoms) {
|
||||
clear_resize();
|
||||
success = success && alloc(inum,nall);
|
||||
return true;
|
||||
success = success && alloc(nall);
|
||||
_resized=true;
|
||||
}
|
||||
return false;
|
||||
return _resized;
|
||||
}
|
||||
|
||||
|
||||
/// If already initialized by another LAMMPS style, add fields as necessary
|
||||
/** \param rot True if atom storage needs quaternions
|
||||
* \param gpu_nbor True if neighboring will be performed on device **/
|
||||
bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
|
||||
const bool bonds);
|
||||
|
||||
/// Returns true if GPU is using charges
|
||||
bool charge() { return _charge; }
|
||||
|
||||
/// Returns true if GPU is using quaternions
|
||||
bool quat() { return _rot; }
|
||||
|
||||
/// Only free matrices of length inum or nall for resizing
|
||||
void clear_resize();
|
||||
|
||||
@ -100,28 +102,42 @@ class PairGPUAtom {
|
||||
/// Add copy times to timers
|
||||
inline void acc_timers() {
|
||||
time_pos.add_to_total();
|
||||
time_answer.add_to_total();
|
||||
if (_other)
|
||||
time_other.add_to_total();
|
||||
if (_charge)
|
||||
time_q.add_to_total();
|
||||
if (_rot)
|
||||
time_quat.add_to_total();
|
||||
}
|
||||
|
||||
/// Add copy times to timers
|
||||
inline void zero_timers() {
|
||||
time_pos.zero();
|
||||
time_answer.zero();
|
||||
if (_other)
|
||||
time_other.zero();
|
||||
if (_charge)
|
||||
time_q.zero();
|
||||
if (_rot)
|
||||
time_quat.zero();
|
||||
}
|
||||
|
||||
/// Return the total time for host/device data transfer
|
||||
/** Zeros the total so that the atom times are only included once **/
|
||||
inline double transfer_time() {
|
||||
double total=time_pos.total_seconds()+time_answer.total_seconds();
|
||||
if (_other) total+=time_other.total_seconds();
|
||||
double total=time_pos.total_seconds();
|
||||
time_pos.zero_total();
|
||||
if (_charge) {
|
||||
total+=time_q.total_seconds();
|
||||
time_q.zero_total();
|
||||
}
|
||||
if (_rot) {
|
||||
total+=time_q.total_seconds();
|
||||
time_quat.zero_total();
|
||||
}
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
/// Return the total time for data cast/pack
|
||||
inline double cast_time() { return _time_cast; }
|
||||
/** Zeros the time so that atom times are only included once **/
|
||||
inline double cast_time()
|
||||
{ double t=_time_cast; _time_cast=0.0; return t; }
|
||||
|
||||
/// Pack LAMMPS atom type constants into matrix and copy to device
|
||||
template <class dev_typ, class t1>
|
||||
@ -216,43 +232,52 @@ class PairGPUAtom {
|
||||
|
||||
// -------------------------COPY TO GPU ----------------------------------
|
||||
|
||||
/// Signal that we need to transfer atom data for next timestep
|
||||
inline void data_unavail()
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
|
||||
|
||||
/// Cast positions and types to write buffer
|
||||
inline void cast_x_data(double **host_ptr, const int *host_type) {
|
||||
double t=MPI_Wtime();
|
||||
#ifdef GPU_CAST
|
||||
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
|
||||
#else
|
||||
numtyp *_write_loc=host_x.begin();
|
||||
for (int i=0; i<_nall; i++) {
|
||||
*_write_loc=host_ptr[i][0];
|
||||
_write_loc++;
|
||||
*_write_loc=host_ptr[i][1];
|
||||
_write_loc++;
|
||||
*_write_loc=host_ptr[i][2];
|
||||
_write_loc++;
|
||||
*_write_loc=host_type[i];
|
||||
_write_loc++;
|
||||
if (_x_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
#ifdef GPU_CAST
|
||||
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
|
||||
#else
|
||||
numtyp *_write_loc=host_x.begin();
|
||||
for (int i=0; i<_nall; i++) {
|
||||
*_write_loc=host_ptr[i][0];
|
||||
_write_loc++;
|
||||
*_write_loc=host_ptr[i][1];
|
||||
_write_loc++;
|
||||
*_write_loc=host_ptr[i][2];
|
||||
_write_loc++;
|
||||
*_write_loc=host_type[i];
|
||||
_write_loc++;
|
||||
}
|
||||
#endif
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
#endif
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
}
|
||||
|
||||
/// Copy positions and types to device asynchronously
|
||||
/** Copies nall() elements **/
|
||||
inline void add_x_data(double **host_ptr, int *host_type) {
|
||||
time_pos.start();
|
||||
#ifdef GPU_CAST
|
||||
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
|
||||
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
|
||||
int block_size=64;
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
|
||||
k_cast_x.set_size(GX,block_size);
|
||||
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
|
||||
&_nall);
|
||||
#else
|
||||
ucl_copy(dev_x,host_x,_nall*4,true);
|
||||
#endif
|
||||
if (_x_avail==false) {
|
||||
#ifdef GPU_CAST
|
||||
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
|
||||
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
|
||||
int block_size=64;
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
|
||||
k_cast_x.set_size(GX,block_size);
|
||||
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
|
||||
&_nall);
|
||||
#else
|
||||
ucl_copy(dev_x,host_x,_nall*4,true);
|
||||
#endif
|
||||
_x_avail=true;
|
||||
}
|
||||
time_pos.stop();
|
||||
}
|
||||
|
||||
@ -262,87 +287,68 @@ class PairGPUAtom {
|
||||
add_x_data(host_ptr,host_type);
|
||||
}
|
||||
|
||||
/// Cast charges to write buffer
|
||||
// Cast charges to write buffer
|
||||
template<class cpytyp>
|
||||
inline void cast_q_data(cpytyp *host_ptr) {
|
||||
double t=MPI_Wtime();
|
||||
if (dev->device_type()==UCL_CPU) {
|
||||
if (sizeof(numtyp)==sizeof(double)) {
|
||||
host_q.view((numtyp*)host_ptr,_nall,*dev);
|
||||
dev_q.view(host_q);
|
||||
} else
|
||||
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
|
||||
} else {
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
|
||||
else
|
||||
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
|
||||
if (_q_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
if (dev->device_type()==UCL_CPU) {
|
||||
if (sizeof(numtyp)==sizeof(double)) {
|
||||
host_q.view((numtyp*)host_ptr,_nall,*dev);
|
||||
dev_q.view(host_q);
|
||||
} else
|
||||
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
|
||||
} else {
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
|
||||
else
|
||||
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
|
||||
}
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
|
||||
/// Copy charges to device asynchronously
|
||||
// Copy charges to device asynchronously
|
||||
inline void add_q_data() {
|
||||
ucl_copy(dev_q,host_q,_nall,true);
|
||||
if (_q_avail==false) {
|
||||
ucl_copy(dev_q,host_q,_nall,true);
|
||||
_q_avail=true;
|
||||
}
|
||||
}
|
||||
|
||||
/// Cast quaternions to write buffer
|
||||
// Cast quaternions to write buffer
|
||||
template<class cpytyp>
|
||||
inline void cast_quat_data(cpytyp *host_ptr) {
|
||||
double t=MPI_Wtime();
|
||||
if (dev->device_type()==UCL_CPU) {
|
||||
if (sizeof(numtyp)==sizeof(double)) {
|
||||
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
|
||||
dev_quat.view(host_quat);
|
||||
} else
|
||||
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
|
||||
} else {
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
|
||||
else
|
||||
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
|
||||
if (_quat_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
if (dev->device_type()==UCL_CPU) {
|
||||
if (sizeof(numtyp)==sizeof(double)) {
|
||||
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
|
||||
dev_quat.view(host_quat);
|
||||
} else
|
||||
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
|
||||
} else {
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
|
||||
else
|
||||
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
|
||||
}
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
|
||||
/// Copy quaternions to device
|
||||
// Copy quaternions to device
|
||||
/** Copies nall()*4 elements **/
|
||||
inline void add_quat_data() {
|
||||
ucl_copy(dev_quat,host_quat,_nall*4,true);
|
||||
if (_quat_avail==false) {
|
||||
ucl_copy(dev_quat,host_quat,_nall*4,true);
|
||||
_quat_avail=true;
|
||||
}
|
||||
}
|
||||
|
||||
/// Copy data other than pos and data to device
|
||||
inline void add_other_data() {
|
||||
time_other.start();
|
||||
if (_charge)
|
||||
add_q_data();
|
||||
if (_rot)
|
||||
add_quat_data();
|
||||
time_other.stop();
|
||||
}
|
||||
|
||||
/// Return number of bytes used on device
|
||||
inline double gpu_bytes() { return _gpu_bytes; }
|
||||
|
||||
// -------------------------COPY FROM GPU -------------------------------
|
||||
|
||||
/// Copy answers from device into read buffer asynchronously
|
||||
void copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom);
|
||||
|
||||
/// Copy answers from device into read buffer asynchronously
|
||||
void copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom, int *ilist);
|
||||
|
||||
/// Copy energy and virial data into LAMMPS memory
|
||||
double energy_virial(double *eatom, double **vatom, double *virial);
|
||||
|
||||
/// Copy energy and virial data into LAMMPS memory
|
||||
double energy_virial(double *eatom, double **vatom, double *virial,
|
||||
double &ecoul);
|
||||
|
||||
/// Add forces and torques from the GPU into a LAMMPS pointer
|
||||
void get_answers(double **f, double **tor);
|
||||
inline double max_gpu_bytes()
|
||||
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
|
||||
|
||||
// ------------------------------ DATA ----------------------------------
|
||||
|
||||
@ -352,10 +358,6 @@ class PairGPUAtom {
|
||||
UCL_D_Vec<numtyp> dev_q;
|
||||
/// Quaterions
|
||||
UCL_D_Vec<numtyp> dev_quat;
|
||||
/// Force and possibly torque
|
||||
UCL_D_Vec<acctyp> dev_ans;
|
||||
/// Energy and virial per-atom storage
|
||||
UCL_D_Vec<acctyp> dev_engv;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
UCL_D_Vec<double> dev_x_cast;
|
||||
@ -370,10 +372,6 @@ class PairGPUAtom {
|
||||
UCL_H_Vec<numtyp> host_q;
|
||||
/// Buffer for moving quat data to GPU
|
||||
UCL_H_Vec<numtyp> host_quat;
|
||||
/// Force and possibly torque data on host
|
||||
UCL_H_Vec<acctyp> host_ans;
|
||||
/// Energy/virial data on host
|
||||
UCL_H_Vec<acctyp> host_engv;
|
||||
|
||||
/// Cell list identifiers for device nbor builds
|
||||
UCL_D_Vec<unsigned> dev_cell_id;
|
||||
@ -383,7 +381,7 @@ class PairGPUAtom {
|
||||
UCL_D_Vec<int> dev_tag;
|
||||
|
||||
/// Device timers
|
||||
UCL_Timer time_pos, time_other, time_answer;
|
||||
UCL_Timer time_pos, time_q, time_quat;
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *dev;
|
||||
@ -396,19 +394,19 @@ class PairGPUAtom {
|
||||
#endif
|
||||
|
||||
bool _compiled;
|
||||
|
||||
bool alloc(const int inum, const int nall);
|
||||
|
||||
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
|
||||
int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
|
||||
// True if data has been copied to device already
|
||||
bool _x_avail, _q_avail, _quat_avail, _resized;
|
||||
|
||||
bool alloc(const int nall);
|
||||
|
||||
bool _allocated, _rot, _charge, _other;
|
||||
int _max_atoms, _nall;
|
||||
bool _gpu_nbor, _bonds;
|
||||
int *_ilist;
|
||||
double _time_cast;
|
||||
|
||||
double _gpu_bytes;
|
||||
double _max_gpu_bytes;
|
||||
|
||||
bool _newton;
|
||||
|
||||
#ifndef USE_OPENCL
|
||||
CUDPPConfiguration sort_config;
|
||||
CUDPPHandle sort_plan;
|
||||
|
||||
Reference in New Issue
Block a user