lammps/lib/gpu/pair_gpu_atom.h
Axel Kohlmeyer 92cd2a69bf do GPU device memory accounting with doubles
using a (signed) int will overflow at 2GB,
switching to unsigned risks hiding
overflows, and long long is not as
portable as double precision floating point.
2010-11-11 12:31:36 -05:00


/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_device.h"
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#else
#include "cudpp.h"
#include "geryon/nvd_device.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif
#ifndef int2
struct int2 { int x; int y; };
#endif
#include "pair_gpu_precision.h"
template <class numtyp, class acctyp>
class PairGPUAtom {
public:
PairGPUAtom();
~PairGPUAtom() { clear(); }
/// Maximum number of atoms that can be stored with current allocation
inline int max_atoms() const { return _max_atoms; }
/// Current number of local+ghost atoms stored
inline int nall() const { return _nall; }
/// Current number of local atoms stored
inline int inum() const { return _inum; }
/// Set number of local+ghost atoms for future copy operations
inline void nall(const int n) { _nall=n; }
/// Set number of local atoms for future copy operations
inline void inum(const int n) { _inum=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool init(const int inum, const int nall, const bool charge, const bool rot,
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
/// Check if we have enough device storage and realloc if not
inline bool resize(const int inum, const int nall, bool &success) {
_inum=inum;
_nall=nall;
if (nall>_max_atoms) {
clear_resize();
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
_allocated=true;
success = success && alloc(_max_atoms);
return true;
}
return false;
}
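// Note: when the allocation grows, it is sized to 110% of the requested
// nall (e.g. a request for 10000 atoms reserves room for 11000), so modest
// fluctuations in the ghost atom count do not force a reallocation on every
// neighbor rebuild.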
/// Only free matrices of length inum or nall for resizing
void clear_resize();
/// Free all memory on host and device
void clear();
/// Return the total amount of host memory used by class in bytes
double host_memory_usage() const;
/// Sort arrays for neighbor list calculation on device
void sort_neighbor(const int num_atoms);
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
time_answer.add_to_total();
if (_other)
time_other.add_to_total();
}
/// Zero the copy timers
inline void zero_timers() {
time_pos.zero();
time_answer.zero();
if (_other)
time_other.zero();
}
/// Return the total time for host/device data transfer
inline double transfer_time() {
double total=time_pos.total_seconds()+time_answer.total_seconds();
if (_other) total+=time_other.total_seconds();
return total;
}
/// Return the total time for data cast/pack
inline double cast_time() { return _time_cast; }
/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>
inline void type_pack1(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii]=static_cast<numtyp>(one[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
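// The type_pack2/type_pack4 variants below use the same padded layout as
// type_pack1 above: the n x n per-type table(s) fill the first n entries of
// each row of an m_size x m_size staging buffer (row stride m_size, padding
// entries left unset), and the buffer is then reinterpreted as dev_typ
// values and copied to the device in a single transfer.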
/// Pack LAMMPS atom type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2>
inline void type_pack2(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*2]=static_cast<numtyp>(one[i][j]);
buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3, class t4>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three, t4 **four) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
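// A minimal usage sketch for the packing helpers above; atom, dev_lj,
// host_buf, lj1, lj2, ntypes and lj_types are hypothetical caller-side names
// (dev_lj must already hold lj_types*lj_types packed entries and host_buf
// 2*lj_types*lj_types numtyp entries):
//
//   atom.type_pack2(ntypes,lj_types,dev_lj,host_buf,lj1,lj2);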
// -------------------------COPY TO GPU ----------------------------------
/// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
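// In the default (non-GPU_CAST) path the write buffer is interleaved as four
// numtyp values per atom: x, y, z, and the atom type stored as a numtyp, so
// a single 4-wide fetch on the device returns position and type together.
// Because the cast runs on the host it is timed with MPI_Wtime() and
// accumulated in _time_cast rather than in the device timers.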
/// Copy positions and types to device asynchronously
/** Copies position and type data for nall() atoms **/
inline void add_x_data(double **host_ptr, int *host_type) {
time_pos.start();
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
time_pos.stop();
}
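// With GPU_CAST defined, the raw double positions and int types are copied
// to the device unconverted and the k_cast_x kernel (64 threads per block)
// performs the double-to-numtyp conversion there; otherwise the pre-cast
// 4*nall() numtyp buffer prepared by cast_x_data() is copied directly.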
/// Calls cast_x_data and add_x_data and times the routines
inline void cast_copy_x(double **host_ptr, int *host_type) {
cast_x_data(host_ptr,host_type);
add_x_data(host_ptr,host_type);
}
/// Cast charges to write buffer
template<class cpytyp>
inline void cast_q_data(cpytyp *host_ptr) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
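// When the accelerator device is the host CPU (UCL_CPU) and numtyp is
// double, no staging is needed: host_q and dev_q are set up as views of the
// LAMMPS charge array, so the cast loop and the extra buffer are skipped.
// The same pattern is used for the quaternions in cast_quat_data() below.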
/// Copy charges to device asynchronously
inline void add_q_data() {
ucl_copy(dev_q,host_q,_nall,true);
}
/// Cast quaternions to write buffer
template<class cpytyp>
inline void cast_quat_data(cpytyp *host_ptr) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
/// Copy quaternions to device
/** Copies nall()*4 elements **/
inline void add_quat_data() {
ucl_copy(dev_quat,host_quat,_nall*4,true);
}
/// Copy charge and quaternion data (if used) to device
inline void add_other_data() {
time_other.start();
if (_charge)
add_q_data();
if (_rot)
add_quat_data();
time_other.stop();
}
/// Return number of bytes used on device
inline double gpu_bytes() { return _gpu_bytes; }
// -------------------------COPY FROM GPU -------------------------------
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom);
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom, int *ilist);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial,
double &ecoul);
/// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor);
// ------------------------------ DATA ----------------------------------
/// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
UCL_D_Vec<numtyp> dev_x;
/// Charges
UCL_D_Vec<numtyp> dev_q;
/// Quaternions
UCL_D_Vec<numtyp> dev_quat;
/// Force and possibly torque
UCL_D_Vec<acctyp> dev_ans;
/// Energy and virial per-atom storage
UCL_D_Vec<acctyp> dev_engv;
#ifdef GPU_CAST
UCL_D_Vec<double> dev_x_cast;
UCL_D_Vec<int> dev_type_cast;
UCL_H_Vec<double> host_x_cast;
UCL_H_Vec<int> host_type_cast;
#endif
/// Buffer for moving positions to device
UCL_H_Vec<numtyp> host_x;
/// Buffer for moving charge data to GPU
UCL_H_Vec<numtyp> host_q;
/// Buffer for moving quat data to GPU
UCL_H_Vec<numtyp> host_quat;
/// Force and possibly torque data on host
UCL_H_Vec<acctyp> host_ans;
/// Energy/virial data on host
UCL_H_Vec<acctyp> host_engv;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<unsigned> dev_cell_id;
/// Particle identifiers for device nbor builds
UCL_D_Vec<int> dev_particle_id;
/// Atom tag information for device nbor builds
UCL_D_Vec<int> dev_tag;
/// Device timers
UCL_Timer time_pos, time_other, time_answer;
/// Geryon device
UCL_Device *dev;
private:
#ifdef GPU_CAST
UCL_Program *atom_program;
UCL_Kernel k_cast_x;
void compile_kernels(UCL_Device &dev);
#endif
bool _compiled;
bool alloc(const int max_atoms);
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_atoms, _nall, _inum, _e_fields, _ev_fields;
bool _gpu_nbor, _bonds;
int *_ilist;
double _time_cast;
double _gpu_bytes;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
};
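// A rough sketch of the per-timestep sequence a GPU pair style could drive
// through this interface; pgpu, device, x, type, q, f, tor and the flag
// variables are placeholder names, not part of this header:
//
//   PairGPUAtom<numtyp,acctyp> pgpu;
//   pgpu.init(inum,nall,true,false,device);    // charges, no quaternions
//   // each timestep:
//   bool success=true;
//   pgpu.resize(inum,nall,success);            // grow device storage if needed
//   pgpu.cast_copy_x(x,type);                  // positions and types to device
//   pgpu.cast_q_data(q);
//   pgpu.add_other_data();                     // charges (and quats) to device
//   // ... launch the pair kernel ...
//   pgpu.copy_answers(eflag,vflag,eflag_atom,vflag_atom);
//   pgpu.get_answers(f,tor);
//   pgpu.acc_timers();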
#endif