git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3406 f3b2605a-c512-4ea7-a41b-209d697bcdaa
@@ -16,7 +16,7 @@
 BIN_DIR = .
 OBJ_DIR = .
 AR = ar
-CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math
+CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
 CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
 CUDA_PREC = -D_SINGLE_SINGLE
 CUDA_LINK = -L/usr/local/cuda/lib64 -lcudart $(CUDA_LIB)
@@ -69,6 +69,9 @@ the CUDA_PREC variable:
 
 NOTE: Double precision is only supported on certain GPUS (with
       compute capability>=1.3).
 
+NOTE: For Tesla and other graphics cards with compute capability>=1.3,
+      make sure that -arch=sm_13 is set on the CUDA_ARCH line.
+
 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
       package has been installed before installing the GPU package in LAMMPS.
 
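As a concrete illustration of the note added above, the CUDA_ARCH line from the Makefile hunk earlier in this commit would be edited to uncomment the architecture flag. This is a minimal sketch assuming the default flags shown in that Makefile; it is not part of the committed change:

    # Enable compute capability 1.3 so double-precision kernels can be generated
    CUDA_ARCH = -maxrregcount 128 -arch=sm_13
    # Precision is still selected separately via CUDA_PREC (single precision shown)
    CUDA_PREC = -D_SINGLE_SINGLE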
@@ -203,17 +203,19 @@ string gb_gpu_name(const int id, const int max_nbors) {
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
-                  const double upsilon, const double mu, double **shape,
-                  double **well, double **cutsq, double **sigma,
-                  double **epsilon, double *host_lshape, int **form,
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int max_nbors, const int thread, const int gpu_id) {
+bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
+                 const double upsilon, const double mu, double **shape,
+                 double **well, double **cutsq, double **sigma,
+                 double **epsilon, double *host_lshape, int **form,
+                 double **host_lj1, double **host_lj2, double **host_lj3,
+                 double **host_lj4, double **offset, double *special_lj,
+                 const int max_nbors, const int thread, const int gpu_id) {
   assert(thread<MAX_GPU_THREADS);
 
+  GBMF[thread].gpu.init();
+
   if (GBMF[thread].gpu.num_devices()==0)
-    return 0;
+    return false;
 
   ij_size=IJ_SIZE;
   return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
@@ -337,17 +339,20 @@ int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
 // forces, and torques for those interactions
 // ---------------------------------------------------------------------------
 template <class gbmtyp>
-void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
+void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
+                   const bool eflag) {
   gbm.nbor.time_nbor.add_to_total();
   // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
 
+  memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
   gbm.nbor.time_nbor.start();
   gbm.nbor.add(num_ij,gbm.pair_stream);
   gbm.nbor.time_nbor.stop();
 }
 
-void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
-  _gb_gpu_nbors(GBMF[thread],num_ij,eflag);
+void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
+                  const int thread) {
+  _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
 }
 
 // ---------------------------------------------------------------------------
@@ -475,7 +480,7 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
     gbm.time_gayberne2.add_to_total();
     gbm.time_pair.add_to_total();
   }
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
+  CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
 
   evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   gbm.atom.add_forces(ilist,f);
@@ -37,7 +37,7 @@ GB_GPU_MemoryT::~GB_GPU_Memory() {
 }
 
 template <class numtyp, class acctyp>
-int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
+bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           const double gamma, const double upsilon,
                           const double mu, double **host_shape,
                           double **host_well, double **host_cutsq,
@@ -50,9 +50,11 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
   if (this->allocated)
     clear();
 
-  LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
-                       host_lj1, host_lj2, host_lj3, host_lj4, host_offset,
-                       host_special_lj, max_nbors, me);
+  bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
+                              host_lj1, host_lj2, host_lj3, host_lj4,
+                              host_offset, host_special_lj, max_nbors, me);
+  if (!p)
+    return false;
 
   host_form=h_form;
 
@@ -100,7 +102,7 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
   // Memory for ilist ordered by particle type
   host_olist.safe_alloc_rw(this->max_atoms);
 
-  return this->nbor.host_ij.begin();
+  return true;
 }
 
 template <class numtyp, class acctyp>
@@ -35,7 +35,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
   GB_GPU_Memory();
   ~GB_GPU_Memory();
 
-  int* init(const int ij_size, const int ntypes, const double gamma,
+  bool init(const int ij_size, const int ntypes, const double gamma,
             const double upsilon, const double mu, double **host_shape,
             double **host_well, double **host_cutsq, double **host_sigma,
             double **host_epsilon, double *host_lshape, int **h_form,
@@ -63,12 +63,13 @@ string lj_gpu_name(const int id, const int max_nbors) {
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
-                  double **epsilon, double **host_lj1, double **host_lj2,
-                  double **host_lj3, double **host_lj4, double **offset,
-                  double *special_lj, const int max_nbors, const int gpu_id) {
+bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
+                 double **epsilon, double **host_lj1, double **host_lj2,
+                 double **host_lj3, double **host_lj4, double **offset,
+                 double *special_lj, const int max_nbors, const int gpu_id) {
+  LJMF.gpu.init();
   if (LJMF.gpu.num_devices()==0)
-    return 0;
+    return false;
 
   ij_size=IJ_SIZE;
   return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
@@ -142,17 +143,19 @@ bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
 // forces, and torques for those interactions
 // ---------------------------------------------------------------------------
 template <class LJMTyp>
-void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
+void _lj_gpu_nbors(LJMTyp &ljm, const int *ij, const int num_ij) {
   ljm.nbor.time_nbor.add_to_total();
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
 
+  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
+
+  memcpy(ljm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
   ljm.nbor.time_nbor.start();
   ljm.nbor.add(num_ij,ljm.pair_stream);
   ljm.nbor.time_nbor.stop();
 }
 
-void lj_gpu_nbors(const int num_ij) {
-  _lj_gpu_nbors(LJMF,num_ij);
+void lj_gpu_nbors(const int *ij, const int num_ij) {
+  _lj_gpu_nbors(LJMF,ij,num_ij);
 }
 
 // ---------------------------------------------------------------------------
@@ -201,7 +204,7 @@ double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
   ljm.atom.time_atom.add_to_total();
   ljm.nbor.time_nbor.add_to_total();
   ljm.time_pair.add_to_total();
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed
+  CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream));
 
   evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   ljm.atom.add_forces(ilist,f);
@@ -39,7 +39,7 @@ int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
 }
 
 template <class numtyp, class acctyp>
-int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
+bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           double **host_cutsq, double **host_sigma,
                           double **host_epsilon, double **host_lj1,
                           double **host_lj2, double **host_lj3,
@@ -50,10 +50,10 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
     clear();
 
   if (me>=gpu.num_devices())
-    return 0;
+    return false;
   gpu.set(me);
   if (gpu.revision()<1.0)
-    return 0;
+    return false;
 
   // Initialize timers for the selected GPU
   time_pair.init();
@@ -114,8 +114,7 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   dev_error.zero();
 
   allocated=true;
-
-  return nbor.host_ij.begin();
+  return true;
 }
 
 template <class numtyp, class acctyp>
@@ -40,7 +40,7 @@ class LJ_GPU_Memory {
   ~LJ_GPU_Memory() { clear(); }
 
   /// Allocate memory on host and device
-  int* init(const int ij_size, const int ntypes, double **host_cutsq,
+  bool init(const int ij_size, const int ntypes, double **host_cutsq,
             double **host_sigma, double **host_epsilon,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
@@ -28,7 +28,9 @@
 #include "nvc_device.h"
 
 // Grabs the properties for all devices
-NVCDevice::NVCDevice() {
+void NVCDevice::init() {
+  _properties.clear();
+
   CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices));
   for (int dev=0; dev<_num_devices; ++dev) {
     cudaDeviceProp deviceProp;
@@ -33,11 +33,16 @@ using namespace std;
 /// Class for looking at device properties
 /** \note Calls to change the device outside of the class results in incorrect
   * behavior
-  * \note There is no error checking for indexing past the number of devices **/
+  * \note There is no error checking for indexing past the number of devices
+  * \note init() at least once before using any of the routines **/
 class NVCDevice {
  public:
-  /// Grabs the properties for all devices
-  NVCDevice();
+  /** \note init() must be called following construction before any routines **/
+  NVCDevice() {}
+
+  /// Collect properties for every GPU on the node and set active GPU to ID 0
+  void init();
+
   /// Return the number of devices that support CUDA
   inline int num_devices() { return _properties.size(); }
 
@@ -25,6 +25,7 @@
 
 int main(int argc, char** argv) {
   NVCDevice gpu;
+  gpu.init();
   gpu.print_all(cout);
   return 0;
 }
@@ -20,7 +20,7 @@ static __inline__ __device__ numbr cuda_zero() { return 0.0; }
 template <>
 static __inline__ __device__ float cuda_zero<float>() { return 0.0f; }
 
-#ifdef DEBUG
+#ifndef NO_DEBUG
 
 #  define CU_SAFE_CALL_NO_SYNC( call ) do { \
     CUresult err = call; \
@@ -156,25 +156,26 @@ class NVC_Host {
   /// Asynchronous copy from device (numel is not bytes)
   inline void copy_from_device(const numtyp *device_p, size_t numel,
                                cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
-                                   cudaMemcpyDeviceToHost,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
+                                           cudaMemcpyDeviceToHost,stream));
   }
 
   /// Asynchronous copy to device (numel is not bytes)
   inline void copy_to_device(numtyp *device_p, size_t numel,
                              cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
-                                   cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
+                                           cudaMemcpyHostToDevice,stream));
   }
 
   /// Asynchronous copy to 2D matrix on device (numel is not bytes)
   inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                                const size_t rows, const size_t cols,
                                cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp),
-                                     _array,cols*sizeof(numtyp),
-                                     cols*sizeof(numtyp),rows,
-                                     cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(device_p,
+                                             dev_row_size*sizeof(numtyp),
+                                             _array,cols*sizeof(numtyp),
+                                             cols*sizeof(numtyp),rows,
+                                             cudaMemcpyHostToDevice,stream));
   }
 
  private:
@@ -226,8 +227,8 @@ class NVC_Vec {
 
   /// Asynchronous copy from host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
-    { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(),
-                                     cudaMemcpyHostToDevice, stream)); }
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
+                                             cudaMemcpyHostToDevice, stream)); }
 
   /// Copy to host
   inline void copy_to_host(numtyp *host_p)
@@ -328,17 +329,17 @@ class NVC_Mat {
   /// Asynchronous copy from host (elements not bytes)
   inline void copy_from_host(const numtyp *host_p, const size_t numel,
                              cudaStream_t &stream)
-    { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
-                                     cudaMemcpyHostToDevice, stream)); }
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
+                                             cudaMemcpyHostToDevice, stream)); }
 
   /// Asynchronous Copy from Host
   /** \note Used when the number of columns/rows allocated on host smaller than
     * on device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp),
-                                     cols*sizeof(numtyp), rows,
-                                     cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(_array, _pitch, host_p,
+                                             cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
+                                             cudaMemcpyHostToDevice,stream));
   }
 
  private:
@@ -416,9 +417,10 @@ class NVC_ConstMat {
 
   /// Asynchronous Copy from Host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
-                                          numel()*sizeof(numtyp),
-                                          cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
+                                                  numel()*sizeof(numtyp),
+                                                  cudaMemcpyHostToDevice,
+                                                  stream));
   }
 
   /// Asynchronous Copy from Host
@@ -426,9 +428,9 @@
     * on device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
-                                            cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
-                                            cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
+                                                    cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
+                                                    cudaMemcpyHostToDevice,stream));
   }
 
   /// Cast buffer to numtyp in host_write and copy to array