git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3406 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2009-11-16 21:42:57 +00:00
parent 5f7e6aa45c
commit 12c3a43919
13 changed files with 82 additions and 60 deletions

View File

@@ -16,7 +16,7 @@
BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math
CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/usr/local/cuda/lib64 -lcudart $(CUDA_LIB)

View File

@@ -69,6 +69,9 @@ the CUDA_PREC variable:
NOTE: Double precision is only supported on certain GPUs (with
compute capability >= 1.3).
NOTE: For Tesla and other graphics cards with compute capability >= 1.3,
make sure that -arch=sm_13 is set on the CUDA_ARCH line.
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
package has been installed before installing the GPU package in LAMMPS.
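
For reference, a minimal sketch (not part of this commit) of how the
CUDA_ARCH and CUDA_PREC lines from the Makefile hunk above might look when
targeting a card with compute capability >= 1.3; the exact flags for your
hardware and install may differ:

CUDA_ARCH = -maxrregcount 128 -arch=sm_13   # enable sm_13 as the NOTE above requires
CUDA_PREC = -D_SINGLE_SINGLE                # precision flag described by the CUDA_PREC notes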

View File

@@ -203,7 +203,7 @@ string gb_gpu_name(const int id, const int max_nbors) {
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
@@ -212,8 +212,10 @@ int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
const int max_nbors, const int thread, const int gpu_id) {
assert(thread<MAX_GPU_THREADS);
GBMF[thread].gpu.init();
if (GBMF[thread].gpu.num_devices()==0)
return 0;
return false;
ij_size=IJ_SIZE;
return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
@@ -337,17 +339,20 @@ int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
const bool eflag) {
gbm.nbor.time_nbor.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
gbm.nbor.time_nbor.start();
gbm.nbor.add(num_ij,gbm.pair_stream);
gbm.nbor.time_nbor.stop();
}
void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
_gb_gpu_nbors(GBMF[thread],num_ij,eflag);
void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
const int thread) {
_gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
}
// ---------------------------------------------------------------------------
@@ -475,7 +480,7 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
gbm.time_gayberne2.add_to_total();
gbm.time_pair.add_to_total();
}
// CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
gbm.atom.add_forces(ilist,f);

View File

@@ -37,7 +37,7 @@ GB_GPU_MemoryT::~GB_GPU_Memory() {
}
template <class numtyp, class acctyp>
int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
const double gamma, const double upsilon,
const double mu, double **host_shape,
double **host_well, double **host_cutsq,
@@ -50,9 +50,11 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
if (this->allocated)
clear();
LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
host_lj1, host_lj2, host_lj3, host_lj4, host_offset,
host_special_lj, max_nbors, me);
bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
host_lj1, host_lj2, host_lj3, host_lj4,
host_offset, host_special_lj, max_nbors, me);
if (!p)
return false;
host_form=h_form;
@@ -100,7 +102,7 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
// Memory for ilist ordered by particle type
host_olist.safe_alloc_rw(this->max_atoms);
return this->nbor.host_ij.begin();
return true;
}
template <class numtyp, class acctyp>

View File

@@ -35,7 +35,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
GB_GPU_Memory();
~GB_GPU_Memory();
int* init(const int ij_size, const int ntypes, const double gamma,
bool init(const int ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,

View File

@@ -63,12 +63,13 @@ string lj_gpu_name(const int id, const int max_nbors) {
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, const int max_nbors, const int gpu_id) {
LJMF.gpu.init();
if (LJMF.gpu.num_devices()==0)
return 0;
return false;
ij_size=IJ_SIZE;
return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
@@ -142,17 +143,19 @@ bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
template <class LJMTyp>
void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
void _lj_gpu_nbors(LJMTyp &ljm, const int *ij, const int num_ij) {
ljm.nbor.time_nbor.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
memcpy(ljm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
ljm.nbor.time_nbor.start();
ljm.nbor.add(num_ij,ljm.pair_stream);
ljm.nbor.time_nbor.stop();
}
void lj_gpu_nbors(const int num_ij) {
_lj_gpu_nbors(LJMF,num_ij);
void lj_gpu_nbors(const int *ij, const int num_ij) {
_lj_gpu_nbors(LJMF,ij,num_ij);
}
// ---------------------------------------------------------------------------
@@ -201,7 +204,7 @@ double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
ljm.atom.time_atom.add_to_total();
ljm.nbor.time_nbor.add_to_total();
ljm.time_pair.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed
CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream));
evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
ljm.atom.add_forces(ilist,f);

View File

@@ -39,7 +39,7 @@ int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
}
template <class numtyp, class acctyp>
int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double **host_lj1,
double **host_lj2, double **host_lj3,
@@ -50,10 +50,10 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
clear();
if (me>=gpu.num_devices())
return 0;
return false;
gpu.set(me);
if (gpu.revision()<1.0)
return 0;
return false;
// Initialize timers for the selected GPU
time_pair.init();
@@ -114,8 +114,7 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
dev_error.zero();
allocated=true;
return nbor.host_ij.begin();
return true;
}
template <class numtyp, class acctyp>

View File

@@ -40,7 +40,7 @@ class LJ_GPU_Memory {
~LJ_GPU_Memory() { clear(); }
/// Allocate memory on host and device
int* init(const int ij_size, const int ntypes, double **host_cutsq,
bool init(const int ij_size, const int ntypes, double **host_cutsq,
double **host_sigma, double **host_epsilon,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,

View File

@@ -28,7 +28,9 @@
#include "nvc_device.h"
// Grabs the properties for all devices
NVCDevice::NVCDevice() {
void NVCDevice::init() {
_properties.clear();
CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices));
for (int dev=0; dev<_num_devices; ++dev) {
cudaDeviceProp deviceProp;

View File

@@ -33,11 +33,16 @@ using namespace std;
/// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect
* behavior
* \note There is no error checking for indexing past the number of devices **/
* \note There is no error checking for indexing past the number of devices
* \note Call init() at least once before using any of the routines **/
class NVCDevice {
public:
/// Grabs the properties for all devices
NVCDevice();
/** \note init() must be called following construction before any routines **/
NVCDevice() {}
/// Collect properties for every GPU on the node and set active GPU to ID 0
void init();
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
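
A minimal sketch (not part of this commit) of the two-phase setup this header
now requires; the class and routine names are taken from the hunk above, and
the pattern matches the main() change later in this commit:

#include "nvc_device.h"

int main() {
  NVCDevice gpu;               // constructor no longer queries the GPUs
  gpu.init();                  // must run before any other NVCDevice routine
  if (gpu.num_devices() == 0)  // init() filled the per-device property list
    return 1;                  // no CUDA-capable device found
  return 0;
}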

View File

@@ -25,6 +25,7 @@
int main(int argc, char** argv) {
NVCDevice gpu;
gpu.init();
gpu.print_all(cout);
return 0;
}

View File

@@ -20,7 +20,7 @@ static __inline__ __device__ numbr cuda_zero() { return 0.0; }
template <>
static __inline__ __device__ float cuda_zero<float>() { return 0.0f; }
#ifdef DEBUG
#ifndef NO_DEBUG
# define CU_SAFE_CALL_NO_SYNC( call ) do { \
CUresult err = call; \

View File

@@ -156,14 +156,14 @@ class NVC_Host {
/// Asynchronous copy from device (numel is not bytes)
inline void copy_from_device(const numtyp *device_p, size_t numel,
cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
cudaMemcpyDeviceToHost,stream));
}
/// Asynchronous copy to device (numel is not bytes)
inline void copy_to_device(numtyp *device_p, size_t numel,
cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
cudaMemcpyHostToDevice,stream));
}
@@ -171,7 +171,8 @@ class NVC_Host {
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
const size_t rows, const size_t cols,
cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp),
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(device_p,
dev_row_size*sizeof(numtyp),
_array,cols*sizeof(numtyp),
cols*sizeof(numtyp),rows,
cudaMemcpyHostToDevice,stream));
@@ -226,7 +227,7 @@ class NVC_Vec {
/// Asynchronous copy from host
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
{ CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(),
{ CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
cudaMemcpyHostToDevice, stream)); }
/// Copy to host
@@ -328,7 +329,7 @@ class NVC_Mat {
/// Asynchronous copy from host (elements not bytes)
inline void copy_from_host(const numtyp *host_p, const size_t numel,
cudaStream_t &stream)
{ CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
{ CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
cudaMemcpyHostToDevice, stream)); }
/// Asynchronous Copy from Host
@@ -336,8 +337,8 @@ class NVC_Mat {
* on device **/
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
const size_t cols, cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp),
cols*sizeof(numtyp), rows,
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(_array, _pitch, host_p,
cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
cudaMemcpyHostToDevice,stream));
}
@@ -416,9 +417,10 @@ class NVC_ConstMat {
/// Asynchronous Copy from Host
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
numel()*sizeof(numtyp),
cudaMemcpyHostToDevice,stream));
cudaMemcpyHostToDevice,
stream));
}
/// Asynchronous Copy from Host
@@ -426,7 +428,7 @@ class NVC_ConstMat {
* on device **/
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
const size_t cols, cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
cudaMemcpyHostToDevice,stream));
}