diff --git a/lib/gpu/Makefile.nvidia b/lib/gpu/Makefile.nvidia
index 7efb430e5d..a1defe2254 100644
--- a/lib/gpu/Makefile.nvidia
+++ b/lib/gpu/Makefile.nvidia
@@ -16,7 +16,7 @@
 BIN_DIR = .
 OBJ_DIR = .
 AR = ar
-CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math
+CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
 CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
 CUDA_PREC = -D_SINGLE_SINGLE
 CUDA_LINK = -L/usr/local/cuda/lib64 -lcudart $(CUDA_LIB)
diff --git a/lib/gpu/README b/lib/gpu/README
index ed95c3eeab..62af16e4cb 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -69,6 +69,9 @@ the CUDA_PREC variable:
 NOTE: Double precision is only supported on certain GPUS (with
       compute capability>=1.3).
 
+NOTE: For Tesla and other graphics cards with compute capability>=1.3,
+      make sure that -arch=sm_13 is set on the CUDA_ARCH line.
+
 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
       package has been installed before installing the GPU package in
       LAMMPS.
diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu
index 260edd9e6f..9bfb53a497 100644
--- a/lib/gpu/gb_gpu.cu
+++ b/lib/gpu/gb_gpu.cu
@@ -203,17 +203,19 @@ string gb_gpu_name(const int id, const int max_nbors) {
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
-                  const double upsilon, const double mu, double **shape,
-                  double **well, double **cutsq, double **sigma,
-                  double **epsilon, double *host_lshape, int **form,
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int max_nbors, const int thread, const int gpu_id) {
+bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
+                 const double upsilon, const double mu, double **shape,
+                 double **well, double **cutsq, double **sigma,
+                 double **epsilon, double *host_lshape, int **form,
+                 double **host_lj1, double **host_lj2, double **host_lj3,
+                 double **host_lj4, double **offset, double *special_lj,
+                 const int max_nbors, const int thread, const int gpu_id) {
   assert(thread
-void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
+void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
+                   const bool eflag) {
   gbm.nbor.time_nbor.add_to_total();
   // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
+
+  memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
   gbm.nbor.time_nbor.start();
   gbm.nbor.add(num_ij,gbm.pair_stream);
   gbm.nbor.time_nbor.stop();
 }
 
-void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
-  _gb_gpu_nbors(GBMF[thread],num_ij,eflag);
+void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
+                  const int thread) {
+  _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
 }
 
 // ---------------------------------------------------------------------------
@@ -475,7 +480,7 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
     gbm.time_gayberne2.add_to_total();
     gbm.time_pair.add_to_total();
   }
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
+  CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
   evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   gbm.atom.add_forces(ilist,f);
diff --git a/lib/gpu/gb_gpu_memory.cu b/lib/gpu/gb_gpu_memory.cu
index aab5ea55a5..de24284751 100644
--- a/lib/gpu/gb_gpu_memory.cu
+++ b/lib/gpu/gb_gpu_memory.cu
@@ -37,7 +37,7 @@ GB_GPU_MemoryT::~GB_GPU_Memory() {
 }
 
 template
-int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
+bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           const double gamma, const double upsilon,
                           const double mu, double **host_shape,
                           double **host_well, double **host_cutsq,
@@ -50,9 +50,11 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
   if (this->allocated)
     clear();
 
-  LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
-                       host_lj1, host_lj2, host_lj3, host_lj4, host_offset,
-                       host_special_lj, max_nbors, me);
+  bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
+                              host_lj1, host_lj2, host_lj3, host_lj4,
+                              host_offset, host_special_lj, max_nbors, me);
+  if (!p)
+    return false;
 
   host_form=h_form;
 
@@ -100,7 +102,7 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
   // Memory for ilist ordered by particle type
   host_olist.safe_alloc_rw(this->max_atoms);
 
-  return this->nbor.host_ij.begin();
+  return true;
 }
 
 template
diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h
index 4d607ed14e..eb9fb92a1c 100644
--- a/lib/gpu/gb_gpu_memory.h
+++ b/lib/gpu/gb_gpu_memory.h
@@ -35,7 +35,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory {
   GB_GPU_Memory();
   ~GB_GPU_Memory();
 
-  int* init(const int ij_size, const int ntypes, const double gamma,
+  bool init(const int ij_size, const int ntypes, const double gamma,
             const double upsilon, const double mu, double **host_shape,
             double **host_well, double **host_cutsq, double **host_sigma,
             double **host_epsilon, double *host_lshape, int **h_form,
diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu
index 94d826e149..ffc67eb068 100644
--- a/lib/gpu/lj_gpu.cu
+++ b/lib/gpu/lj_gpu.cu
@@ -63,12 +63,13 @@ string lj_gpu_name(const int id, const int max_nbors) {
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
-                  double **epsilon, double **host_lj1, double **host_lj2,
-                  double **host_lj3, double **host_lj4, double **offset,
-                  double *special_lj, const int max_nbors, const int gpu_id) {
+bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
+                 double **epsilon, double **host_lj1, double **host_lj2,
+                 double **host_lj3, double **host_lj4, double **offset,
+                 double *special_lj, const int max_nbors, const int gpu_id) {
+  LJMF.gpu.init();
   if (LJMF.gpu.num_devices()==0)
-    return 0;
+    return false;
 
   ij_size=IJ_SIZE;
   return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
@@ -142,17 +143,19 @@ bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
 // forces, and torques for those interactions
 // ---------------------------------------------------------------------------
 template
-void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
+void _lj_gpu_nbors(LJMTyp &ljm, const int *ij, const int num_ij) {
   ljm.nbor.time_nbor.add_to_total();
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
+  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
+
+  memcpy(ljm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
   ljm.nbor.time_nbor.start();
   ljm.nbor.add(num_ij,ljm.pair_stream);
   ljm.nbor.time_nbor.stop();
 }
 
-void lj_gpu_nbors(const int num_ij) {
-  _lj_gpu_nbors(LJMF,num_ij);
+void lj_gpu_nbors(const int *ij, const int num_ij) {
+  _lj_gpu_nbors(LJMF,ij,num_ij);
 }
 
 // ---------------------------------------------------------------------------
@@ -201,7 +204,7 @@ double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
   ljm.atom.time_atom.add_to_total();
   ljm.nbor.time_nbor.add_to_total();
   ljm.time_pair.add_to_total();
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed
+  CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream));
   evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   ljm.atom.add_forces(ilist,f);
diff --git a/lib/gpu/lj_gpu_memory.cu b/lib/gpu/lj_gpu_memory.cu
index 5e8bf4cdc6..a5000a7eaf 100644
--- a/lib/gpu/lj_gpu_memory.cu
+++ b/lib/gpu/lj_gpu_memory.cu
@@ -39,7 +39,7 @@ int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
 }
 
 template
-int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
+bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           double **host_cutsq, double **host_sigma,
                           double **host_epsilon, double **host_lj1,
                           double **host_lj2, double **host_lj3,
@@ -50,10 +50,10 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   clear();
 
   if (me>=gpu.num_devices())
-    return 0;
+    return false;
   gpu.set(me);
   if (gpu.revision()<1.0)
-    return 0;
+    return false;
 
   // Initialize timers for the selected GPU
   time_pair.init();
@@ -114,8 +114,7 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   dev_error.zero();
 
   allocated=true;
-
-  return nbor.host_ij.begin();
+  return true;
 }
 
 template
diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h
index 82a87ea6a8..9052ad4956 100644
--- a/lib/gpu/lj_gpu_memory.h
+++ b/lib/gpu/lj_gpu_memory.h
@@ -40,7 +40,7 @@ class LJ_GPU_Memory {
   ~LJ_GPU_Memory() { clear(); }
 
   /// Allocate memory on host and device
-  int* init(const int ij_size, const int ntypes, double **host_cutsq,
+  bool init(const int ij_size, const int ntypes, double **host_cutsq,
             double **host_sigma, double **host_epsilon, double **host_lj1,
             double **host_lj2, double **host_lj3, double **host_lj4,
             double **host_offset, double *host_special_lj,
diff --git a/lib/gpu/nvc_device.cu b/lib/gpu/nvc_device.cu
index 01a2db1bd7..2f25d61af2 100644
--- a/lib/gpu/nvc_device.cu
+++ b/lib/gpu/nvc_device.cu
@@ -28,7 +28,9 @@
 #include "nvc_device.h"
 
 // Grabs the properties for all devices
-NVCDevice::NVCDevice() {
+void NVCDevice::init() {
+  _properties.clear();
+
   CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices));
   for (int dev=0; dev<_num_devices; ++dev) {
     cudaDeviceProp deviceProp;
diff --git a/lib/gpu/nvc_device.h b/lib/gpu/nvc_device.h
index 1ba5f2bc4c..4286c6022f 100644
--- a/lib/gpu/nvc_device.h
+++ b/lib/gpu/nvc_device.h
@@ -33,11 +33,16 @@ using namespace std;
 /// Class for looking at device properties
 /** \note Calls to change the device outside of the class results in incorrect
   *       behavior
-  * \note There is no error checking for indexing past the number of devices **/
+  * \note There is no error checking for indexing past the number of devices
+  * \note init() at least once before using any of the routines **/
 class NVCDevice {
  public:
   /// Grabs the properties for all devices
-  NVCDevice();
+  /** \note init() must be called following construction before any routines **/
+  NVCDevice() {}
+
+  /// Collect properties for every GPU on the node and set active GPU to ID 0
+  void init();
 
   /// Return the number of devices that support CUDA
   inline int num_devices() { return _properties.size(); }
diff --git a/lib/gpu/nvc_get_devices.cu b/lib/gpu/nvc_get_devices.cu
index a1d35232e6..b85626f3f7 100644
--- a/lib/gpu/nvc_get_devices.cu
+++ b/lib/gpu/nvc_get_devices.cu
@@ -25,6 +25,7 @@
 int main(int argc, char** argv) {
   NVCDevice gpu;
+  gpu.init();
   gpu.print_all(cout);
   return 0;
 }
diff --git a/lib/gpu/nvc_macros.h b/lib/gpu/nvc_macros.h
index 56bb5c64ce..4aaa167af5 100644
--- a/lib/gpu/nvc_macros.h
+++ b/lib/gpu/nvc_macros.h
@@ -20,7 +20,7 @@ static __inline__ __device__ numbr cuda_zero() { return 0.0; }
 template <>
 static __inline__ __device__ float cuda_zero() { return 0.0f; }
 
-#ifdef DEBUG
+#ifndef NO_DEBUG
 
 # define CU_SAFE_CALL_NO_SYNC( call ) do {                                   \
     CUresult err = call;                                                     \
diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h
index 0ff5229c05..5aaeffd2f0 100644
--- a/lib/gpu/nvc_memory.h
+++ b/lib/gpu/nvc_memory.h
@@ -156,25 +156,26 @@ class NVC_Host {
   /// Asynchronous copy from device (numel is not bytes)
   inline void copy_from_device(const numtyp *device_p, size_t numel,
                                cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
-                                   cudaMemcpyDeviceToHost,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
+                                           cudaMemcpyDeviceToHost,stream));
   }
 
   /// Asynchronous copy to device (numel is not bytes)
   inline void copy_to_device(numtyp *device_p, size_t numel,
                              cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
-                                   cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
+                                           cudaMemcpyHostToDevice,stream));
   }
 
   /// Asynchronous copy to 2D matrix on device (numel is not bytes)
   inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                                const size_t rows, const size_t cols,
                                cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp),
-                                     _array,cols*sizeof(numtyp),
-                                     cols*sizeof(numtyp),rows,
-                                     cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(device_p,
+                                             dev_row_size*sizeof(numtyp),
+                                             _array,cols*sizeof(numtyp),
+                                             cols*sizeof(numtyp),rows,
+                                             cudaMemcpyHostToDevice,stream));
   }
 
  private:
@@ -226,8 +227,8 @@ class NVC_Vec {
   /// Asynchronous copy from host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
-    { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(),
-                                     cudaMemcpyHostToDevice, stream)); }
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
+                                             cudaMemcpyHostToDevice, stream)); }
 
   /// Copy to host
   inline void copy_to_host(numtyp *host_p)
@@ -328,17 +329,17 @@ class NVC_Mat {
   /// Asynchronous copy from host (elements not bytes)
   inline void copy_from_host(const numtyp *host_p, const size_t numel,
                              cudaStream_t &stream)
-    { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
-                                     cudaMemcpyHostToDevice, stream)); }
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
+                                             cudaMemcpyHostToDevice, stream)); }
 
   /// Asynchronous Copy from Host
   /** \note Used when the number of columns/rows allocated on host smaller than
     *       on device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp),
-                                     cols*sizeof(numtyp), rows,
-                                     cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(_array, _pitch, host_p,
+                                 cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
+                                 cudaMemcpyHostToDevice,stream));
   }
 
  private:
@@ -416,9 +417,10 @@ class NVC_ConstMat {
   /// Asynchronous Copy from Host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
-                                          numel()*sizeof(numtyp),
-                                          cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
+                                                  numel()*sizeof(numtyp),
+                                                  cudaMemcpyHostToDevice,
+                                                  stream));
   }
 
   /// Asynchronous Copy from Host
@@ -426,9 +428,9 @@ class NVC_ConstMat {
     *       on device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
-                      cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
-                      cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
+                      cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
+                      cudaMemcpyHostToDevice,stream));
  }
 
   /// Cast buffer to numtyp in host_write and copy to array
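What the patch means for callers of the library: lj_gpu_init() and gb_gpu_init() now report
success with a bool instead of returning a pointer into the library's pinned host_ij buffer,
lj_gpu_nbors() and gb_gpu_nbors() take the caller's ij array and copy it into that pinned
buffer themselves (the new memcpy in _lj_gpu_nbors/_gb_gpu_nbors), and an NVCDevice must
have init() called after construction, as nvc_get_devices.cu now does. Below is a minimal
sketch of a hypothetical caller written against the post-patch prototypes from lj_gpu.cu;
the coefficient tables, the chunked neighbor loop, and gpu_setup_and_push() itself are
invented for illustration and are not part of the patch or of LAMMPS.

#include <algorithm>
#include <cstddef>
#include <vector>

// Post-patch prototypes, mirroring lib/gpu/lj_gpu.cu.
bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq, double **sigma,
                 double **epsilon, double **host_lj1, double **host_lj2,
                 double **host_lj3, double **host_lj4, double **offset,
                 double *special_lj, const int max_nbors, const int gpu_id);
void lj_gpu_nbors(const int *ij, const int num_ij);

// coeff[0..7] = cutsq, sigma, epsilon, lj1, lj2, lj3, lj4, offset
// (each an ntypes x ntypes table); ij_list holds packed neighbor indices.
bool gpu_setup_and_push(double **coeff[8], double *special_lj, int ntypes,
                        int max_nbors, int gpu_id,
                        const std::vector<int> &ij_list) {
  int ij_size = 0;  // set by lj_gpu_init: max ij entries per transfer
  // Before the patch this call returned int* (0 on failure); now a plain
  // bool says whether a usable CUDA device was found and initialized.
  if (!lj_gpu_init(ij_size, ntypes, coeff[0], coeff[1], coeff[2], coeff[3],
                   coeff[4], coeff[5], coeff[6], coeff[7], special_lj,
                   max_nbors, gpu_id))
    return false;   // caller would fall back to the CPU pair style

  // The ij data now stays in a caller-owned buffer; each lj_gpu_nbors call
  // memcpys a chunk into the library's pinned host_ij storage and hands it
  // to nbor.add() on the pair stream.
  for (std::size_t i = 0; i < ij_list.size(); i += ij_size)
    lj_gpu_nbors(ij_list.data() + i,
                 static_cast<int>(std::min<std::size_t>(ij_size,
                                                        ij_list.size() - i)));
  return true;
}

The gayberne path is driven the same way; gb_gpu_nbors() simply gains the same leading
const int *ij argument ahead of its existing eflag and thread parameters.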