git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3406 f3b2605a-c512-4ea7-a41b-209d697bcdaa
@@ -16,7 +16,7 @@
 BIN_DIR = .
 OBJ_DIR = .
 AR = ar
-CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math
+CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
 CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
 CUDA_PREC = -D_SINGLE_SINGLE
 CUDA_LINK = -L/usr/local/cuda/lib64 -lcudart $(CUDA_LIB)
@@ -69,6 +69,9 @@ the CUDA_PREC variable:
 
 NOTE: Double precision is only supported on certain GPUS (with
       compute capability>=1.3).
 
+NOTE: For Tesla and other graphics cards with compute capability>=1.3,
+      make sure that -arch=sm_13 is set on the CUDA_ARCH line.
+
 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
       package has been installed before installing the GPU package in LAMMPS.
 
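As a concrete illustration of the note added above, the CUDA_ARCH line from the Makefile hunk earlier in this commit would be edited to uncomment the architecture flag. This is a minimal sketch assuming the default flags shown in that Makefile; it is not part of the committed change:

    # Enable compute capability 1.3 so double-precision kernels can be generated
    CUDA_ARCH = -maxrregcount 128 -arch=sm_13
    # Precision is still selected separately via CUDA_PREC (single precision shown)
    CUDA_PREC = -D_SINGLE_SINGLE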
@@ -203,17 +203,19 @@ string gb_gpu_name(const int id, const int max_nbors) {
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
-                  const double upsilon, const double mu, double **shape,
-                  double **well, double **cutsq, double **sigma,
-                  double **epsilon, double *host_lshape, int **form,
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int max_nbors, const int thread, const int gpu_id) {
+bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
+                 const double upsilon, const double mu, double **shape,
+                 double **well, double **cutsq, double **sigma,
+                 double **epsilon, double *host_lshape, int **form,
+                 double **host_lj1, double **host_lj2, double **host_lj3,
+                 double **host_lj4, double **offset, double *special_lj,
+                 const int max_nbors, const int thread, const int gpu_id) {
   assert(thread<MAX_GPU_THREADS);
 
+  GBMF[thread].gpu.init();
+
   if (GBMF[thread].gpu.num_devices()==0)
-    return 0;
+    return false;
 
   ij_size=IJ_SIZE;
   return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
@@ -337,17 +339,20 @@ int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
 // forces, and torques for those interactions
 // ---------------------------------------------------------------------------
 template <class gbmtyp>
-void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
+void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
+                   const bool eflag) {
   gbm.nbor.time_nbor.add_to_total();
   // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
 
+  memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
   gbm.nbor.time_nbor.start();
   gbm.nbor.add(num_ij,gbm.pair_stream);
   gbm.nbor.time_nbor.stop();
 }
 
-void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
-  _gb_gpu_nbors(GBMF[thread],num_ij,eflag);
+void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
+                  const int thread) {
+  _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
 }
 
 // ---------------------------------------------------------------------------
@@ -475,7 +480,7 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
     gbm.time_gayberne2.add_to_total();
     gbm.time_pair.add_to_total();
   }
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
+  CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
 
   evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   gbm.atom.add_forces(ilist,f);
@@ -37,7 +37,7 @@ GB_GPU_MemoryT::~GB_GPU_Memory() {
 }
 
 template <class numtyp, class acctyp>
-int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
+bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           const double gamma, const double upsilon,
                           const double mu, double **host_shape,
                           double **host_well, double **host_cutsq,
@@ -50,9 +50,11 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
   if (this->allocated)
     clear();
 
-  LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
-                       host_lj1, host_lj2, host_lj3, host_lj4, host_offset,
-                       host_special_lj, max_nbors, me);
+  bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
+                              host_lj1, host_lj2, host_lj3, host_lj4,
+                              host_offset, host_special_lj, max_nbors, me);
+  if (!p)
+    return false;
 
   host_form=h_form;
 
@@ -100,7 +102,7 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
   // Memory for ilist ordered by particle type
   host_olist.safe_alloc_rw(this->max_atoms);
 
-  return this->nbor.host_ij.begin();
+  return true;
 }
 
 template <class numtyp, class acctyp>
@@ -35,7 +35,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
   GB_GPU_Memory();
   ~GB_GPU_Memory();
 
-  int* init(const int ij_size, const int ntypes, const double gamma,
+  bool init(const int ij_size, const int ntypes, const double gamma,
             const double upsilon, const double mu, double **host_shape,
             double **host_well, double **host_cutsq, double **host_sigma,
             double **host_epsilon, double *host_lshape, int **h_form,
@@ -63,12 +63,13 @@ string lj_gpu_name(const int id, const int max_nbors) {
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
-                  double **epsilon, double **host_lj1, double **host_lj2,
-                  double **host_lj3, double **host_lj4, double **offset,
-                  double *special_lj, const int max_nbors, const int gpu_id) {
+bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
+                 double **epsilon, double **host_lj1, double **host_lj2,
+                 double **host_lj3, double **host_lj4, double **offset,
+                 double *special_lj, const int max_nbors, const int gpu_id) {
+  LJMF.gpu.init();
   if (LJMF.gpu.num_devices()==0)
-    return 0;
+    return false;
 
   ij_size=IJ_SIZE;
   return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
@@ -142,17 +143,19 @@ bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
 // forces, and torques for those interactions
 // ---------------------------------------------------------------------------
 template <class LJMTyp>
-void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
+void _lj_gpu_nbors(LJMTyp &ljm, const int *ij, const int num_ij) {
   ljm.nbor.time_nbor.add_to_total();
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
 
+  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
+
+  memcpy(ljm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
   ljm.nbor.time_nbor.start();
   ljm.nbor.add(num_ij,ljm.pair_stream);
   ljm.nbor.time_nbor.stop();
 }
 
-void lj_gpu_nbors(const int num_ij) {
-  _lj_gpu_nbors(LJMF,num_ij);
+void lj_gpu_nbors(const int *ij, const int num_ij) {
+  _lj_gpu_nbors(LJMF,ij,num_ij);
 }
 
 // ---------------------------------------------------------------------------
@@ -201,7 +204,7 @@ double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
   ljm.atom.time_atom.add_to_total();
   ljm.nbor.time_nbor.add_to_total();
   ljm.time_pair.add_to_total();
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed
+  CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream));
 
   evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   ljm.atom.add_forces(ilist,f);
@@ -39,7 +39,7 @@ int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
 }
 
 template <class numtyp, class acctyp>
-int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
+bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           double **host_cutsq, double **host_sigma,
                           double **host_epsilon, double **host_lj1,
                           double **host_lj2, double **host_lj3,
@@ -50,10 +50,10 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
     clear();
 
   if (me>=gpu.num_devices())
-    return 0;
+    return false;
   gpu.set(me);
   if (gpu.revision()<1.0)
-    return 0;
+    return false;
 
   // Initialize timers for the selected GPU
   time_pair.init();
@@ -114,8 +114,7 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   dev_error.zero();
 
   allocated=true;
-
-  return nbor.host_ij.begin();
+  return true;
 }
 
 template <class numtyp, class acctyp>
@@ -40,7 +40,7 @@ class LJ_GPU_Memory {
   ~LJ_GPU_Memory() { clear(); }
 
   /// Allocate memory on host and device
-  int* init(const int ij_size, const int ntypes, double **host_cutsq,
+  bool init(const int ij_size, const int ntypes, double **host_cutsq,
             double **host_sigma, double **host_epsilon,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
@@ -28,7 +28,9 @@
 #include "nvc_device.h"
 
 // Grabs the properties for all devices
-NVCDevice::NVCDevice() {
+void NVCDevice::init() {
+  _properties.clear();
+
   CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices));
   for (int dev=0; dev<_num_devices; ++dev) {
     cudaDeviceProp deviceProp;
@@ -33,11 +33,16 @@ using namespace std;
 /// Class for looking at device properties
 /** \note Calls to change the device outside of the class results in incorrect
   * behavior
-  * \note There is no error checking for indexing past the number of devices **/
+  * \note There is no error checking for indexing past the number of devices
+  * \note init() at least once before using any of the routines **/
 class NVCDevice {
  public:
-  /// Grabs the properties for all devices
-  NVCDevice();
+  /** \note init() must be called following construction before any routines **/
+  NVCDevice() {}
+
+  /// Collect properties for every GPU on the node and set active GPU to ID 0
+  void init();
+
   /// Return the number of devices that support CUDA
   inline int num_devices() { return _properties.size(); }
 
@@ -25,6 +25,7 @@
 
 int main(int argc, char** argv) {
   NVCDevice gpu;
+  gpu.init();
   gpu.print_all(cout);
   return 0;
 }
@@ -20,7 +20,7 @@ static __inline__ __device__ numbr cuda_zero() { return 0.0; }
 template <>
 static __inline__ __device__ float cuda_zero<float>() { return 0.0f; }
 
-#ifdef DEBUG
+#ifndef NO_DEBUG
 
 #  define CU_SAFE_CALL_NO_SYNC( call ) do { \
     CUresult err = call; \
@@ -156,25 +156,26 @@ class NVC_Host {
   /// Asynchronous copy from device (numel is not bytes)
   inline void copy_from_device(const numtyp *device_p, size_t numel,
                                cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
-                                   cudaMemcpyDeviceToHost,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
+                                           cudaMemcpyDeviceToHost,stream));
   }
 
   /// Asynchronous copy to device (numel is not bytes)
   inline void copy_to_device(numtyp *device_p, size_t numel,
                              cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
-                                   cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
+                                           cudaMemcpyHostToDevice,stream));
   }
 
   /// Asynchronous copy to 2D matrix on device (numel is not bytes)
   inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                                const size_t rows, const size_t cols,
                                cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp),
-                                     _array,cols*sizeof(numtyp),
-                                     cols*sizeof(numtyp),rows,
-                                     cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(device_p,
+                                             dev_row_size*sizeof(numtyp),
+                                             _array,cols*sizeof(numtyp),
+                                             cols*sizeof(numtyp),rows,
+                                             cudaMemcpyHostToDevice,stream));
   }
 
  private:
@@ -226,8 +227,8 @@ class NVC_Vec {
 
   /// Asynchronous copy from host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
-    { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(),
-                                     cudaMemcpyHostToDevice, stream)); }
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
+                                             cudaMemcpyHostToDevice, stream)); }
 
   /// Copy to host
   inline void copy_to_host(numtyp *host_p)
@@ -328,17 +329,17 @@ class NVC_Mat {
   /// Asynchronous copy from host (elements not bytes)
   inline void copy_from_host(const numtyp *host_p, const size_t numel,
                              cudaStream_t &stream)
-    { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
-                                     cudaMemcpyHostToDevice, stream)); }
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
+                                             cudaMemcpyHostToDevice, stream)); }
 
   /// Asynchronous Copy from Host
   /** \note Used when the number of columns/rows allocated on host smaller than
     * on device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp),
-                                     cols*sizeof(numtyp), rows,
-                                     cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(_array, _pitch, host_p,
+                                             cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
+                                             cudaMemcpyHostToDevice,stream));
   }
 
  private:
@@ -416,9 +417,10 @@ class NVC_ConstMat {
 
   /// Asynchronous Copy from Host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
-                                          numel()*sizeof(numtyp),
-                                          cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
+                                                  numel()*sizeof(numtyp),
+                                                  cudaMemcpyHostToDevice,
+                                                  stream));
   }
 
   /// Asynchronous Copy from Host
@@ -426,9 +428,9 @@
     * on device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
-                                            cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
-                                            cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
+                                                    cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
+                                                    cudaMemcpyHostToDevice,stream));
   }
 
   /// Cast buffer to numtyp in host_write and copy to array