git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-05-02 15:02:52 +00:00
parent 2be078632d
commit 5f799182b3
70 changed files with 4489 additions and 2253 deletions
--- a/lib/gpu/pair_gpu_atom.h
+++ b/lib/gpu/pair_gpu_atom.h
@ -23,7 +23,6 @@

 #ifdef USE_OPENCL

-#include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
 #include "geryon/ocl_kernel.h"
@ -32,7 +31,6 @@ using namespace ucl_opencl;
 #else

 #include "cudpp.h"
-#include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 #include "geryon/nvd_kernel.h"
@ -40,10 +38,6 @@ using namespace ucl_cudadr;

 #endif

-#ifndef int2
-struct int2 { int x; int y; };
-#endif
-
 #include "pair_gpu_precision.h"

 template <class numtyp, class acctyp>
@ -56,13 +50,9 @@ class PairGPUAtom {
  inline int max_atoms() const { return _max_atoms; }
  /// Current number of local+ghost atoms stored
  inline int nall() const { return _nall; }
-  /// Current number of local atoms stored
-  inline int inum() const { return _inum; }

  /// Set number of local+ghost atoms for future copy operations
  inline void nall(const int n) { _nall=n; }
-  /// Set number of local atoms for future copy operations
-  inline void inum(const int n) { _inum=n; }
  
  /// Memory usage per atom in this class
  int bytes_per_atom() const; 
@ -70,21 +60,33 @@ class PairGPUAtom {
  /// Clear any previous data and set up for a new LAMMPS run
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor True if neighboring will be performed on device **/
-  bool init(const int inum, const int nall, const bool charge, const bool rot, 
+  bool init(const int nall, const bool charge, const bool rot, 
            UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
  
  /// Check if we have enough device storage and realloc if not
-  inline bool resize(const int inum, const int nall, bool &success) {
-    _inum=inum;
+  /** Returns true if resized with any call during this timestep **/
+  inline bool resize(const int nall, bool &success) {
    _nall=nall;
-    if (inum>_max_local || nall>_max_atoms) {
+    if (nall>_max_atoms) {
      clear_resize();
-      success = success && alloc(inum,nall);
-      return true;
+      success = success && alloc(nall);
+      _resized=true;
    }
-    return false;
+    return _resized;
  }
-
+  
+  /// If already initialized by another LAMMPS style, add fields as necessary
+  /** \param rot True if atom storage needs quaternions
+    * \param gpu_nbor True if neighboring will be performed on device **/
+  bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
+                  const bool bonds);
+  
+  /// Returns true if GPU is using charges
+  bool charge() { return _charge; }
+  
+  /// Returns true if GPU is using quaternions
+  bool quat() { return _rot; }
+  
  /// Only free matrices of length inum or nall for resizing
  void clear_resize();
  
@ -100,28 +102,42 @@ class PairGPUAtom {
  /// Add copy times to timers
  inline void acc_timers() {
    time_pos.add_to_total();
-    time_answer.add_to_total();
-    if (_other)
-      time_other.add_to_total();
+    if (_charge)
+      time_q.add_to_total();
+    if (_rot)
+      time_quat.add_to_total();
  }

  /// Add copy times to timers
  inline void zero_timers() {
    time_pos.zero();
-    time_answer.zero();
-    if (_other)
-      time_other.zero();
+    if (_charge)
+      time_q.zero();
+    if (_rot)
+      time_quat.zero();
  }

  /// Return the total time for host/device data transfer
+  /** Zeros the total so that the atom times are only included once **/
  inline double transfer_time() {
-    double total=time_pos.total_seconds()+time_answer.total_seconds();
-    if (_other) total+=time_other.total_seconds();
+    double total=time_pos.total_seconds();
+    time_pos.zero_total();
+    if (_charge) {
+      total+=time_q.total_seconds();
+      time_q.zero_total();
+    }
+    if (_rot) {
+      total+=time_q.total_seconds();
+      time_quat.zero_total();
+    }
+    
    return total;
  }
  
  /// Return the total time for data cast/pack
-  inline double cast_time() { return _time_cast; }
+  /** Zeros the time so that atom times are only included once **/
+  inline double cast_time() 
+    { double t=_time_cast; _time_cast=0.0; return t; }

  /// Pack LAMMPS atom type constants into matrix and copy to device
  template <class dev_typ, class t1>
@ -216,43 +232,52 @@ class PairGPUAtom {

  // -------------------------COPY TO GPU ----------------------------------

+  /// Signal that we need to transfer atom data for next timestep
+  inline void data_unavail()
+    { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
+
  /// Cast positions and types to write buffer
  inline void cast_x_data(double **host_ptr, const int *host_type) {
-    double t=MPI_Wtime();
-    #ifdef GPU_CAST
-    memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
-    memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
-    #else
-    numtyp *_write_loc=host_x.begin();
-    for (int i=0; i<_nall; i++) {
-      *_write_loc=host_ptr[i][0];
-      _write_loc++;
-      *_write_loc=host_ptr[i][1];
-      _write_loc++;
-      *_write_loc=host_ptr[i][2];
-      _write_loc++;
-      *_write_loc=host_type[i];
-      _write_loc++;
+    if (_x_avail==false) {
+      double t=MPI_Wtime();
+      #ifdef GPU_CAST
+      memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
+      memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
+      #else
+      numtyp *_write_loc=host_x.begin();
+      for (int i=0; i<_nall; i++) {
+        *_write_loc=host_ptr[i][0];
+        _write_loc++;
+        *_write_loc=host_ptr[i][1];
+        _write_loc++;
+        *_write_loc=host_ptr[i][2];
+        _write_loc++;
+        *_write_loc=host_type[i];
+        _write_loc++;
+      }
+      #endif
+      _time_cast+=MPI_Wtime()-t;
    }
-    #endif
-    _time_cast+=MPI_Wtime()-t;
-  }      
+  }

  /// Copy positions and types to device asynchronously
  /** Copies nall() elements **/
  inline void add_x_data(double **host_ptr, int *host_type) { 
    time_pos.start();
-    #ifdef GPU_CAST
-    ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
-    ucl_copy(dev_type_cast,host_type_cast,_nall,true);
-    int block_size=64;
-    int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
-    k_cast_x.set_size(GX,block_size);
-    k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), 
-                 &_nall);
-    #else
-    ucl_copy(dev_x,host_x,_nall*4,true);
-    #endif
+    if (_x_avail==false) {
+      #ifdef GPU_CAST
+      ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
+      ucl_copy(dev_type_cast,host_type_cast,_nall,true);
+      int block_size=64;
+      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
+      k_cast_x.set_size(GX,block_size);
+      k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), 
+                   &_nall);
+      #else
+      ucl_copy(dev_x,host_x,_nall*4,true);
+      #endif
+      _x_avail=true;
+    }
    time_pos.stop();
  }

@ -262,87 +287,68 @@ class PairGPUAtom {
    add_x_data(host_ptr,host_type);
  }

-  /// Cast charges to write buffer
+  // Cast charges to write buffer
  template<class cpytyp>
  inline void cast_q_data(cpytyp *host_ptr) {
-    double t=MPI_Wtime();
-    if (dev->device_type()==UCL_CPU) {
-      if (sizeof(numtyp)==sizeof(double)) {
-        host_q.view((numtyp*)host_ptr,_nall,*dev);
-        dev_q.view(host_q);
-      } else
-        for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
-    } else {
-      if (sizeof(numtyp)==sizeof(double))
-        memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
-      else
-        for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
+    if (_q_avail==false) {
+      double t=MPI_Wtime();
+      if (dev->device_type()==UCL_CPU) {
+        if (sizeof(numtyp)==sizeof(double)) {
+          host_q.view((numtyp*)host_ptr,_nall,*dev);
+          dev_q.view(host_q);
+        } else
+          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
+      } else {
+        if (sizeof(numtyp)==sizeof(double))
+          memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
+        else
+          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
+      }
+      _time_cast+=MPI_Wtime()-t;
    }
-    _time_cast+=MPI_Wtime()-t;
  }

-  /// Copy charges to device asynchronously
+  // Copy charges to device asynchronously
  inline void add_q_data() {
-    ucl_copy(dev_q,host_q,_nall,true);
+    if (_q_avail==false) {
+      ucl_copy(dev_q,host_q,_nall,true);
+      _q_avail=true;
+    }
  }

-  /// Cast quaternions to write buffer
+  // Cast quaternions to write buffer
  template<class cpytyp>
  inline void cast_quat_data(cpytyp *host_ptr) {
-    double t=MPI_Wtime();
-    if (dev->device_type()==UCL_CPU) {
-      if (sizeof(numtyp)==sizeof(double)) {
-        host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
-        dev_quat.view(host_quat);
-      } else
-        for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
-    } else {
-      if (sizeof(numtyp)==sizeof(double))
-        memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
-      else
-        for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
+    if (_quat_avail==false) {
+      double t=MPI_Wtime();
+      if (dev->device_type()==UCL_CPU) {
+        if (sizeof(numtyp)==sizeof(double)) {
+          host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
+          dev_quat.view(host_quat);
+        } else
+          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
+      } else {
+        if (sizeof(numtyp)==sizeof(double))
+          memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
+        else
+          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
+      }
+      _time_cast+=MPI_Wtime()-t;
    }
-    _time_cast+=MPI_Wtime()-t;
  }

-  /// Copy quaternions to device
+  // Copy quaternions to device
  /** Copies nall()*4 elements **/
  inline void add_quat_data() {
-    ucl_copy(dev_quat,host_quat,_nall*4,true);
+    if (_quat_avail==false) {
+      ucl_copy(dev_quat,host_quat,_nall*4,true);
+      _quat_avail=true;
+    }
  }

-  /// Copy data other than pos and data to device
-  inline void add_other_data() {
-    time_other.start();
-    if (_charge)
-      add_q_data();
-    if (_rot)
-      add_quat_data();
-    time_other.stop();
-  }
-  
  /// Return number of bytes used on device
-  inline double gpu_bytes() { return _gpu_bytes; } 
-
-  // -------------------------COPY FROM GPU -------------------------------
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom);
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom, int *ilist);
-  
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial,
-                       double &ecoul);
-
-  /// Add forces and torques from the GPU into a LAMMPS pointer
-  void get_answers(double **f, double **tor);
+  inline double max_gpu_bytes() 
+    { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } 

  // ------------------------------ DATA ----------------------------------

@ -352,10 +358,6 @@ class PairGPUAtom {
  UCL_D_Vec<numtyp> dev_q;
  /// Quaterions
  UCL_D_Vec<numtyp> dev_quat;
-  /// Force and possibly torque
-  UCL_D_Vec<acctyp> dev_ans;
-  /// Energy and virial per-atom storage
-  UCL_D_Vec<acctyp> dev_engv;
  
  #ifdef GPU_CAST
  UCL_D_Vec<double> dev_x_cast;
@ -370,10 +372,6 @@ class PairGPUAtom {
  UCL_H_Vec<numtyp> host_q;
  /// Buffer for moving quat data to GPU
  UCL_H_Vec<numtyp> host_quat;
-  /// Force and possibly torque data on host
-  UCL_H_Vec<acctyp> host_ans;
-  /// Energy/virial data on host
-  UCL_H_Vec<acctyp> host_engv;
  
  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<unsigned> dev_cell_id;
@ -383,7 +381,7 @@ class PairGPUAtom {
  UCL_D_Vec<int> dev_tag;

  /// Device timers
-  UCL_Timer time_pos, time_other, time_answer;
+  UCL_Timer time_pos, time_q, time_quat;
  
  /// Geryon device
  UCL_Device *dev;
@ -396,19 +394,19 @@ class PairGPUAtom {
  #endif

  bool _compiled;
-
-  bool alloc(const int inum, const int nall);
  
-  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
-  int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
+  // True if data has been copied to device already
+  bool _x_avail, _q_avail, _quat_avail, _resized;
+
+  bool alloc(const int nall);
+  
+  bool _allocated, _rot, _charge, _other;
+  int _max_atoms, _nall;
  bool _gpu_nbor, _bonds;
-  int *_ilist;
  double _time_cast;
  
-  double _gpu_bytes;
+  double _max_gpu_bytes;
  
-  bool _newton;
-
  #ifndef USE_OPENCL
  CUDPPConfiguration sort_config;
  CUDPPHandle sort_plan;