git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-05-02 15:02:52 +00:00
parent 2be078632d
commit 5f799182b3
70 changed files with 4489 additions and 2253 deletions
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@ -19,11 +19,17 @@
 #define PAIR_GPU_DEVICE_H

 #include "pair_gpu_atom.h"
+#include "pair_gpu_ans.h"
 #include "pair_gpu_nbor.h"
+#include "pppm_gpu_memory.h"
 #include "mpi.h"
 #include <sstream>
 #include "stdio.h"
 #include <string>
+#include <queue>
+
+template <class numtyp, class acctyp, 
+          class grdtyp, class grdtyp4> class PPPMGPUMemory;

 template <class numtyp, class acctyp>
 class PairGPUDevice {
@ -33,10 +39,15 @@ class PairGPUDevice {
 
  /// Initialize the device for use by this process
  /** Sets up a per-device MPI communicator for load balancing and initializes
-    * the device (>=first_gpu and <=last_gpu) that this proc will be using **/
-  bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, 
+    * the device (>=first_gpu and <=last_gpu) that this proc will be using 
+    * Returns:
+    * -  0 if successfull
+    * - -2 if GPU not found
+    * - -4 if GPU library not compiled for GPU **/
+  int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, 
                   const int last_gpu, const int gpu_mode, 
-                   const double particle_split, const int nthreads);
+                   const double particle_split, const int nthreads,
+                   const int t_per_atom);

  /// Initialize the device for Atom and Neighbor storage
  /** \param rot True if quaternions need to be stored
@ -50,19 +61,67 @@ class PairGPUDevice {
    * \param max_nbors Initial number of rows in the neighbor matrix
    * \param cell_size cutoff+skin 
    * \param pre_cut True if cutoff test will be performed in separate kernel
-    *                than the force kernel **/
-  bool init(const bool charge, const bool rot, const int nlocal,
-            const int host_nlocal, const int nall, const int maxspecial, 
-            const bool gpu_nbor, const int gpu_host, const int max_nbors,
-            const double cell_size, const bool pre_cut);
+    *                than the force kernel 
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
+           const int nlocal, const int host_nlocal, const int nall,
+           PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
+           const int max_nbors, const double cell_size, const bool pre_cut);
+
+  /// Initialize the device for Atom storage only
+  /** \param nlocal Total number of local particles to allocate memory for
+    * \param nall Total number of local+ghost particles
+    *
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);

  /// Output a message for pair_style acceleration with device stats
  void init_message(FILE *screen, const char *name,
                    const int first_gpu, const int last_gpu);

+  /// Perform charge assignment asynchronously for PPPM
+	void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
+	                                         float,_lgpu_float4> *pppm);
+
+  /// Perform charge assignment asynchronously for PPPM
+	void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
+	                                         double,_lgpu_double4> *pppm);
+
+  /// Esimate the overhead from GPU calls from multiple procs
+  /** \param kernel_calls Number of kernel calls/timestep for timing estimated
+    *                     overhead
+    * \param gpu_overhead Estimated gpu overhead per timestep (sec)
+    * \param driver_overhead Estimated overhead from driver per timestep (s) **/
+  void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
+                             double &gpu_driver_overhead);
+
+  /// Returns true if double precision is supported on card
+  inline bool double_precision() { return gpu->double_precision(); }
+  
  /// Output a message with timing information
-  void output_times(UCL_Timer &time_pair, const double avg_split, 
-                    const double max_bytes, FILE *screen);
+  void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans, 
+                    PairGPUNbor &nbor, const double avg_split, 
+                    const double max_bytes, const double gpu_overhead,
+                    const double driver_overhead, 
+                    const int threads_per_atom, FILE *screen);
+
+  /// Output a message with timing information
+  void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
+                           UCL_Timer & time_map, UCL_Timer & time_rho,
+                           UCL_Timer &time_interp, 
+                           PairGPUAns<numtyp,acctyp> &ans, 
+                           const double max_bytes, const double cpu_time,
+                           const double cpu_idle_time, FILE *screen);

  /// Clear all memory on host and device associated with atom and nbor data
  void clear();
@ -70,11 +129,37 @@ class PairGPUDevice {
  /// Clear all memory on host and device
  void clear_device();

+  /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
+  inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
+    { ans_queue.push(ans); }
+
+  /// Add "answers" (force,energies,etc.) into LAMMPS structures
+  inline double fix_gpu(double **f, double **tor, double *eatom,
+                        double **vatom, double *virial, double &ecoul) {
+    atom.data_unavail();
+    if (ans_queue.empty()==false) {
+      stop_host_timer();
+      double evdw=0.0;
+      while (ans_queue.empty()==false) {
+        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
+        ans_queue.pop();
+      }                                                 
+      return evdw;
+    }
+    return 0.0;
+  }
+
  /// Start timer on host
-  inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
+  inline void start_host_timer() 
+    { _cpu_full=MPI_Wtime(); _host_timer_started=true; }
  
  /// Stop timer on host
-  inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }
+  inline void stop_host_timer() { 
+    if (_host_timer_started) {
+      _cpu_full=MPI_Wtime()-_cpu_full; 
+      _host_timer_started=false;
+    }
+  }
  
  /// Return host time
  inline double host_time() { return _cpu_full; }
@ -114,6 +199,42 @@ class PairGPUDevice {
  inline double particle_split() const { return _particle_split; }
  /// Return the initialization count for the device
  inline int init_count() const { return _init_count; }
+  /// True if device is being timed
+  inline bool time_device() const { return _time_device; }
+
+  /// Return the number of threads accessing memory simulatenously
+  inline int num_mem_threads() const { return _num_mem_threads; }
+  /// Return the number of threads per atom for pair styles
+  inline int threads_per_atom() const { return _threads_per_atom; }
+  /// Return the number of threads per atom for pair styles using charge
+  inline int threads_per_charge() const { return _threads_per_charge; }
+  /// Return the min of the pair block size or the device max block size
+  inline int pair_block_size() const { return _block_pair; }
+  /// Return the maximum number of atom types that can be used with shared mem
+  inline int max_shared_types() const { return _max_shared_types; }
+  /// Return the maximum order for PPPM splines
+  inline int pppm_max_spline() const { return _pppm_max_spline; }
+  /// Return the block size for PPPM kernels
+  inline int pppm_block() const { return _pppm_block; }
+  /// Return the block size for neighbor binning
+  inline int block_cell_2d() const { return _block_cell_2d; }
+  /// Return the block size for atom mapping for neighbor builds
+  inline int block_cell_id() const { return _block_cell_id; }
+  /// Return the block size for neighbor build kernel
+  inline int block_nbor_build() const { return _block_nbor_build; }
+  /// Return the block size for "bio" pair styles
+  inline int block_bio_pair() const { return _block_bio_pair; }
+  /// Return the maximum number of atom types for shared mem with "bio" styles
+  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
+
+  // -------------------- SHARED DEVICE ROUTINES -------------------- 
+  // Perform asynchronous zero of integer array 
+  void zero(UCL_D_Vec<int> &mem, const int numel) {
+    int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
+                                    _block_pair));
+    k_zero.set_size(num_blocks,_block_pair);
+    k_zero.run(&mem.begin(),&numel);
+  }

  // -------------------------- DEVICE DATA ------------------------- 

@ -130,11 +251,30 @@ class PairGPUDevice {
  // --------------------------- NBOR DATA ----------------------------
  
  /// Neighbor Data
-  PairGPUNbor nbor;
+  PairGPUNborShared _nbor_shared;
+
+  // ------------------------ LONG RANGE DATA -------------------------
+  
+  // Long Range Data
+  int _long_range_precompute;
+  PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
+  PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
+  /// Precomputations for long range charge assignment (asynchronously)
+  inline void precompute(const int ago, const int nlocal, const int nall,
+                         double **host_x, int *host_type, bool &success,
+                         double *charge, double *boxlo, double *prd) {
+    if (_long_range_precompute==1)
+      pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
+                              boxlo,prd);
+    else if (_long_range_precompute==2)
+      pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
+                              boxlo,prd);
+  }

 private:
+  std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
  int _init_count;
-  bool _device_init;
+  bool _device_init, _host_timer_started, _time_device;
  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, 
      _replica_size;
@ -142,6 +282,19 @@ class PairGPUDevice {
  double _particle_split;
  double _cpu_full;

+  int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
+  int _pppm_max_spline, _pppm_block;
+  int _block_pair, _max_shared_types;
+  int _block_cell_2d, _block_cell_id, _block_nbor_build;
+  int _block_bio_pair, _max_bio_shared_types;
+
+  UCL_Program *dev_program;
+  UCL_Kernel k_zero, k_info;
+  bool _compiled;
+  int compile_kernels();
+
+  int _data_in_estimate, _data_out_estimate;
+  
  template <class t>
  inline std::string toa(const t& in) {
    std::ostringstream o;