git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp
2016-07-01 23:27:26 +00:00
parent 8366b35459
commit 9656958169
245 changed files with 4890 additions and 4832 deletions

@@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@@ -28,29 +28,29 @@
namespace LAMMPS_AL {
template <class numtyp, class acctyp,
class grdtyp, class grdtyp4> class PPPM;
template <class numtyp, class acctyp>
class Device {
public:
Device();
~Device();
/// Initialize the device for use by this process
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successful
* - -2 if GPU not found
* - -4 if GPU library not compiled for GPU
* - -6 if GPU could not be initialized for use
* - -7 if accelerator sharing is not currently allowed on system
* - -11 if vendor_string has the wrong number of parameters **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size,
char *vendor_string, const int block_pair);
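The negative return codes documented above each name a distinct failure mode. A minimal sketch of translating them into messages; the helper below is hypothetical and only the codes themselves come from the comment above:

// Hypothetical helper, not part of the library: map the documented
// init_device() return codes to human-readable messages.
#include <cstdio>

void report_init_device(const int flag) {
  if (flag == 0) return;                       // success
  const char *msg = "unknown error";
  switch (flag) {
    case -2:  msg = "GPU not found"; break;
    case -4:  msg = "GPU library not compiled for GPU"; break;
    case -6:  msg = "GPU could not be initialized for use"; break;
    case -7:  msg = "accelerator sharing is not currently allowed on system"; break;
    case -11: msg = "vendor_string has the wrong number of parameters"; break;
  }
  std::fprintf(stderr, "init_device failed (%d): %s\n", flag, msg);
}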
/// Initialize the device for Atom and Neighbor storage
@@ -62,9 +62,9 @@ class Device {
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* \param threads_per_atom value to be used by the neighbor list only
*
* Returns:
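A minimal sketch of how a caller might pick the gpu_nbor argument described above; the mapping from the mode constants (declared later in this class) to gpu_nbor values is an assumption for illustration:

// Illustrative only: derive init()'s gpu_nbor argument. The enum
// mirrors the GPU_FORCE/GPU_NEIGH/GPU_HYB_NEIGH constants declared
// later in this class; the mapping here is an assumption.
enum {GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};

int choose_gpu_nbor(const int gpu_mode, const bool host_needs_full) {
  if (gpu_mode == GPU_FORCE) return 0;   // neighboring done on the host
  return host_needs_full ? 2 : 1;        // device builds; host needs full/half list
}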
@@ -113,25 +113,25 @@ class Device {
/// Returns true if double precision is supported on card
inline bool double_precision() { return gpu->double_precision(); }
/// Output a message with timing information
void output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
Neighbor &nbor, const double avg_split,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen);
/// Output a message with timing information
void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
UCL_Timer & time_map, UCL_Timer & time_rho,
UCL_Timer &time_interp,
Answer<numtyp,acctyp> &ans,
const double max_bytes, const double cpu_time,
const double cpu_idle_time, FILE *screen);
/// Clear all memory on host and device associated with atom and nbor data
void clear();
/// Clear all memory on host and device
void clear_device();
@@ -149,24 +149,24 @@ class Device {
while (ans_queue.empty()==false) {
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
ans_queue.pop();
}
return evdw;
}
return 0.0;
}
/// Start timer on host
inline void start_host_timer()
{ _cpu_full=MPI_Wtime(); _host_timer_started=true; }
/// Stop timer on host
inline void stop_host_timer() {
if (_host_timer_started) {
_cpu_full=MPI_Wtime()-_cpu_full;
_host_timer_started=false;
}
}
/// Return host time
inline double host_time() { return _cpu_full; }
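A short usage sketch for the host timer above; the device reference and the timed work are illustrative, not part of the library:

// Sketch: bracket a host-side section with the timer above.
// 'dev' is assumed to be an initialized Device<float,float>.
#include <cstdio>

void timed_section(LAMMPS_AL::Device<float,float> &dev) {
  dev.start_host_timer();
  // ... host-side work to measure ...
  dev.stop_host_timer();
  std::printf("host time: %g s\n", dev.host_time());  // MPI_Wtime() based
}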
@@ -239,8 +239,8 @@ class Device {
/// Number of threads executing concurrently on same multiproc
inline int warp_size() const { return _warp_size; }
// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
void zero(UCL_D_Vec<int> &mem, const int numel) {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
@@ -248,25 +248,25 @@ class Device {
k_zero.run(&mem,&numel);
}
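The block count in zero() is ceil(numel/_block_pair). The same integer ceiling can be computed without the double-precision round trip, as in this sketch (the helper name is illustrative):

// Equivalent integer ceiling division for the launch-grid size;
// assumes numel >= 0 and block > 0.
inline int num_blocks_for(const int numel, const int block) {
  return (numel + block - 1) / block;
}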
// -------------------------- DEVICE DATA -------------------------
/// Geryon Device
UCL_Device *gpu;
enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> atom;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor Data
NeighborShared _neighbor_shared;
// ------------------------ LONG RANGE DATA -------------------------
// Long Range Data
int _long_range_precompute;
PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
@@ -282,7 +282,7 @@ class Device {
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
}
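Only the double-precision branch is visible in this hunk. A sketch of the full precision dispatch on _long_range_precompute; the flag values 1 and 2 are assumptions for illustration:

// Illustrative dispatch: a nonzero _long_range_precompute selects the
// single- or double-precision PPPM precompute (flag values assumed).
if (_long_range_precompute == 1)
  pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
                          boxlo,prd);
else if (_long_range_precompute == 2)
  pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
                          boxlo,prd);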
inline std::string compile_string() { return _ocl_compile_string; }
private:
@@ -290,7 +290,7 @@ class Device {
int _init_count;
bool _device_init, _host_timer_started, _time_device;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size;
int _gpu_mode, _first_device, _last_device, _nthreads;
double _particle_split;
@@ -310,10 +310,10 @@ class Device {
int compile_kernels();
int _data_in_estimate, _data_out_estimate;
std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string;
int set_ocl_params(char *);
template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;