git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11660 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2014-03-20 14:50:49 +00:00
parent e1cc86a12f
commit 67afd6fb68
6 changed files with 170 additions and 43 deletions
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@ -29,6 +29,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
       $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
       $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
       $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
       $(OBJ_DIR)/lal_base_dpd.o \
       $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
       $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
       $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@ -65,7 +66,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
       $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
       $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
-       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
+       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
       $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o 
 CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
       $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@ -111,7 +113,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
       $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
       $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
       $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
-       $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h
+       $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h \
       $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h
 all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@ -190,6 +193,9 @@ $(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
 $(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp
 	$(CUDR) -o $@ -c lal_base_three.cpp
 $(OBJ_DIR)/lal_base_dpd.o: $(ALL_H) lal_base_dpd.h lal_base_dpd.cpp
 	$(CUDR) -o $@ -c lal_base_dpd.cpp
 $(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
 	$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
@ -658,6 +664,18 @@ $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_
 $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/dpd.cubin: lal_dpd.cu lal_precision.h lal_preprocessor.h
 	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dpd.cu
 $(OBJ_DIR)/dpd_cubin.h: $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd.cubin
 	$(BIN2C) -c -n dpd $(OBJ_DIR)/dpd.cubin > $(OBJ_DIR)/dpd_cubin.h
 $(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cubin.h $(OBJ_DIR)/lal_base_dpd.o
 	$(CUDR) -o $@ -c lal_dpd.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
 	$(CUDR) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda 
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@ -18,6 +18,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
       $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
       $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
       $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
       $(OBJ_DIR)/lal_base_dpd.o \
       $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
       $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
       $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@ -54,7 +55,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
       $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
       $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
-       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
+       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
       $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o 
 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
       $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@ -77,7 +79,7 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
       $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
       $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
       $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
-       $(OBJ_DIR)/lj_gromacs_cl.h
+       $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -129,6 +131,9 @@ $(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
 $(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp
 	$(OCL) -o $@ -c lal_base_three.cpp
 $(OBJ_DIR)/lal_base_dpd.o: $(OCL_H) lal_base_dpd.h lal_base_dpd.cpp
 	$(OCL) -o $@ -c lal_base_dpd.cpp
 $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
@ -471,6 +476,15 @@ $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp  $(OBJ
 $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/dpd_cl.h: lal_dpd.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh dpd $(PRE1_H) lal_dpd.cu $(OBJ_DIR)/dpd_cl.h;
 $(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp  $(OBJ_DIR)/dpd_cl.h $(OBJ_DIR)/dpd_cl.h $(OBJ_DIR)/lal_base_dpd.o
 	$(OCL) -o $@ -c lal_dpd.cpp -I$(OBJ_DIR)
 $(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
 	$(OCL) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@ -41,6 +41,8 @@ int AtomT::bytes_per_atom() const {
    bytes+=4*sizeof(numtyp);
  if (_charge)
    bytes+=sizeof(numtyp);
  if (_vel)
    bytes+=4*sizeof(numtyp);
  return bytes;
 }
@ -52,7 +54,7 @@ bool AtomT::alloc(const int nall) {
  // Ignore host/device transfers?
  _host_view=false;
-  if (dev->shared_memory()) {
+  if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) {
    _host_view=true;
    #ifdef GPU_CAST
    assert(0==1);
@ -90,6 +92,11 @@ bool AtomT::alloc(const int nall) {
                                   UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=quat.device.row_bytes();
  }
  if (_vel && _host_view==false) {
    success=success && (v.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                   UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=v.device.row_bytes();
  }
  if (_gpu_nbor>0) {
    if (_bonds) {
@ -124,7 +131,7 @@ bool AtomT::alloc(const int nall) {
 template <class numtyp, class acctyp>
 bool AtomT::add_fields(const bool charge, const bool rot,
-                       const int gpu_nbor, const bool bonds) {
+                       const int gpu_nbor, const bool bonds, const bool vel) {
  bool success=true;
  // Ignore host/device transfers?
  int gpu_bytes=0;
@ -149,6 +156,16 @@ bool AtomT::add_fields(const bool charge, const bool rot,
    }
  }
  if (vel && _vel==false) {
    _vel=true;
    _other=true;
    if (_host_view==false) {
      success=success && (v.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                     UCL_READ_ONLY)==UCL_SUCCESS);
      gpu_bytes+=v.device.row_bytes();
    }
  }
  if (bonds && _bonds==false) {
    _bonds=true;
    if (_bonds && _gpu_nbor>0) {
@ -191,19 +208,21 @@ bool AtomT::add_fields(const bool charge, const bool rot,
 template <class numtyp, class acctyp>
 bool AtomT::init(const int nall, const bool charge, const bool rot,
-                 UCL_Device &devi, const int gpu_nbor, const bool bonds) {
+                 UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
  clear();
  bool success=true;
  _x_avail=false;
  _q_avail=false;
  _quat_avail=false;
  _v_avail=false;
  _resized=false;
  _gpu_nbor=gpu_nbor;
  _bonds=bonds;
  _charge=charge;
  _rot=rot;
-  _other=_charge || _rot;
+  _vel=vel;
  _other=_charge || _rot || _vel;
  dev=&devi;
  _time_transfer=0;
@ -216,9 +235,11 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
  time_pos.init(*dev);
  time_q.init(*dev);
  time_quat.init(*dev);
  time_vel.init(*dev);
  time_pos.zero();
  time_q.zero();
  time_quat.zero();
  time_vel.zero();
  _time_cast=0.0;
  #ifdef GPU_CAST
@ -239,6 +260,8 @@ void AtomT::clear_resize() {
    q.clear();
  if (_rot)
    quat.clear();
  if (_vel)
    v.clear();
  dev_cell_id.clear();
  dev_particle_id.clear();
@ -267,6 +290,7 @@ void AtomT::clear() {
  time_pos.clear();
  time_q.clear();
  time_quat.clear();
  time_vel.clear();
  clear_resize();
  #ifdef GPU_CAST
@ -285,6 +309,8 @@ double AtomT::host_memory_usage() const {
    atom_bytes+=1;
  if (_rot) 
    atom_bytes+=4;
  if (_vel) 
    atom_bytes+=4;
  return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
 }
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@ -67,7 +67,8 @@ class Atom {
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool init(const int nall, const bool charge, const bool rot, 
-            UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false);
+            UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, 
            const bool vel=false);
  /// Check if we have enough device storage and realloc if not
  /** Returns true if resized with any call during this timestep **/
@ -87,7 +88,7 @@ class Atom {
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
-                  const bool bonds);
+                  const bool bonds, const bool vel=false);
  /// Returns true if GPU is using charges
  bool charge() { return _charge; }
@ -95,6 +96,9 @@ class Atom {
  /// Returns true if GPU is using quaternions
  bool quaternion() { return _rot; }
  /// Returns true if GPU is using velocities
  bool velocity() { return _vel; }
  /// Only free matrices of length inum or nall for resizing
  void clear_resize();
@ -114,6 +118,8 @@ class Atom {
      time_q.add_to_total();
    if (_rot)
      time_quat.add_to_total();
    if (_vel)
      time_vel.add_to_total();
  }
  /// Add copy times to timers
@ -123,6 +129,8 @@ class Atom {
      time_q.zero();
    if (_rot)
      time_quat.zero();
    if (_vel)
      time_vel.zero();
  }
  /// Return the total time for host/device data transfer
@ -135,9 +143,13 @@ class Atom {
      time_q.zero_total();
    }
    if (_rot) {
-      total+=time_q.total_seconds();
+      total+=time_quat.total_seconds();
      time_quat.zero_total();
    }
    if (_vel) {
      total+=time_vel.total_seconds();
      time_vel.zero_total();
    }
    return total+_time_transfer/1000.0;
  }
@ -242,7 +254,7 @@ class Atom {
  /// Signal that we need to transfer atom data for next timestep
  inline void data_unavail()
-    { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
+    { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
  /// Cast positions and types to write buffer
  inline void cast_x_data(double **host_ptr, const int *host_type) {
@ -341,6 +353,53 @@ class Atom {
    }
  }
  /// Cast velocities and tags to write buffer
  inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
    if (_v_avail==false) {
      double t=MPI_Wtime();
      #ifdef GPU_CAST
      memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
      memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
      #else
      int wl=0;
      for (int i=0; i<_nall; i++) {
        v[wl]=host_ptr[i][0];
        v[wl+1]=host_ptr[i][1];
        v[wl+2]=host_ptr[i][2];
        v[wl+3]=host_tag[i];
        wl+=4;
      }
      #endif
      _time_cast+=MPI_Wtime()-t;
    }
  }
  /// Copy velocities and tags to device asynchronously
  /** Copies nall() elements **/
  inline void add_v_data(double **host_ptr, tagint *host_tag) { 
    time_vel.start();
    if (_v_avail==false) {
      #ifdef GPU_CAST
      v_cast.update_device(_nall*3,true);
      tag_cast.update_device(_nall,true);
      int block_size=64;
      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
      k_cast_x.set_size(GX,block_size);
      k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
      #else
      v.update_device(_nall*4,true);
      #endif
      _v_avail=true;
    }
    time_vel.stop();
  }
  /// Calls cast_v_data and add_v_data and times the routines
  inline void cast_copy_v(double **host_ptr, tagint *host_tag) {
    cast_v_data(host_ptr,host_tag);
    add_v_data(host_ptr,host_tag);
  }
  /// Add in casting time from additional data (seconds)
  inline void add_cast_time(double t) { _time_cast+=t; }
@ -362,7 +421,9 @@ class Atom {
  UCL_Vector<numtyp,numtyp> q;
  /// Quaterions
  UCL_Vector<numtyp,numtyp> quat;
-  
+  /// Velocities
  UCL_Vector<numtyp,numtyp> v;  
  #ifdef GPU_CAST
  UCL_Vector<double,double> x_cast;
  UCL_Vector<int,int> type_cast;
@ -372,6 +433,7 @@ class Atom {
  UCL_D_Vec<unsigned> dev_cell_id;
  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<int> dev_particle_id;
  /// Atom tag information for device nbor builds
  UCL_D_Vec<tagint> dev_tag;
@ -381,7 +443,7 @@ class Atom {
  UCL_H_Vec<int> host_particle_id;
  /// Device timers
-  UCL_Timer time_pos, time_q, time_quat;
+  UCL_Timer time_pos, time_q, time_quat, time_vel;
  /// Geryon device
  UCL_Device *dev;
@ -396,11 +458,11 @@ class Atom {
  bool _compiled;
  // True if data has been copied to device already
-  bool _x_avail, _q_avail, _quat_avail, _resized;
+  bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
  bool alloc(const int nall);
-  bool _allocated, _rot, _charge, _bonds, _other;
+  bool _allocated, _rot, _charge, _bonds, _vel, _other;
  int _max_atoms, _nall, _gpu_nbor;
  bool _host_view;
  double _time_cast, _time_transfer;
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -213,12 +213,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
 template <class numtyp, class acctyp>
 int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
-                         const bool rot, const int nlocal, 
+                  const bool rot, const int nlocal, 
-                         const int host_nlocal, const int nall,
+                  const int host_nlocal, const int nall,
-                         Neighbor *nbor, const int maxspecial,
+                  Neighbor *nbor, const int maxspecial,
-                         const int gpu_host, const int max_nbors, 
+                  const int gpu_host, const int max_nbors, 
-                         const double cell_size, const bool pre_cut,
+                  const double cell_size, const bool pre_cut,
-                         const int threads_per_atom) {
+                  const int threads_per_atom, const bool vel) {
  if (!_device_init)
    return -1;
  if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@ -245,7 +245,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
  if (_init_count==0) {
    // Initialize atom and nbor data
-    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0))
+    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
      return -3;
    _data_in_estimate++;
@ -253,12 +253,16 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
      _data_in_estimate++;
    if (rot)
      _data_in_estimate++;
    if (vel)
      _data_in_estimate++;
  } else {
    if (atom.charge()==false && charge)
      _data_in_estimate++;
    if (atom.quaternion()==false && rot)
      _data_in_estimate++;
-    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial))
+    if (atom.velocity()==false && vel)
      _data_in_estimate++;
    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
      return -3;
  }
@ -318,7 +322,7 @@ void DeviceT::set_double_precompute
 template <class numtyp, class acctyp>
 void DeviceT::init_message(FILE *screen, const char *name,
-                                  const int first_gpu, const int last_gpu) {
+                           const int first_gpu, const int last_gpu) {
  #if defined(USE_OPENCL)
  std::string fs="";
  #elif defined(USE_CUDART)
@ -330,7 +334,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
  if (_replica_me == 0 && screen) {
    fprintf(screen,"\n-------------------------------------");
    fprintf(screen,"-------------------------------------\n");
-    fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
+    fprintf(screen,"- Using acceleration for %s:\n",name);
    fprintf(screen,"-  with %d proc(s) per device.\n",_procs_per_gpu);
    #ifdef _OPENMP
    fprintf(screen,"-  with %d thread(s) per proc.\n",_nthreads);
@ -361,7 +365,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
      } else
        sname+="Double Precision)";
-      fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
+      fprintf(screen,"Device %d: %s\n",i,sname.c_str());
    }
    fprintf(screen,"-------------------------------------");
@ -371,8 +375,8 @@ void DeviceT::init_message(FILE *screen, const char *name,
 template <class numtyp, class acctyp>
 void DeviceT::estimate_gpu_overhead(const int kernel_calls, 
-                                           double &gpu_overhead,
+                                    double &gpu_overhead,
-                                           double &gpu_driver_overhead) {
+                                    double &gpu_driver_overhead) {
  UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
  UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
  UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
@ -506,16 +510,17 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
  double mpi_max_bytes;
  MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
  double max_mb=mpi_max_bytes/(1024.0*1024.0);
  double t_time=times[0]+times[1]+times[2]+times[3]+times[4];
  if (replica_me()==0)
    if (screen && times[5]>0.0) {
      fprintf(screen,"\n\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
-      fprintf(screen,"      GPU Time Info (average): ");
+      fprintf(screen,"    Device Time Info (average): ");
      fprintf(screen,"\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
-      if (time_device()) {
+      if (time_device() && t_time>0) {
        fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
        fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[4]/_replica_size);
        fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_replica_size);
@ -527,7 +532,8 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
      }
      if (nbor.gpu_nbor()==2)
        fprintf(screen,"Neighbor (CPU):  %.4f s.\n",times[8]/_replica_size);
-      fprintf(screen,"GPU Overhead:    %.4f s.\n",times[5]/_replica_size);
+      if (times[5]>0)
        fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
      fprintf(screen,"Average split:   %.4f.\n",avg_split);
      fprintf(screen,"Threads / atom:  %d.\n",threads_per_atom);
      fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
@ -541,14 +547,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
 template <class numtyp, class acctyp>
 void DeviceT::output_kspace_times(UCL_Timer &time_in, 
-                                         UCL_Timer &time_out,
+                                  UCL_Timer &time_out,
-                                         UCL_Timer &time_map,
+                                  UCL_Timer &time_map,
-                                         UCL_Timer &time_rho,
+                                  UCL_Timer &time_rho,
-                                         UCL_Timer &time_interp,
+                                  UCL_Timer &time_interp,
-                                         Answer<numtyp,acctyp> &ans, 
+                                  Answer<numtyp,acctyp> &ans, 
-                                         const double max_bytes, 
+                                  const double max_bytes, 
-                                         const double cpu_time, 
+                                  const double cpu_time, 
-                                         const double idle_time, FILE *screen) {
+                                  const double idle_time, FILE *screen) {
  double single[8], times[8];
  single[0]=time_out.total_seconds();
@ -566,16 +572,17 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
  double mpi_max_bytes;
  MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
  double max_mb=mpi_max_bytes/(1024.0*1024.0);
  double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
  if (replica_me()==0)
    if (screen && times[6]>0.0) {
      fprintf(screen,"\n\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
-      fprintf(screen,"      GPU Time Info (average): ");
+      fprintf(screen,"      Device Time Info (average): ");
      fprintf(screen,"\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
-      if (time_device()) {
+      if (time_device() && t_time>0) {
        fprintf(screen,"Data Out:        %.4f s.\n",times[0]/_replica_size);
        fprintf(screen,"Data In:         %.4f s.\n",times[1]/_replica_size);
        fprintf(screen,"Kernel (map):    %.4f s.\n",times[2]/_replica_size);
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@ -77,7 +77,7 @@ class Device {
           const int nlocal, const int host_nlocal, const int nall,
           Neighbor *nbor, const int maxspecial, const int gpu_host,
           const int max_nbors, const double cell_size, const bool pre_cut,
-           const int threads_per_atom);
+           const int threads_per_atom, const bool vel=false);
  /// Initialize the device for Atom storage only
  /** \param nlocal Total number of local particles to allocate memory for