diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index e93805df3b..da07cec1b7 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -29,6 +29,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
        $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
        $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
        $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
+       $(OBJ_DIR)/lal_base_dpd.o \
        $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
        $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
        $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -65,7 +66,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
        $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
        $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
        $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
-       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
+       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
+       $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o 
 
 CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
        $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@@ -111,7 +113,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
        $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
        $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
        $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
-       $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h
+       $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h \
+       $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h
 
 all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
 
@@ -190,6 +193,9 @@ $(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
 $(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp
 	$(CUDR) -o $@ -c lal_base_three.cpp
 
+$(OBJ_DIR)/lal_base_dpd.o: $(ALL_H) lal_base_dpd.h lal_base_dpd.cpp
+	$(CUDR) -o $@ -c lal_base_dpd.cpp
+
 $(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
 	$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
 
@@ -658,6 +664,18 @@ $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_
 $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/dpd.cubin: lal_dpd.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dpd.cu
+
+$(OBJ_DIR)/dpd_cubin.h: $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd.cubin
+	$(BIN2C) -c -n dpd $(OBJ_DIR)/dpd.cubin > $(OBJ_DIR)/dpd_cubin.h
+
+$(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cubin.h $(OBJ_DIR)/lal_base_dpd.o
+	$(CUDR) -o $@ -c lal_dpd.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
+	$(CUDR) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda 
 
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index bb528f1e2b..1d85c65354 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -18,6 +18,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
        $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
        $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
        $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
+       $(OBJ_DIR)/lal_base_dpd.o \
        $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
        $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
        $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -54,7 +55,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
        $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
        $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
        $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
-       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
+       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
+       $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o 
 
 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@@ -77,7 +79,7 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
        $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
        $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
-       $(OBJ_DIR)/lj_gromacs_cl.h
+       $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h
 
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@@ -129,6 +131,9 @@ $(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
 $(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp
 	$(OCL) -o $@ -c lal_base_three.cpp
 
+$(OBJ_DIR)/lal_base_dpd.o: $(OCL_H) lal_base_dpd.h lal_base_dpd.cpp
+	$(OCL) -o $@ -c lal_base_dpd.cpp
+
 $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
 
@@ -471,6 +476,15 @@ $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp  $(OBJ
 $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/dpd_cl.h: lal_dpd.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh dpd $(PRE1_H) lal_dpd.cu $(OBJ_DIR)/dpd_cl.h;
+
+$(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp  $(OBJ_DIR)/dpd_cl.h $(OBJ_DIR)/dpd_cl.h $(OBJ_DIR)/lal_base_dpd.o
+	$(OCL) -o $@ -c lal_dpd.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
+	$(OCL) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 
 
diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp
index 92cda1239e..a250584dfa 100644
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@@ -41,6 +41,8 @@ int AtomT::bytes_per_atom() const {
     bytes+=4*sizeof(numtyp);
   if (_charge)
     bytes+=sizeof(numtyp);
+  if (_vel)
+    bytes+=4*sizeof(numtyp);
   return bytes;
 }
 
@@ -52,7 +54,7 @@ bool AtomT::alloc(const int nall) {
   
   // Ignore host/device transfers?
   _host_view=false;
-  if (dev->shared_memory()) {
+  if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) {
     _host_view=true;
     #ifdef GPU_CAST
     assert(0==1);
@@ -90,6 +92,11 @@ bool AtomT::alloc(const int nall) {
                                    UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=quat.device.row_bytes();
   }
+  if (_vel && _host_view==false) {
+    success=success && (v.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
+                                   UCL_READ_ONLY)==UCL_SUCCESS);
+    gpu_bytes+=v.device.row_bytes();
+  }
 
   if (_gpu_nbor>0) {
     if (_bonds) {
@@ -124,7 +131,7 @@ bool AtomT::alloc(const int nall) {
 
 template <class numtyp, class acctyp>
 bool AtomT::add_fields(const bool charge, const bool rot,
-                       const int gpu_nbor, const bool bonds) {
+                       const int gpu_nbor, const bool bonds, const bool vel) {
   bool success=true;
   // Ignore host/device transfers?
   int gpu_bytes=0;
@@ -149,6 +156,16 @@ bool AtomT::add_fields(const bool charge, const bool rot,
     }
   }
 
+  if (vel && _vel==false) {
+    _vel=true;
+    _other=true;
+    if (_host_view==false) {
+      success=success && (v.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
+                                     UCL_READ_ONLY)==UCL_SUCCESS);
+      gpu_bytes+=v.device.row_bytes();
+    }
+  }
+
   if (bonds && _bonds==false) {
     _bonds=true;
     if (_bonds && _gpu_nbor>0) {
@@ -191,19 +208,21 @@ bool AtomT::add_fields(const bool charge, const bool rot,
 
 template <class numtyp, class acctyp>
 bool AtomT::init(const int nall, const bool charge, const bool rot,
-                 UCL_Device &devi, const int gpu_nbor, const bool bonds) {
+                 UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
   clear();
 
   bool success=true;
   _x_avail=false;
   _q_avail=false;
   _quat_avail=false;
+  _v_avail=false;
   _resized=false;
   _gpu_nbor=gpu_nbor;
   _bonds=bonds;
   _charge=charge;
   _rot=rot;
-  _other=_charge || _rot;
+  _vel=vel;
+  _other=_charge || _rot || _vel;
   dev=&devi;
   _time_transfer=0;
 
@@ -216,9 +235,11 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
   time_pos.init(*dev);
   time_q.init(*dev);
   time_quat.init(*dev);
+  time_vel.init(*dev);
   time_pos.zero();
   time_q.zero();
   time_quat.zero();
+  time_vel.zero();
   _time_cast=0.0;
   
   #ifdef GPU_CAST
@@ -239,6 +260,8 @@ void AtomT::clear_resize() {
     q.clear();
   if (_rot)
     quat.clear();
+  if (_vel)
+    v.clear();
 
   dev_cell_id.clear();
   dev_particle_id.clear();
@@ -267,6 +290,7 @@ void AtomT::clear() {
   time_pos.clear();
   time_q.clear();
   time_quat.clear();
+  time_vel.clear();
   clear_resize();
 
   #ifdef GPU_CAST
@@ -285,6 +309,8 @@ double AtomT::host_memory_usage() const {
     atom_bytes+=1;
   if (_rot) 
     atom_bytes+=4;
+  if (_vel) 
+    atom_bytes+=4;
   return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
 }
   
diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h
index 4731b1b08d..23112fe712 100644
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@@ -67,7 +67,8 @@ class Atom {
     *        gpu_nbor 1 if neighboring will be performed on device
     *        gpu_nbor 2 if binning on host and neighboring on device **/
   bool init(const int nall, const bool charge, const bool rot, 
-            UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false);
+            UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, 
+            const bool vel=false);
   
   /// Check if we have enough device storage and realloc if not
   /** Returns true if resized with any call during this timestep **/
@@ -87,7 +88,7 @@ class Atom {
     *        gpu_nbor 1 if neighboring will be performed on device
     *        gpu_nbor 2 if binning on host and neighboring on device **/
   bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
-                  const bool bonds);
+                  const bool bonds, const bool vel=false);
   
   /// Returns true if GPU is using charges
   bool charge() { return _charge; }
@@ -95,6 +96,9 @@ class Atom {
   /// Returns true if GPU is using quaternions
   bool quaternion() { return _rot; }
   
+  /// Returns true if GPU is using velocities
+  bool velocity() { return _vel; }
+
   /// Only free matrices of length inum or nall for resizing
   void clear_resize();
   
@@ -114,6 +118,8 @@ class Atom {
       time_q.add_to_total();
     if (_rot)
       time_quat.add_to_total();
+    if (_vel)
+      time_vel.add_to_total();
   }
 
   /// Add copy times to timers
@@ -123,6 +129,8 @@ class Atom {
       time_q.zero();
     if (_rot)
       time_quat.zero();
+    if (_vel)
+      time_vel.zero();
   }
 
   /// Return the total time for host/device data transfer
@@ -135,9 +143,13 @@ class Atom {
       time_q.zero_total();
     }
     if (_rot) {
-      total+=time_q.total_seconds();
+      total+=time_quat.total_seconds();
       time_quat.zero_total();
     }
+    if (_vel) {
+      total+=time_vel.total_seconds();
+      time_vel.zero_total();
+    }
     
     return total+_time_transfer/1000.0;
   }
@@ -242,7 +254,7 @@ class Atom {
 
   /// Signal that we need to transfer atom data for next timestep
   inline void data_unavail()
-    { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
+    { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
 
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
@@ -341,6 +353,53 @@ class Atom {
     }
   }
 
+  /// Cast velocities and tags to write buffer
+  inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
+    if (_v_avail==false) {
+      double t=MPI_Wtime();
+      #ifdef GPU_CAST
+      memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
+      memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
+      #else
+      int wl=0;
+      for (int i=0; i<_nall; i++) {
+        v[wl]=host_ptr[i][0];
+        v[wl+1]=host_ptr[i][1];
+        v[wl+2]=host_ptr[i][2];
+        v[wl+3]=host_tag[i];
+        wl+=4;
+      }
+      #endif
+      _time_cast+=MPI_Wtime()-t;
+    }
+  }
+
+  /// Copy velocities and tags to device asynchronously
+  /** Copies nall() elements **/
+  inline void add_v_data(double **host_ptr, tagint *host_tag) { 
+    time_vel.start();
+    if (_v_avail==false) {
+      #ifdef GPU_CAST
+      v_cast.update_device(_nall*3,true);
+      tag_cast.update_device(_nall,true);
+      int block_size=64;
+      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
+      k_cast_x.set_size(GX,block_size);
+      k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
+      #else
+      v.update_device(_nall*4,true);
+      #endif
+      _v_avail=true;
+    }
+    time_vel.stop();
+  }
+
+  /// Calls cast_v_data and add_v_data and times the routines
+  inline void cast_copy_v(double **host_ptr, tagint *host_tag) {
+    cast_v_data(host_ptr,host_tag);
+    add_v_data(host_ptr,host_tag);
+  }
+
   /// Add in casting time from additional data (seconds)
   inline void add_cast_time(double t) { _time_cast+=t; }
 
@@ -362,7 +421,9 @@ class Atom {
   UCL_Vector<numtyp,numtyp> q;
   /// Quaterions
   UCL_Vector<numtyp,numtyp> quat;
-  
+  /// Velocities
+  UCL_Vector<numtyp,numtyp> v;  
+
   #ifdef GPU_CAST
   UCL_Vector<double,double> x_cast;
   UCL_Vector<int,int> type_cast;
@@ -372,6 +433,7 @@ class Atom {
   UCL_D_Vec<unsigned> dev_cell_id;
   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<int> dev_particle_id;
+
   /// Atom tag information for device nbor builds
   UCL_D_Vec<tagint> dev_tag;
   
@@ -381,7 +443,7 @@ class Atom {
   UCL_H_Vec<int> host_particle_id;
 
   /// Device timers
-  UCL_Timer time_pos, time_q, time_quat;
+  UCL_Timer time_pos, time_q, time_quat, time_vel;
   
   /// Geryon device
   UCL_Device *dev;
@@ -396,11 +458,11 @@ class Atom {
   bool _compiled;
   
   // True if data has been copied to device already
-  bool _x_avail, _q_avail, _quat_avail, _resized;
+  bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
 
   bool alloc(const int nall);
   
-  bool _allocated, _rot, _charge, _bonds, _other;
+  bool _allocated, _rot, _charge, _bonds, _vel, _other;
   int _max_atoms, _nall, _gpu_nbor;
   bool _host_view;
   double _time_cast, _time_transfer;
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 554a535909..0859062345 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -213,12 +213,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
 
 template <class numtyp, class acctyp>
 int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
-                         const bool rot, const int nlocal, 
-                         const int host_nlocal, const int nall,
-                         Neighbor *nbor, const int maxspecial,
-                         const int gpu_host, const int max_nbors, 
-                         const double cell_size, const bool pre_cut,
-                         const int threads_per_atom) {
+                  const bool rot, const int nlocal, 
+                  const int host_nlocal, const int nall,
+                  Neighbor *nbor, const int maxspecial,
+                  const int gpu_host, const int max_nbors, 
+                  const double cell_size, const bool pre_cut,
+                  const int threads_per_atom, const bool vel) {
   if (!_device_init)
     return -1;
   if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@@ -245,7 +245,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
 
   if (_init_count==0) {
     // Initialize atom and nbor data
-    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0))
+    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
       return -3;
       
     _data_in_estimate++;
@@ -253,12 +253,16 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
       _data_in_estimate++;
     if (rot)
       _data_in_estimate++;
+    if (vel)
+      _data_in_estimate++;
   } else {
     if (atom.charge()==false && charge)
       _data_in_estimate++;
     if (atom.quaternion()==false && rot)
       _data_in_estimate++;
-    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial))
+    if (atom.velocity()==false && vel)
+      _data_in_estimate++;
+    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
       return -3;
   }
   
@@ -318,7 +322,7 @@ void DeviceT::set_double_precompute
 
 template <class numtyp, class acctyp>
 void DeviceT::init_message(FILE *screen, const char *name,
-                                  const int first_gpu, const int last_gpu) {
+                           const int first_gpu, const int last_gpu) {
   #if defined(USE_OPENCL)
   std::string fs="";
   #elif defined(USE_CUDART)
@@ -330,7 +334,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
   if (_replica_me == 0 && screen) {
     fprintf(screen,"\n-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
-    fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
+    fprintf(screen,"- Using acceleration for %s:\n",name);
     fprintf(screen,"-  with %d proc(s) per device.\n",_procs_per_gpu);
     #ifdef _OPENMP
     fprintf(screen,"-  with %d thread(s) per proc.\n",_nthreads);
@@ -361,7 +365,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
       } else
         sname+="Double Precision)";
 
-      fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
+      fprintf(screen,"Device %d: %s\n",i,sname.c_str());
     }
 
     fprintf(screen,"-------------------------------------");
@@ -371,8 +375,8 @@ void DeviceT::init_message(FILE *screen, const char *name,
 
 template <class numtyp, class acctyp>
 void DeviceT::estimate_gpu_overhead(const int kernel_calls, 
-                                           double &gpu_overhead,
-                                           double &gpu_driver_overhead) {
+                                    double &gpu_overhead,
+                                    double &gpu_driver_overhead) {
   UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
   UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
   UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
@@ -506,16 +510,17 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);
+  double t_time=times[0]+times[1]+times[2]+times[3]+times[4];
 
   if (replica_me()==0)
     if (screen && times[5]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
-      fprintf(screen,"      GPU Time Info (average): ");
+      fprintf(screen,"    Device Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
-      if (time_device()) {
+      if (time_device() && t_time>0) {
         fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[4]/_replica_size);
         fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_replica_size);
@@ -527,7 +532,8 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
       }
       if (nbor.gpu_nbor()==2)
         fprintf(screen,"Neighbor (CPU):  %.4f s.\n",times[8]/_replica_size);
-      fprintf(screen,"GPU Overhead:    %.4f s.\n",times[5]/_replica_size);
+      if (times[5]>0)
+        fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
       fprintf(screen,"Threads / atom:  %d.\n",threads_per_atom);
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
@@ -541,14 +547,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
 
 template <class numtyp, class acctyp>
 void DeviceT::output_kspace_times(UCL_Timer &time_in, 
-                                         UCL_Timer &time_out,
-                                         UCL_Timer &time_map,
-                                         UCL_Timer &time_rho,
-                                         UCL_Timer &time_interp,
-                                         Answer<numtyp,acctyp> &ans, 
-                                         const double max_bytes, 
-                                         const double cpu_time, 
-                                         const double idle_time, FILE *screen) {
+                                  UCL_Timer &time_out,
+                                  UCL_Timer &time_map,
+                                  UCL_Timer &time_rho,
+                                  UCL_Timer &time_interp,
+                                  Answer<numtyp,acctyp> &ans, 
+                                  const double max_bytes, 
+                                  const double cpu_time, 
+                                  const double idle_time, FILE *screen) {
   double single[8], times[8];
 
   single[0]=time_out.total_seconds();
@@ -566,16 +572,17 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);
+  double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
 
   if (replica_me()==0)
     if (screen && times[6]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
-      fprintf(screen,"      GPU Time Info (average): ");
+      fprintf(screen,"      Device Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
-      if (time_device()) {
+      if (time_device() && t_time>0) {
         fprintf(screen,"Data Out:        %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Data In:         %.4f s.\n",times[1]/_replica_size);
         fprintf(screen,"Kernel (map):    %.4f s.\n",times[2]/_replica_size);
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index d7bc70edaf..9a767e74b3 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -77,7 +77,7 @@ class Device {
            const int nlocal, const int host_nlocal, const int nall,
            Neighbor *nbor, const int maxspecial, const int gpu_host,
            const int max_nbors, const double cell_size, const bool pre_cut,
-           const int threads_per_atom);
+           const int threads_per_atom, const bool vel=false);
 
   /// Initialize the device for Atom storage only
   /** \param nlocal Total number of local particles to allocate memory for