From b56bdd2d7bf3dac72bd98e713433ffa0152ac115 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Fri, 24 Jun 2016 13:55:14 -0400
Subject: [PATCH] remove trailing whitespace in gpu library

---
 lib/gpu/lal_answer.cpp | 46 ++--
 lib/gpu/lal_atom.cpp | 46 ++--
 lib/gpu/lal_atom.cu | 6 +-
 lib/gpu/lal_atom.h | 60 +++---
 lib/gpu/lal_balance.h | 18 +-
 lib/gpu/lal_base_atomic.cpp | 20 +-
 lib/gpu/lal_base_atomic.h | 22 +-
 lib/gpu/lal_base_charge.cpp | 16 +-
 lib/gpu/lal_base_charge.h | 12 +-
 lib/gpu/lal_base_dipole.cpp | 18 +-
 lib/gpu/lal_base_dipole.h | 12 +-
 lib/gpu/lal_base_dpd.cpp | 18 +-
 lib/gpu/lal_base_dpd.h | 12 +-
 lib/gpu/lal_base_ellipsoid.cpp | 34 +--
 lib/gpu/lal_base_ellipsoid.h | 28 +--
 lib/gpu/lal_beck.cpp | 10 +-
 lib/gpu/lal_beck.cu | 26 +--
 lib/gpu/lal_beck.h | 12 +-
 lib/gpu/lal_beck_ext.cpp | 6 +-
 lib/gpu/lal_born.cpp | 22 +-
 lib/gpu/lal_born.cu | 58 ++---
 lib/gpu/lal_born.h | 20 +-
 lib/gpu/lal_born_coul_long.cpp | 26 +--
 lib/gpu/lal_born_coul_long.cu | 268 +++++++++++------------
 lib/gpu/lal_born_coul_long.h | 20 +-
 lib/gpu/lal_born_coul_long_ext.cpp | 30 +--
 lib/gpu/lal_born_coul_wolf.cpp | 28 +--
 lib/gpu/lal_born_coul_wolf.cu | 64 +++---
 lib/gpu/lal_born_coul_wolf.h | 20 +-
 lib/gpu/lal_born_coul_wolf_ext.cpp | 22 +-
 lib/gpu/lal_born_ext.cpp | 28 +--
 lib/gpu/lal_buck.cpp | 28 +--
 lib/gpu/lal_buck.cu | 54 ++---
 lib/gpu/lal_buck.h | 18 +-
 lib/gpu/lal_buck_coul.cpp | 28 +--
 lib/gpu/lal_buck_coul.cu | 80 +++----
 lib/gpu/lal_buck_coul.h | 20 +-
 lib/gpu/lal_buck_coul_ext.cpp | 22 +-
 lib/gpu/lal_buck_coul_long.cpp | 24 +--
 lib/gpu/lal_buck_coul_long.cu | 276 ++++++++++++------------
 lib/gpu/lal_buck_coul_long.h | 14 +-
 lib/gpu/lal_buck_coul_long_ext.cpp | 18 +-
 lib/gpu/lal_buck_ext.cpp | 22 +-
 lib/gpu/lal_cg_cmm.cpp | 22 +-
 lib/gpu/lal_cg_cmm.cu | 44 ++--
 lib/gpu/lal_cg_cmm.h | 10 +-
 lib/gpu/lal_cg_cmm_ext.cpp | 12 +-
 lib/gpu/lal_cg_cmm_long.cpp | 24 +--
 lib/gpu/lal_cg_cmm_long.cu | 38 ++--
 lib/gpu/lal_cg_cmm_long.h | 12 +-
 lib/gpu/lal_cg_cmm_long_ext.cpp | 14 +-
 lib/gpu/lal_charmm_long.cpp | 20 +-
 lib/gpu/lal_charmm_long.cu | 42 ++--
 lib/gpu/lal_charmm_long.h | 12 +-
 lib/gpu/lal_charmm_long_ext.cpp | 10 +-
 lib/gpu/lal_colloid.cpp | 30 +--
 lib/gpu/lal_colloid.cu | 112 +++++-----
 lib/gpu/lal_colloid.h | 20 +-
 lib/gpu/lal_colloid_ext.cpp | 20 +-
 lib/gpu/lal_coul.cpp | 18 +-
 lib/gpu/lal_coul.cu | 38 ++--
 lib/gpu/lal_coul.h | 12 +-
 lib/gpu/lal_coul_debye.cpp | 16 +-
 lib/gpu/lal_coul_debye.cu | 26 +--
 lib/gpu/lal_coul_debye.h | 12 +-
 lib/gpu/lal_coul_debye_ext.cpp | 14 +-
 lib/gpu/lal_coul_dsf.cpp | 18 +-
 lib/gpu/lal_coul_dsf.cu | 56 ++---
 lib/gpu/lal_coul_dsf.h | 10 +-
 lib/gpu/lal_coul_dsf_ext.cpp | 20 +-
 lib/gpu/lal_coul_ext.cpp | 16 +-
 lib/gpu/lal_coul_long.cpp | 12 +-
 lib/gpu/lal_coul_long.cu | 18 +-
 lib/gpu/lal_coul_long.h | 6 +-
 lib/gpu/lal_coul_long_ext.cpp | 8 +-
 lib/gpu/lal_device.cpp | 108 +++++-----
 lib/gpu/lal_device.cu | 6 +-
 lib/gpu/lal_device.h | 68 +++---
 lib/gpu/lal_dipole_lj.cpp | 16 +-
 lib/gpu/lal_dipole_lj.cu | 88 ++++----
 lib/gpu/lal_dipole_lj.h | 8 +-
 lib/gpu/lal_dipole_lj_ext.cpp | 10 +-
 lib/gpu/lal_dipole_lj_sf.cpp | 20 +-
 lib/gpu/lal_dipole_lj_sf.cu | 122 +++++------
 lib/gpu/lal_dipole_lj_sf.h | 8 +-
 lib/gpu/lal_dipole_lj_sf_ext.cpp | 10 +-
 lib/gpu/lal_dpd.cpp | 26 +--
 lib/gpu/lal_dpd.cu | 84 ++++----
 lib/gpu/lal_dpd.h | 18 +-
 lib/gpu/lal_dpd_ext.cpp | 20 +-
 lib/gpu/lal_eam.cpp | 134 ++++++------
 lib/gpu/lal_eam.cu | 136 ++++++------
 lib/gpu/lal_eam.h | 54 ++---
 lib/gpu/lal_eam_alloy_ext.cpp | 32 +--
 lib/gpu/lal_eam_ext.cpp | 32 +--
 lib/gpu/lal_eam_fs_ext.cpp | 32 +--
 lib/gpu/lal_ellipsoid_extra.h | 14 +-
 lib/gpu/lal_ellipsoid_nbor.cu | 34 +--
 lib/gpu/lal_gauss.cpp | 18 +-
 lib/gpu/lal_gauss.cu | 60 +++---
 lib/gpu/lal_gauss.h | 18 +-
 lib/gpu/lal_gauss_ext.cpp | 22 +-
 lib/gpu/lal_gayberne.cpp | 66 +++---
 lib/gpu/lal_gayberne.cu | 72 +++----
 lib/gpu/lal_gayberne.h | 26 +--
 lib/gpu/lal_gayberne_ext.cpp | 20 +-
 lib/gpu/lal_gayberne_lj.cu | 130 ++++++------
 lib/gpu/lal_lj.cpp | 26 +--
 lib/gpu/lal_lj.cu | 60 +++---
 lib/gpu/lal_lj.h | 16 +-
 lib/gpu/lal_lj96.cpp | 14 +-
 lib/gpu/lal_lj96.cu | 50 ++---
 lib/gpu/lal_lj96.h | 10 +-
 lib/gpu/lal_lj96_ext.cpp | 6 +-
 lib/gpu/lal_lj_class2_long.cpp | 12 +-
 lib/gpu/lal_lj_class2_long.cu | 42 ++--
 lib/gpu/lal_lj_class2_long.h | 8 +-
 lib/gpu/lal_lj_class2_long_ext.cpp | 6 +-
 lib/gpu/lal_lj_coul.cpp | 20 +-
 lib/gpu/lal_lj_coul.cu | 46 ++--
 lib/gpu/lal_lj_coul.h | 8 +-
 lib/gpu/lal_lj_coul_debye.cpp | 18 +-
 lib/gpu/lal_lj_coul_debye.cu | 42 ++--
 lib/gpu/lal_lj_coul_debye.h | 8 +-
 lib/gpu/lal_lj_coul_debye_ext.cpp | 10 +-
 lib/gpu/lal_lj_coul_ext.cpp | 8 +-
 lib/gpu/lal_lj_coul_long.cpp | 18 +-
 lib/gpu/lal_lj_coul_long.cu | 38 ++--
 lib/gpu/lal_lj_coul_long.h | 10 +-
 lib/gpu/lal_lj_coul_long_ext.cpp | 16 +-
 lib/gpu/lal_lj_coul_msm.cpp | 20 +-
 lib/gpu/lal_lj_coul_msm.cu | 30 +--
 lib/gpu/lal_lj_coul_msm.h | 14 +-
 lib/gpu/lal_lj_coul_msm_ext.cpp | 8 +-
 lib/gpu/lal_lj_cubic.cpp | 22 +-
 lib/gpu/lal_lj_cubic.cu | 64 +++---
 lib/gpu/lal_lj_cubic.h | 16 +-
 lib/gpu/lal_lj_cubic_ext.cpp | 14 +-
 lib/gpu/lal_lj_dsf.cpp | 20 +-
 lib/gpu/lal_lj_dsf.cu | 46 ++--
 lib/gpu/lal_lj_dsf.h | 8 +-
 lib/gpu/lal_lj_dsf_ext.cpp | 8 +-
 lib/gpu/lal_lj_expand.cpp | 24 +--
 lib/gpu/lal_lj_expand.cu | 58 ++---
 lib/gpu/lal_lj_expand.h | 14 +-
 lib/gpu/lal_lj_expand_ext.cpp | 12 +-
 lib/gpu/lal_lj_ext.cpp | 12 +-
 lib/gpu/lal_lj_gromacs.cpp | 22 +-
 lib/gpu/lal_lj_gromacs.cu | 26 +--
 lib/gpu/lal_lj_gromacs.h | 12 +-
 lib/gpu/lal_lj_gromacs_ext.cpp | 12 +-
 lib/gpu/lal_mie.cpp | 12 +-
 lib/gpu/lal_mie.cu | 42 ++--
 lib/gpu/lal_mie.h | 12 +-
 lib/gpu/lal_mie_ext.cpp | 6 +-
 lib/gpu/lal_morse.cpp | 24 +--
 lib/gpu/lal_morse.cu | 48 ++---
 lib/gpu/lal_morse.h | 12 +-
 lib/gpu/lal_morse_ext.cpp | 12 +-
 lib/gpu/lal_neighbor_cpu.cu | 6 +-
 lib/gpu/lal_neighbor_gpu.cu | 72 +++---
 lib/gpu/lal_neighbor_shared.cpp | 2 +-
 lib/gpu/lal_neighbor_shared.h | 6 +-
 lib/gpu/lal_pppm.cpp | 44 ++--
 lib/gpu/lal_pppm.cu | 50 ++---
 lib/gpu/lal_pppm.h | 28 +--
 lib/gpu/lal_pppm_ext.cpp | 18 +-
 lib/gpu/lal_precision.h | 8 +-
 lib/gpu/lal_preprocessor.h | 20 +-
 lib/gpu/lal_re_squared.cpp | 52 ++---
 lib/gpu/lal_re_squared.cu | 40 ++--
 lib/gpu/lal_re_squared.h | 20 +-
 lib/gpu/lal_re_squared_ext.cpp | 20 +-
 lib/gpu/lal_re_squared_lj.cu | 140 ++++++------
 lib/gpu/lal_soft.cpp | 14 +-
 lib/gpu/lal_soft.cu | 32 +--
 lib/gpu/lal_soft.h | 14 +-
 lib/gpu/lal_soft_ext.cpp | 14 +-
 lib/gpu/lal_sw_ext.cpp | 24 +--
 lib/gpu/lal_table.cpp | 88 ++++----
 lib/gpu/lal_table.cu | 328 ++++++++++++++--------------
 lib/gpu/lal_table.h | 36 ++--
 lib/gpu/lal_table_ext.cpp | 12 +-
 lib/gpu/lal_yukawa.cpp | 16 +-
 lib/gpu/lal_yukawa.cu | 52 ++---
 lib/gpu/lal_yukawa.h | 16 +-
 lib/gpu/lal_yukawa_colloid.cpp | 62 +++---
 lib/gpu/lal_yukawa_colloid.cu | 62 +++---
 lib/gpu/lal_yukawa_colloid.h | 28 +--
 lib/gpu/lal_yukawa_colloid_ext.cpp | 24 +--
 lib/gpu/lal_yukawa_ext.cpp | 20 +-
 lib/gpu/lal_zbl.cpp | 24 +--
 lib/gpu/lal_zbl.cu | 76 +++---
 lib/gpu/lal_zbl.h | 20 +-
 lib/gpu/lal_zbl_ext.cpp | 18 +-
 195 files changed, 3257 insertions(+), 3257 deletions(-)

diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp
index dd0b5d2424..bd8c7ef843 100644
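Every hunk in the body of this patch deletes a line that ends in spaces or tabs and re-adds it without them; the code is otherwise untouched, which is why the insertion and deletion counts are identical (3257 each). The commit does not record how the cleanup was produced. As a rough illustration of the operation itself, here is a minimal C++ filter that strips trailing whitespace from a stream (illustrative only, not the tool used for this commit):

    #include <iostream>
    #include <string>

    // Remove trailing spaces and tabs from every line of stdin.
    int main() {
      std::string line;
      while (std::getline(std::cin, line)) {
        const std::size_t last = line.find_last_not_of(" \t");
        if (last == std::string::npos)
          line.clear();          // the line was entirely whitespace
        else
          line.erase(last + 1);  // drop everything after the last non-blank
        std::cout << line << '\n';
      }
      return 0;
    }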
--- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,7 +24,7 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false), } template -int AnswerT::bytes_per_atom() const { +int AnswerT::bytes_per_atom() const { int bytes=11*sizeof(acctyp); if (_rot) bytes+=4*sizeof(acctyp); @@ -38,19 +38,19 @@ bool AnswerT::alloc(const int inum) { _max_local=static_cast(static_cast(inum)*1.10); bool success=true; - + _ans_fields=4; if (_rot) _ans_fields+=4; - + // --------------------------- Device allocations success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); - - _allocated=true; + + _allocated=true; return success; } @@ -69,21 +69,21 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot, if (_charge) _e_fields++; _ev_fields=6+_e_fields; - + // Initialize atom and nbor data int ef_inum=inum; if (ef_inum==0) ef_inum=1000; - + // Initialize timers for the selected device time_answer.init(*dev); time_answer.zero(); _time_cast=0.0; _time_cpu_idle=0.0; - + return success && alloc(ef_inum); } - + template bool AnswerT::add_fields(const bool charge, const bool rot) { bool realloc=false; @@ -127,15 +127,15 @@ void AnswerT::clear() { template double AnswerT::host_memory_usage() const { int atom_bytes=4; - if (_charge) + if (_charge) atom_bytes+=1; - if (_rot) + if (_rot) atom_bytes+=4; int ans_bytes=atom_bytes+_ev_fields; return ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(Answer); } - + template void AnswerT::copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom) { @@ -144,8 +144,8 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag, _vflag=vflag; _ef_atom=ef_atom; _vf_atom=vf_atom; - - int csize=_ev_fields; + + int csize=_ev_fields; if (!eflag) csize-=_e_fields; if (!vflag) @@ -180,7 +180,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom, for (int i=0; i<_inum; i++) evdwl+=engv[i]; if (_ef_atom) - if (_ilist==NULL) + if (_ilist==NULL) for (int i=0; i<_inum; i++) eatom[i]+=engv[i]; else @@ -196,18 +196,18 @@ double AnswerT::energy_virial(double *eatom, double **vatom, if (_vf_atom) if (_ilist==NULL) { int ii=0; - for (int i=vstart; i -int AtomT::bytes_per_atom() const { +int AtomT::bytes_per_atom() const { int id_space=0; if (_gpu_nbor==1) id_space=2; @@ -51,7 +51,7 @@ bool AtomT::alloc(const int nall) { _max_atoms=static_cast(static_cast(nall)*1.10); bool success=true; - + // Ignore host/device transfers? 
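An aside on the context visible in the lal_answer.cpp and lal_atom.cpp hunks above: both classes size their device buffers to 1.10x the requested count (_max_local, _max_atoms) inside alloc(), so small fluctuations in the local atom count between reneighbor steps do not force a reallocation every timestep. A minimal host-side sketch of that growth idiom; DeviceBuffer and its std::vector backing are hypothetical stand-ins for the library's UCL containers:

    #include <vector>

    // Grow-by-10% reallocation idiom, as in Answer::alloc() and Atom::alloc().
    // DeviceBuffer is a stand-in; the real code allocates device memory.
    template <class T>
    struct DeviceBuffer {
      std::vector<T> data;
      int capacity = 0;

      // Returns true if this call had to reallocate.
      bool reserve_for(int n) {
        if (n <= capacity) return false;
        capacity = static_cast<int>(static_cast<double>(n) * 1.10);
        data.resize(capacity);
        return true;
      }
    };

The resize() helpers in the headers above likewise report whether any call during the timestep triggered a reallocation.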
_host_view=false; if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) { @@ -60,11 +60,11 @@ bool AtomT::alloc(const int nall) { assert(0==1); #endif } - + // Allocate storage for CUDPP sort #ifdef USE_CUDPP if (_gpu_nbor==1) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -110,7 +110,7 @@ bool AtomT::alloc(const int nall) { } else { success=success && (host_particle_id.alloc(_max_atoms,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && + success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); } if (_gpu_nbor==2 && _host_view) @@ -124,8 +124,8 @@ bool AtomT::alloc(const int nall) { gpu_bytes+=x.device.row_bytes(); if (gpu_bytes>_max_gpu_bytes) _max_gpu_bytes=gpu_bytes; - - _allocated=true; + + _allocated=true; return success; } @@ -135,7 +135,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, bool success=true; // Ignore host/device transfers? int gpu_bytes=0; - + if (charge && _charge==false) { _charge=true; _other=true; @@ -179,7 +179,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, _gpu_nbor=gpu_nbor; #ifdef USE_CUDPP if (_gpu_nbor==1) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -198,9 +198,9 @@ bool AtomT::add_fields(const bool charge, const bool rot, } else { success=success && (host_particle_id.alloc(_max_atoms,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && + success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); - } + } } return success; @@ -230,7 +230,7 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - + // Initialize timers for the selected device time_pos.init(*dev); time_q.init(*dev); @@ -241,14 +241,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_quat.zero(); time_vel.zero(); _time_cast=0.0; - + #ifdef GPU_CAST compile_kernels(*dev); #endif - + return success && alloc(ef_nall); } - + template void AtomT::clear_resize() { if (!_allocated) @@ -274,7 +274,7 @@ void AtomT::clear_resize() { #ifdef USE_CUDPP if (_gpu_nbor==1) cudppDestroyPlan(sort_plan); #endif - + if (_gpu_nbor==2) { host_particle_id.clear(); host_cell_id.clear(); @@ -305,21 +305,21 @@ void AtomT::clear() { template double AtomT::host_memory_usage() const { int atom_bytes=4; - if (_charge) + if (_charge) atom_bytes+=1; - if (_rot) + if (_rot) atom_bytes+=4; - if (_vel) + if (_vel) atom_bytes+=4; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } - + // Sort arrays for neighbor list calculation template void AtomT::sort_neighbor(const int num_atoms) { #ifdef USE_CUDPP - CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), - (int *)dev_particle_id.begin(), + CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), + (int *)dev_particle_id.begin(), 8*sizeof(unsigned), num_atoms); if (CUDPP_SUCCESS != result) { printf("Error in cudppSort\n"); diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu index 2a78719ffb..28ff31c566 100644 --- a/lib/gpu/lal_atom.cu +++ b/lib/gpu/lal_atom.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// 
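The lal_atom.cpp hunks above also touch the optional CUDPP path: when neighboring runs on the device (_gpu_nbor==1), alloc() creates a cudppPlan() sized for _max_atoms, and sort_neighbor() radix-sorts the cell ids with the particle ids attached as values, using 8*sizeof(unsigned) key bits. A self-contained sketch of that plan/sort/destroy sequence follows; the CUDPPConfiguration field values are assumptions, since the patch never shows how sort_config is initialized:

    #include <cstdio>
    #include "cudpp.h"

    // Key-value radix sort of cell ids / particle ids, mirroring
    // Atom::alloc() and Atom::sort_neighbor() above.
    bool sort_cells(unsigned *d_cell_id, int *d_particle_id,
                    int max_atoms, int num_atoms) {
      CUDPPConfiguration config;
      config.algorithm = CUDPP_SORT_RADIX;             // assumed
      config.datatype  = CUDPP_UINT;                   // assumed
      config.op        = CUDPP_ADD;                    // unused by the sort
      config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS; // assumed

      CUDPPHandle plan;
      if (cudppPlan(&plan, config, max_atoms, 1, 0) != CUDPP_SUCCESS)
        return false;
      CUDPPResult result = cudppSort(plan, d_cell_id, d_particle_id,
                                     8 * sizeof(unsigned), num_atoms);
      if (result != CUDPP_SUCCESS)
        printf("Error in cudppSort\n");
      cudppDestroyPlan(plan);
      return result == CUDPP_SUCCESS;
    }

Note the real code keeps the plan alive across timesteps (created in alloc(), destroyed in clear_resize()) rather than rebuilding it per sort as this sketch does.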
begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,9 +17,9 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, +__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, const __global double *restrict x, - const __global int *restrict type, + const __global int *restrict type, const int nall) { int ii=GLOBAL_ID_X; diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 23112fe712..1b4e17d972 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -57,19 +57,19 @@ class Atom { /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } - + /// Memory usage per atom in this class - int bytes_per_atom() const; + int bytes_per_atom() const; /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions * \param gpu_nbor 0 if neighboring will be performed on host * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ - bool init(const int nall, const bool charge, const bool rot, - UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, + bool init(const int nall, const bool charge, const bool rot, + UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, const bool vel=false); - + /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ inline bool resize(const int nall, bool &success) { @@ -81,7 +81,7 @@ class Atom { } return _resized; } - + /// If already initialized by another LAMMPS style, add fields as necessary /** \param rot True if atom storage needs quaternions * \param gpu_nbor 0 if neighboring will be performed on host @@ -89,28 +89,28 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, const bool bonds, const bool vel=false); - + /// Returns true if GPU is using charges bool charge() { return _charge; } - + /// Returns true if GPU is using quaternions bool quaternion() { return _rot; } - + /// Returns true if GPU is using velocities bool velocity() { return _vel; } /// Only free matrices of length inum or nall for resizing void clear_resize(); - + /// Free all memory on host and device void clear(); - + /// Return the total amount of host memory used by class in bytes double host_memory_usage() const; /// Sort arrays for neighbor list calculation on device void sort_neighbor(const int num_atoms); - + /// Add copy times to timers inline void acc_timers() { time_pos.add_to_total(); @@ -150,18 +150,18 @@ class Atom { total+=time_vel.total_seconds(); time_vel.zero_total(); } - + return total+_time_transfer/1000.0; } - + /// Return the total time for data cast/pack /** Zeros the time so that atom times are only included once **/ - inline double cast_time() + inline double cast_time() { double t=_time_cast; _time_cast=0.0; return t; } /// Pack LAMMPS atom type constants into matrix and copy to device template - inline void type_pack1(const int n, const int m_size, + inline void type_pack1(const int n, const int m_size, UCL_D_Vec &dev_v, 
UCL_H_Vec &buffer, t1 **one) { int ii=0; @@ -215,7 +215,7 @@ class Atom { view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); ucl_copy(dev_v,view,false); } - + /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device template inline void type_pack4(const int n, const int m_size, @@ -239,7 +239,7 @@ class Atom { /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device template - inline void self_pack2(const int n, UCL_D_Vec &dev_v, + inline void self_pack2(const int n, UCL_D_Vec &dev_v, UCL_H_Vec &buffer, t1 **one, t2 **two) { for (int i=0; i(one[i][i]); @@ -279,7 +279,7 @@ class Atom { /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - inline void add_x_data(double **host_ptr, int *host_type) { + inline void add_x_data(double **host_ptr, int *host_type) { time_pos.start(); if (_x_avail==false) { #ifdef GPU_CAST @@ -376,7 +376,7 @@ class Atom { /// Copy velocities and tags to device asynchronously /** Copies nall() elements **/ - inline void add_v_data(double **host_ptr, tagint *host_tag) { + inline void add_v_data(double **host_ptr, tagint *host_tag) { time_vel.start(); if (_v_avail==false) { #ifdef GPU_CAST @@ -407,8 +407,8 @@ class Atom { inline void add_transfer_time(double t) { _time_transfer+=t; } /// Return number of bytes used on device - inline double max_gpu_bytes() - { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } + inline double max_gpu_bytes() + { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } /// Returns true if the device is addressing memory on the host inline bool host_view() { return _host_view; } @@ -422,7 +422,7 @@ class Atom { /// Quaterions UCL_Vector quat; /// Velocities - UCL_Vector v; + UCL_Vector v; #ifdef GPU_CAST UCL_Vector x_cast; @@ -436,7 +436,7 @@ class Atom { /// Atom tag information for device nbor builds UCL_D_Vec dev_tag; - + /// Cell list identifiers for hybrid nbor builds UCL_H_Vec host_cell_id; /// Cell list identifiers for hybrid nbor builds @@ -444,7 +444,7 @@ class Atom { /// Device timers UCL_Timer time_pos, time_q, time_quat, time_vel; - + /// Geryon device UCL_Device *dev; @@ -456,19 +456,19 @@ class Atom { #endif bool _compiled; - + // True if data has been copied to device already bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; bool alloc(const int nall); - + bool _allocated, _rot, _charge, _bonds, _vel, _other; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; - + double _max_gpu_bytes; - + #ifdef USE_CUDPP CUDPPConfiguration sort_config; CUDPPHandle sort_plan; diff --git a/lib/gpu/lal_balance.h b/lib/gpu/lal_balance.h index cf09cf86fb..e90e94bee1 100644 --- a/lib/gpu/lal_balance.h +++ b/lib/gpu/lal_balance.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -44,7 +44,7 @@ class Balance { _init_done=false; } } - + /// Return the timestep since initialization inline int timestep() { return _timestep; } @@ -96,7 +96,7 @@ class Balance { inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } } /// Calculate the new host/device split based on the cpu and device times - /** \note Only does calculation every _HD_BALANCE_EVERY timesteps + /** \note Only does calculation every _HD_BALANCE_EVERY timesteps (and first 10) **/ inline void balance(const double 
cpu_time); @@ -105,13 +105,13 @@ class Balance { balance(cpu_time); return get_gpu_count(ago,inum_full); } - + private: Device *_device; UCL_Timer _device_time; bool _init_done; int _gpu_nbor; - + bool _load_balance; double _actual_split, _avg_split, _desired_split, _max_split; int _avg_count; @@ -123,15 +123,15 @@ class Balance { #define BalanceT Balance template -void BalanceT::init(Device *gpu, +void BalanceT::init(Device *gpu, const int gpu_nbor, const double split) { clear(); _gpu_nbor=gpu_nbor; _init_done=true; - + _device=gpu; _device_time.init(*gpu->gpu); - + if (split<0.0) { _load_balance=true; _desired_split=0.90; @@ -163,7 +163,7 @@ int BalanceT::get_gpu_count(const int ago, const int inum_full) { _timestep++; return _inum; } - + template void BalanceT::balance(const double cpu_time) { if (_measure_this_step) { diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index 191f218bd8..e59dae1a6f 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -9,10 +9,10 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ - + #include "lal_base_atomic.h" using namespace LAMMPS_AL; #define BaseAtomicT BaseAtomic @@ -63,13 +63,13 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); if (success!=0) return success; - + ucl_device=device->gpu; atom=&device->atom; @@ -139,7 +139,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist, double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; - + return ilist; } @@ -188,7 +188,7 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -217,7 +217,7 @@ template int ** BaseAtomicT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -230,12 +230,12 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -255,7 +255,7 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h index eaf55f46e2..e3e9829abc 100644 --- a/lib/gpu/lal_base_atomic.h +++ b/lib/gpu/lal_base_atomic.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov 
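The lal_balance.h hunks above show the host/device split machinery: a fixed split can be passed in, or, when split<0, Balance::init() starts the device at a 0.90 fraction and balance() re-derives the split from measured CPU and device times every _HD_BALANCE_EVERY timesteps. The exact update rule is not part of these hunks, so the following is only a generic sketch of how such a dynamic split is steered toward equal host and device times; the names and the smoothing factor are assumptions:

    // Generic dynamic host/device split (not LAMMPS's exact rule).
    struct SplitBalancer {
      double split = 0.90;  // initial device fraction, as in Balance::init()

      void update(double cpu_time, double gpu_time) {
        // The device handled `split` of the work in gpu_time and the host
        // handled the rest in cpu_time; this is the fraction that would
        // have equalized the two times.
        const double denom = split * cpu_time + (1.0 - split) * gpu_time;
        if (denom <= 0.0) return;
        const double ideal = split * cpu_time / denom;
        const double alpha = 0.25;  // smoothing factor (assumed)
        split = (1.0 - alpha) * split + alpha * ideal;
      }

      // Local atoms the device owns this step, cf. get_gpu_count().
      int gpu_count(int inum_full) const {
        return static_cast<int>(split * inum_full);
      }
    };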
***************************************************************************/ @@ -41,7 +41,7 @@ class BaseAtomic { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -49,8 +49,8 @@ class BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init_atomic(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, const void *pair_program, const char *k_name); /// Estimate the overhead for GPU context changes and CPU driver @@ -80,7 +80,7 @@ class BaseAtomic { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -119,7 +119,7 @@ class BaseAtomic { /// Build neighbor list on device void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success); /// Pair loop with host neighboring @@ -133,19 +133,19 @@ class BaseAtomic { int * compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); /// Pair loop with device neighboring int ** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index e7fe2b62f4..c6341f7d57 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -64,7 +64,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); @@ -153,7 +153,7 @@ template inline void 
BaseChargeT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -192,7 +192,7 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -226,7 +226,7 @@ template int** BaseChargeT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -240,12 +240,12 @@ int** BaseChargeT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -271,7 +271,7 @@ int** BaseChargeT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index e791507432..64c19554b9 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -42,7 +42,7 @@ class BaseCharge { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -83,7 +83,7 @@ class BaseCharge { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -137,12 +137,12 @@ class BaseCharge { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 12e3b20d96..478f0092c7 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
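All of the compute() entry points in the base classes above (atomic, charge, dipole) share one per-step skeleton: accumulate the previous step's timers, let hd_balancer decide how many local atoms the device owns, rebuild the device neighbor list only on reneighbor steps (ago==0), and otherwise just re-cast coordinates before launching the force kernel. A compilable outline of that control flow; every function here is a stub standing in for the class methods visible in the hunks:

    // Placeholder hooks for the device operations in BaseCharge::compute().
    static void acc_timers() {}
    static void zero_timers() {}
    static bool build_nbor_list(int /*inum*/, int /*host_inum*/) { return true; }
    static void cast_and_copy_positions() {}
    static void run_force_kernel() {}
    static void copy_answers_back() {}
    static int  balance_gpu_count(int /*ago*/, int inum_full) { return inum_full; }

    // Rebuild neighbors only when reneighboring happened (ago == 0);
    // otherwise the old device list is still valid and only positions move.
    bool compute_step(int ago, int inum_full) {
      acc_timers();
      if (inum_full == 0) {   // no local atoms: keep timer bookkeeping sane
        zero_timers();
        return true;
      }
      const int inum = balance_gpu_count(ago, inum_full);
      if (ago == 0) {
        if (!build_nbor_list(inum, inum_full - inum)) return false;
      } else {
        cast_and_copy_positions();
      }
      run_force_kernel();
      copy_answers_back();
      return true;
    }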
__________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -65,7 +65,7 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); @@ -155,7 +155,7 @@ template inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -194,7 +194,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -230,12 +230,12 @@ template int** BaseDipoleT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double *host_q, double **host_mu, + double *host_q, double **host_mu, double *boxlo, double *prd) { acc_timers(); if (inum_full==0) { @@ -245,12 +245,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -279,7 +279,7 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h index 2e495c8747..b51c4303cf 100644 --- a/lib/gpu/lal_base_dipole.h +++ b/lib/gpu/lal_base_dipole.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -40,7 +40,7 @@ class BaseDipole { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -82,7 +82,7 @@ class BaseDipole { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -136,12 +136,12 @@ class BaseDipole { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int 
&host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double **mu, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index 0efb68a9fb..941f463b14 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -64,7 +64,7 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom,true); @@ -153,7 +153,7 @@ template inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -182,7 +182,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, tagint *tag, double **host_v, + bool &success, tagint *tag, double **host_v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { acc_timers(); @@ -193,7 +193,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -228,12 +228,12 @@ template int** BaseDPDT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd) { acc_timers(); @@ -244,12 +244,12 @@ int** BaseDPDT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -276,7 +276,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 97640ed40e..7a75282d0a 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -40,7 +40,7 @@ class BaseDPD { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -81,7 +81,7 @@ class BaseDPD { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local 
particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -129,20 +129,20 @@ class BaseDPD { int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, tagint *tag, - double **v, const double dtinvsqrt, const int seed, + double **v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd); /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 4200c02e1c..8918a3140c 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -70,7 +70,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - + int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,true, 1); @@ -113,7 +113,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, return -8; if (_multiple_forms && gpu_nbor!=0) return -9; - + if (_multiple_forms) ans->force.zero(); @@ -142,7 +142,7 @@ void BaseEllipsoidT::clear_base() { // Output any timing information output_times(); host_olist.clear(); - + if (_compiled) { k_nbor_fast.clear(); k_nbor.clear(); @@ -156,7 +156,7 @@ void BaseEllipsoidT::clear_base() { delete lj_program; _compiled=false; } - + time_nbor1.clear(); time_ellipsoid.clear(); time_nbor2.clear(); @@ -230,7 +230,7 @@ void BaseEllipsoidT::output_times() { if (times[6]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); @@ -241,10 +241,10 @@ void BaseEllipsoidT::output_times() { } // --------------------------------------------------------------------------- -// Pack neighbors to limit thread divergence for lj-lj and ellipse +// Pack neighbors to limit thread divergence for lj-lj and ellipse // --------------------------------------------------------------------------- template -void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, +void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, const int inum, const int form_low, const int form_high, const bool shared_types, int ntypes) { @@ -264,18 +264,18 @@ void 
BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, // Copy neighbor list from host // --------------------------------------------------------------------------- template -void BaseEllipsoidT::reset_nbors(const int nall, const int inum, +void BaseEllipsoidT::reset_nbors(const int nall, const int inum, const int osize, int *ilist, int *numj, int *type, int **firstneigh, bool &success) { success=true; - + int mn=nbor->max_nbor_loop(osize,numj,ilist); resize_atom(nall,success); resize_local(inum,0,mn,osize,success); if (!success) return; - + if (_multiple_forms) { int p=0; for (int i=0; i inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -354,7 +354,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, zero_timers(); return NULL; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -394,7 +394,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { @@ -410,7 +410,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall ans->inum(inum); _last_ellipse=std::min(inum,_max_last_ellipse); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -419,7 +419,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall return NULL; atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); - } else { + } else { atom->cast_x_data(host_x,host_type); atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); @@ -444,9 +444,9 @@ double BaseEllipsoidT::host_memory_usage_base() const { } template -void BaseEllipsoidT::compile_kernels(UCL_Device &dev, +void BaseEllipsoidT::compile_kernels(UCL_Device &dev, const void *ellipsoid_string, - const void *lj_string, + const void *lj_string, const char *kname, const bool e_s) { if (_compiled) return; diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index e289430f43..7deeccbf44 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -42,7 +42,7 @@ class BaseEllipsoid { * \param gpu_split fraction of particles handled by device * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -68,7 +68,7 @@ class BaseEllipsoid { quat_tex.bind_float(atom->quat,4); lj_pos_tex.bind_float(atom->x,4); lj_quat_tex.bind_float(atom->quat,4); - } + } } /// Check if there is enough storage for neighbors and realloc if not @@ -78,7 +78,7 @@ class BaseEllipsoid { * \param olist_size size of list of particles from CPU neighboring * \note host_inum is 0 if the host is performing neighboring * \note if GPU is neighboring nlocal+host_inum=total number local particles - * \note if CPU is neighboring olist_size=total number of local particles + * \note if CPU is neighboring 
olist_size=total number of local particles * \note if GPU is neighboring olist_size=0 **/ inline void resize_local(const int nlocal, const int host_inum, const int max_nbors, const int olist_size, @@ -101,7 +101,7 @@ class BaseEllipsoid { /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear_base(); - + /// Output any timing information void output_times(); @@ -130,7 +130,7 @@ class BaseEllipsoid { ans->acc_timers(); } } - + /// Zero timers inline void zero_timers() { time_nbor1.zero(); @@ -148,9 +148,9 @@ class BaseEllipsoid { ans->zero_timers(); } - /// Pack neighbors to limit thread divergence for lj-lj and ellipse + /// Pack neighbors to limit thread divergence for lj-lj and ellipse void pack_nbors(const int GX, const int BX, const int start, const int inum, - const int form_low, const int form_high, + const int form_low, const int form_high, const bool shared_types, int ntypes); /// Copy neighbor list from host @@ -174,17 +174,17 @@ class BaseEllipsoid { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); /// Build neighbor list on accelerator - void build_nbor_list(const int inum, const int host_inum, const int nall, + void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, bool &success); - - // -------------------------- DEVICE DATA ------------------------- + + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; @@ -207,7 +207,7 @@ class BaseEllipsoid { /// Atom Data Atom *atom; - // --------------------------- TYPE DATA -------------------------- + // --------------------------- TYPE DATA -------------------------- /// cut_form.x = cutsq, cut_form.y = form UCL_D_Vec cut_form; @@ -240,7 +240,7 @@ class BaseEllipsoid { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - // True if we want to use fast GB-sphere or sphere-sphere calculations + // True if we want to use fast GB-sphere or sphere-sphere calculations bool _multiple_forms; int **_host_form; int _last_ellipse, _max_last_ellipse; diff --git a/lib/gpu/lal_beck.cpp b/lib/gpu/lal_beck.cpp index 062c095957..165a02b71a 100644 --- a/lib/gpu/lal_beck.cpp +++ b/lib/gpu/lal_beck.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,17 +33,17 @@ BeckT::Beck() : BaseAtomic(), _allocated(false) { } template -BeckT::~Beck() { +BeckT::~Beck() { clear(); } - + template int BeckT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BeckT::init(const int ntypes, +int BeckT::init(const int ntypes, double **host_cutsq, double **host_aa, double **host_alpha, double **host_beta, double **host_AA, double **host_BB, @@ -126,7 +126,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int 
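The base_ellipsoid hunks above repeatedly reference packing neighbors "to limit thread divergence for lj-lj and ellipse": when threads of one warp disagree on which branch (ellipsoid-ellipsoid vs. sphere-sphere math) to take, the branches serialize, so pack_nbors() groups each atom's neighbors by interaction form before the force kernels run. The library does this on the device over [form_low, form_high] ranges; a host-side C++ illustration of the same grouping, with assumed form codes:

    #include <algorithm>
    #include <vector>

    enum Form { SPHERE_SPHERE, ELLIPSE_SPHERE, ELLIPSE_ELLIPSE };  // assumed

    // Group one atom's neighbor indices so ellipsoid partners come first.
    // A kernel can then sweep [begin, split) and [split, end) with a
    // uniform branch in each range instead of diverging per neighbor.
    void pack_by_form(std::vector<int> &nbors,
                      const std::vector<Form> &form_of) {
      std::stable_partition(nbors.begin(), nbors.end(), [&](int j) {
        return form_of[j] == ELLIPSE_ELLIPSE;
      });
    }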
GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index 7ccefd8859..7d72128b5f 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,7 +24,7 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_beck(const __global numtyp4 *restrict x_, +__kernel void k_beck(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict beck1, const __global numtyp4 *restrict beck2, const int lj_types, @@ -50,20 +50,20 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -133,7 +133,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -143,7 +143,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, beck1[tid]=beck1_in[tid]; beck2[tid]=beck2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -152,7 +152,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_beck.h b/lib/gpu/lal_beck.h index fa56db2402..db26bebeb0 100644 --- a/lib/gpu/lal_beck.h +++ b/lib/gpu/lal_beck.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Beck : public BaseAtomic { public: Beck(); - ~Beck(); + ~Beck(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class Beck : public BaseAtomic { double **host_aa, double **host_alpha, double **host_beta, double **host_AA, double **host_BB, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -67,7 +67,7 @@ class Beck : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp index 28ca0df346..1552b640e8 100644 --- a/lib/gpu/lal_beck_ext.cpp +++ b/lib/gpu/lal_beck_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov 
***************************************************************************/ @@ -77,7 +77,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, cell_size, gpu_split, screen); BLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ int ** beck_gpu_compute_n(const int ago, const int inum_full, return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void beck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 55cb24d3b0..36898b3910 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BornT::Born() : BaseAtomic(), _allocated(false) { } template -BornT::~Born() { +BornT::~Born() { clear(); } - + template int BornT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,12 +44,12 @@ int BornT::bytes_per_atom(const int max_nbors) const { template int BornT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_born1, double **host_born2, + double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -102,14 +102,14 @@ void BornT::reinit(const int ntypes, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_offset) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, host_born1,host_born2,host_born3); this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, @@ -151,7 +151,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -169,7 +169,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, - &cutsq_sigma, &_lj_types, &sp_lj, + &cutsq_sigma, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu index 5f917be846..0ca7fea5fe 100644 --- a/lib/gpu/lal_born.cu +++ b/lib/gpu/lal_born.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // 
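The loop() implementations in the hunks above (beck, born) all compute their launch grid the same way: BX threads per block, _threads_per_atom threads cooperating on each atom, hence BX/_threads_per_atom atoms per block and enough blocks to cover ans->inum() atoms. That arithmetic, isolated as a plain C++ helper:

    #include <cmath>

    // Blocks needed so that `block_size` threads, grouped `t_per_atom`
    // to an atom, cover all `inum` atoms; mirrors the GX expression in
    // BeckT::loop() and BornT::loop() above.
    int grid_size(int inum, int block_size, int t_per_atom) {
      const int atoms_per_block = block_size / t_per_atom;
      return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                        atoms_per_block));
    }

    // Example: inum = 10000, block_size = 128, t_per_atom = 4
    //   -> 32 atoms per block -> grid_size = ceil(10000 / 32) = 313.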
__________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,16 +24,16 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_born(const __global numtyp4 *restrict x_, +__kernel void k_born(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp2 *restrict cutsq_sigma, - const int lj_types, - const __global numtyp *restrict sp_lj_in, + const int lj_types, + const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -51,20 +51,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); + energy+=factor_lj*(e-coeff2[mtype].w); } if (vflag>0) { virial[0] += delx*delx*force; @@ -113,20 +113,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_born_fast(const __global numtyp4 *restrict x_, +__kernel void k_born_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, + const __global numtyp4 *restrict coeff2_in, const __global numtyp2 *restrict cutsq_sigma, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -137,7 +137,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -146,7 +146,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); + energy+=factor_lj*(e-coeff2[mtype].w); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h index 6fed6461d2..685f4d87a9 100644 --- a/lib/gpu/lal_born.h +++ b/lib/gpu/lal_born.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Born : public BaseAtomic { public: Born(); - 
~Born(); + ~Born(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,20 +38,20 @@ class Born : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_born1, double **host_born2, + double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, - double **host_d, double **host_sigma, + double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -77,7 +77,7 @@ class Born : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp index 94becf8c69..242961e80c 100644 --- a/lib/gpu/lal_born_coul_long.cpp +++ b/lib/gpu/lal_born_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,17 +37,17 @@ template BornCoulLongT::~BornCoulLongT() { clear(); } - + template int BornCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **host_sigma, double **host_offset, +int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -85,11 +85,11 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_d,host_offset); - + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, host_cut_ljsq,host_sigma); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -142,7 +142,7 @@ void 
BornCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,15 +157,15 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->q, + &nbor_pitch, &this->atom->q, &cutsq_sigma, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu index 3d74f2087a..4cb4ea448f 100644 --- a/lib/gpu/lal_born_coul_long.cu +++ b/lib/gpu/lal_born_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_born_long(const __global numtyp4 *restrict x_, +__kernel void k_born_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - if (rsq < cut_coulsq) - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < coeff1[mtype].w) { - numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv - + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); - } // if ii -} - -__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int 
nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const int t_per_atom) { - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); - - __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[8]; - if (tid<8) - sp_lj[tid]=sp_lj_in[tid]; - if (tid0) - coeff2[tid]=coeff2_in[tid]; - } - - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); - - if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp4 *restrict cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double 
*host_special_coul, const double qqrd2e, const double g_ewald); @@ -59,12 +59,12 @@ class BornCoulLong : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.w = born3 UCL_D_Vec coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset UCL_D_Vec coeff2; - /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.z = sigma UCL_D_Vec cutsq_sigma; /// Special LJ values [0-3] and Special Coul values [4-7] @@ -73,7 +73,7 @@ class BornCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp index 382e9a2b2c..8c1ff0413f 100644 --- a/lib/gpu/lal_born_coul_long_ext.cpp +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,9 +30,9 @@ static BornCoulLong BORNCLMF; int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, - double **sigma, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, + double **sigma, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { @@ -58,10 +58,10 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->world_barrier(); @@ -78,14 +78,14 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ 
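Note: the coul/long variants above fold the Ewald real-space screening into the pair loop; the hunks show the accumulated e_coul += prefactor*(_erfc-factor_coul) term. For reference, a standalone C++ sketch of the underlying pair quantities follows, calling std::erfc where the kernels use the usual polynomial fit. qqrd2e = 332.06371 is the Coulomb conversion constant for LAMMPS "real" units; the other numbers and names are illustrative.

// Sketch of the real-space Ewald pair term accumulated by k_born_long et al.
//   E_coul(r)   = qqrd2e*qi*qj*erfc(g*r)/r
//   F_coul(r)*r = qqrd2e*qi*qj/r * (erfc(g*r) + (2/sqrt(pi))*g*r*exp(-(g*r)^2))
#include <cmath>
#include <cstdio>

const double EWALD_F = 1.12837917;   // 2/sqrt(pi), same constant as the kernels

double ewald_real(double qqrd2e, double qi, double qj, double r,
                  double g_ewald, double &forcecoul) {
  double grij = g_ewald*r;
  double expm2 = std::exp(-grij*grij);
  double erfc = std::erfc(grij);     // the kernels approximate this polynomially
  double prefactor = qqrd2e*qi*qj/r;
  forcecoul = prefactor*(erfc + EWALD_F*grij*expm2);  // F*r; kernel then *r2inv
  return prefactor*erfc;             // per-pair Coulomb energy
}

int main() {
  double f;
  double e = ewald_real(332.06371, 1.0, -1.0, 3.0, 0.3, f);
  std::printf("e_coul = %g  forcecoul = %g\n", e, f);
  return 0;
}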
void borncl_gpu_clear() { int** borncl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -112,7 +112,7 @@ int** borncl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void borncl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index 7615c1dd53..fa832206ee 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,17 +37,17 @@ template BornCoulWolfT::~BornCoulWolfT() { clear(); } - + template int BornCoulWolfT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **host_sigma, double **host_offset, +int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -85,11 +85,11 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_d,host_offset); - + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, host_cut_ljsq,host_sigma); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -144,7 +144,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,17 +157,17 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq_sigma, &_cut_coulsq, &_qqrd2e, - &_alf, &_e_shift, &_f_shift, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &_alf, &_e_shift, &_f_shift, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, 
&ainum, &nbor_pitch, &this->atom->q, &cutsq_sigma, &_cut_coulsq, - &_qqrd2e, &_alf, &_e_shift, &_f_shift, + &_qqrd2e, &_alf, &_e_shift, &_f_shift, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu index e7706b408a..0dc7d08c63 100644 --- a/lib/gpu/lal_born_coul_wolf.cu +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -31,21 +31,21 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_born_wolf(const __global numtyp4 *restrict x_, +__kernel void k_born_wolf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp alf, const numtyp e_shift, + const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -67,20 +67,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -108,12 +108,12 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp v_sh = (numtyp)0.0; numtyp rexp = (numtyp)0.0; - + if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; - forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; @@ -147,7 +147,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -165,20 +165,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, +__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, + const __global numtyp4 *restrict coeff2_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global 
acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp alf, const numtyp e_shift, + const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -193,7 +193,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -201,23 +201,23 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -244,12 +244,12 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp v_sh = (numtyp)0.0; numtyp rexp = (numtyp)0.0; - + if (rsq < cutsq_sigma[mtype].y) { numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; - forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; diff --git a/lib/gpu/lal_born_coul_wolf.h b/lib/gpu/lal_born_coul_wolf.h index 9e02d23233..4b2406b989 100644 --- a/lib/gpu/lal_born_coul_wolf.h +++ b/lib/gpu/lal_born_coul_wolf.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,19 +30,19 @@ class BornCoulWolf : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double alf, const double e_shift, @@ -60,12 +60,12 @@ class BornCoulWolf : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// coeff1.x = rhoinv, 
coeff1.y = born1, coeff1.z = born2, + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.w = born3 UCL_D_Vec coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset UCL_D_Vec coeff2; - /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.z = sigma UCL_D_Vec cutsq_sigma; /// Special LJ values [0-3] and Special Coul values [4-7] @@ -74,7 +74,7 @@ class BornCoulWolf : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift; diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index b56c526119..5083afe0c4 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,7 +28,7 @@ static BornCoulWolf BORNCWMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, + double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -60,9 +60,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, - host_cut_coulsq, host_special_coul, qqrd2e, + host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->world_barrier(); @@ -79,15 +79,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, - host_cut_coulsq, host_special_coul, qqrd2e, + host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -104,7 +104,7 @@ void borncw_gpu_clear() { int** borncw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -114,7 +114,7 @@ int** borncw_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, 
special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void borncw_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 6bd51e6d68..171020e769 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Born BORNMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, double **host_c, - double **host_d, double **sigma, + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -56,7 +56,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -75,13 +75,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BORNMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,24 +102,24 @@ void born_gpu_reinit(const int ntypes, double **host_rhoinv, int world_me=BORNMF.device->world_me(); int gpu_rank=BORNMF.device->gpu_rank(); int procs_per_gpu=BORNMF.device->procs_per_gpu(); - + if (world_me==0) BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, offset); - + BORNMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } void born_gpu_clear() { - BORNMF.clear(); + BORNMF.clear(); } int ** born_gpu_compute_n(const int ago, const int inum_full, @@ -132,7 +132,7 @@ int ** born_gpu_compute_n(const int ago, const int inum_full, return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void born_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index f66759ee3a..aa82f0014d 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
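Note: in the born/coul/wolf kernels diffed above, each atom also accumulates a Wolf-summation self-energy, visible in the hunks as e_self = -((acctyp)0.5*e_shift + alf/MY_PIS)*qtmp*qtmp*qqrd2e/t_per_atom; the division by t_per_atom and the later factor 2 are bookkeeping (atom i is split across t_per_atom threads, and stored pair energies are halved at the end). A standalone sketch of the physics, assuming the standard Wolf definitions e_shift = erfc(alpha*Rc)/Rc and the matching force shift; identifiers are illustrative.

// Sketch of the Wolf-summation constants and self-energy used by k_born_wolf.
#include <cmath>
#include <cstdio>

const double MY_PIS = 1.77245385090551602729;   // sqrt(pi), as in the kernels

struct Wolf { double alpha, e_shift, f_shift; };

Wolf make_wolf(double alpha, double rc) {
  double e_shift = std::erfc(alpha*rc)/rc;
  double f_shift = -(e_shift + 2.0*alpha/MY_PIS
                     * std::exp(-alpha*alpha*rc*rc))/rc;
  return {alpha, e_shift, f_shift};
}

// Per-atom self energy: E_self = -(e_shift/2 + alpha/sqrt(pi))*qqrd2e*q^2
double wolf_self(const Wolf &w, double qqrd2e, double q) {
  return -(0.5*w.e_shift + w.alpha/MY_PIS)*qqrd2e*q*q;
}

int main() {
  Wolf w = make_wolf(0.25, 9.0);                 // arbitrary alpha and cutoff
  std::printf("e_self(q=+1) = %g\n", wolf_self(w, 332.06371, 1.0));
  return 0;
}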
__________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BuckT::Buck() : BaseAtomic(), _allocated(false) { } template -BuckT::~Buck() { +BuckT::~Buck() { clear(); } - + template int BuckT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,11 +44,11 @@ int BuckT::bytes_per_atom(const int max_nbors) const { template int BuckT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -95,14 +95,14 @@ template void BuckT::reinit(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, host_buck1,host_buck2,host_cutsq); this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, @@ -143,7 +143,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -154,13 +154,13 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index 955547e598..c1e1c7d7e2 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_buck(const __global numtyp4 *restrict x_, +__kernel void k_buck(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, + const __global numtyp4 *restrict coeff2, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int 
*dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); + energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,19 +111,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_buck_fast(const __global numtyp4 *restrict x_, +__kernel void k_buck_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -134,7 +134,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -143,7 +143,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); + energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h index ebcd72d990..3b84066355 100644 --- a/lib/gpu/lal_buck.h +++ b/lib/gpu/lal_buck.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Buck : public BaseAtomic { public: Buck(); - ~Buck(); + ~Buck(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,18 +38,18 @@ class Buck : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double 
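Note: k_buck/k_buck_fast above evaluate the plain Buckingham potential; the hunks show rexp = ucl_exp(-r*coeff1.x) with coeff1.x = 1/rho and the energy a*rexp - c/r^6 - offset. A standalone C++ restatement follows, assuming buck1 = a/rho and buck2 = 6*c as in the CPU pair_buck; the sample parameters are arbitrary.

// Sketch of the Buckingham pair term evaluated by k_buck:
//   E(r)   = a*exp(-r/rho) - c/r^6 - offset
//   F(r)/r = ((a/rho)*r*exp(-r/rho) - 6c/r^6) / r^2
#include <cmath>
#include <cstdio>

double buck_pair(double a, double rho, double c, double offset,
                 double rsq, double &fpair) {
  double r2inv = 1.0/rsq;
  double r6inv = r2inv*r2inv*r2inv;
  double r = std::sqrt(rsq);
  double rexp = std::exp(-r/rho);
  fpair = ((a/rho)*r*rexp - 6.0*c*r6inv)*r2inv;  // buck1*r*rexp - buck2*r6inv
  return a*rexp - c*r6inv - offset;
}

int main() {
  double fpair;
  double e = buck_pair(1388.77, 0.3623, 175.0, 0.0, 2.5*2.5, fpair);
  std::printf("E = %g  F/r = %g\n", e, fpair);
  return 0;
}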
cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -72,7 +72,7 @@ class Buck : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp index bec640e7a6..9de019d871 100644 --- a/lib/gpu/lal_buck_coul.cpp +++ b/lib/gpu/lal_buck_coul.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BuckCoulT::BuckCoul() : BaseCharge(), _allocated(false) { } template -BuckCoulT::~BuckCoul() { +BuckCoulT::~BuckCoul() { clear(); } - + template int BuckCoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,11 +44,11 @@ int BuckCoulT::bytes_per_atom(const int max_nbors) const { template int BuckCoulT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { @@ -82,20 +82,20 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_offset); - + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq, host_cut_ljsq, host_cut_coulsq); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; host_write[i+4]=host_special_coul[i]; } ucl_copy(sp_lj,host_write,8,false); - + _qqrd2e = qqrd2e; - + _allocated=true; this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes(); return 0; @@ -135,7 +135,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,12 +147,12 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, &this->atom->q, + &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); 
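Note: the grid-size expression recurring in every loop() above, GX = ceil(inum/(BX/t_per_atom)), sizes the launch so that t_per_atom threads cooperate on each atom and a block of BX threads covers BX/t_per_atom atoms. A tiny sketch of the same integer arithmetic (names illustrative):

// Sketch of the grid sizing used by the loop() methods.
#include <cstdio>

int grid_size(int inum, int block_size, int t_per_atom) {
  int atoms_per_block = block_size/t_per_atom;          // threads split per atom
  return (inum + atoms_per_block - 1)/atoms_per_block;  // integer ceiling
}

int main() {
  // e.g. 10000 atoms, 128-thread blocks, 4 threads cooperating per atom
  std::printf("GX = %d\n", grid_size(10000, 128, 4));   // prints GX = 313
  return 0;
}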
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index 87604a02ea..6f0d414825 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul(const __global numtyp4 *restrict x_, +__kernel void k_buck_coul(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp4 *restrict cutsq, + const __global numtyp4 *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -63,21 +63,21 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -180,7 +180,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_buck_coul.h b/lib/gpu/lal_buck_coul.h index e4bf59107c..3f8428bfe1 100644 --- a/lib/gpu/lal_buck_coul.h +++ b/lib/gpu/lal_buck_coul.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class BuckCoul : public BaseCharge { public: BuckCoul(); - ~BuckCoul(); + ~BuckCoul(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,11 +38,11 @@ class BuckCoul : public BaseCharge { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, 
- const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); @@ -71,11 +71,11 @@ class BuckCoul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + numtyp _qqrd2e; - + private: bool _allocated; void loop(const bool _eflag, const bool _vflag); diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index dd696fc6bb..3335f4ba47 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,8 +28,8 @@ static BuckCoul BUCKCMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, @@ -57,9 +57,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -77,14 +77,14 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,12 +96,12 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } void buckc_gpu_clear() { - BUCKCMF.clear(); + BUCKCMF.clear(); } int ** buckc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -111,7 +111,7 @@ int ** buckc_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void buckc_gpu_compute(const int ago, const int inum_full, const 
int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp index 4aa720132a..bf9b5fb101 100644 --- a/lib/gpu/lal_buck_coul_long.cpp +++ b/lib/gpu/lal_buck_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template BuckCoulLongT::~BuckCoulLongT() { clear(); } - + template int BuckCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int BuckCoulLongT::bytes_per_atom(const int max_nbors) const { template int BuckCoulLongT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, double **host_offset, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -84,10 +84,10 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_offset); - + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -139,7 +139,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -150,16 +150,16 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq, &_cut_coulsq, &_qqrd2e, + &cutsq, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index fc68d12471..da3237a31f 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul_long(const __global numtyp4 *restrict 
x_, +__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - if (rsq < cut_coulsq) - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < coeff1[mtype].w) { - numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); - } // if ii -} - -__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, - const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald, - const int t_per_atom) { - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); - - __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[8]; - if (tid<8) - sp_lj[tid]=sp_lj_in[tid]; - if (tid0) - coeff2[tid]=coeff2_in[tid]; - } - - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); - - if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; + energy+=factor_lj*(e-coeff2[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int 
*dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, + const numtyp cut_coulsq, + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,11 +38,11 @@ class BuckCoulLong : public BaseCharge { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -71,7 +71,7 @@ class BuckCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index 9c0c331ee1..51e0d233d3 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,7 +28,7 @@ static BuckCoulLong BUCKCLMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, + double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -58,8 +58,8 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, 
cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,13 +77,13 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -100,7 +100,7 @@ void buckcl_gpu_clear() { int** buckcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -110,7 +110,7 @@ int** buckcl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void buckcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index 75c88e8dbe..36a780426c 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,8 +28,8 @@ static Buck BUCKMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -55,7 +55,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -73,12 +73,12 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,24 +98,24 @@ void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv, int world_me=BUCKMF.device->world_me(); int 
gpu_rank=BUCKMF.device->gpu_rank(); int procs_per_gpu=BUCKMF.device->procs_per_gpu(); - + if (world_me==0) BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset); - + BUCKMF.device->world_barrier(); for (int i=0; igpu_barrier(); } } void buck_gpu_clear() { - BUCKMF.clear(); + BUCKMF.clear(); } int ** buck_gpu_compute_n(const int ago, const int inum_full, @@ -128,7 +128,7 @@ int ** buck_gpu_compute_n(const int ago, const int inum_full, return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void buck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_cg_cmm.cpp b/lib/gpu/lal_cg_cmm.cpp index 96455888f0..11974e05e0 100644 --- a/lib/gpu/lal_cg_cmm.cpp +++ b/lib/gpu/lal_cg_cmm.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,23 +33,23 @@ CGCMMT::CGCMM() : BaseAtomic(), _allocated(false) { } template -CGCMMT::~CGCMM() { +CGCMMT::~CGCMM() { clear(); } - + template int CGCMMT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CGCMMT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int CGCMMT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -75,7 +75,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq, host_write[i]=0.0; lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); - this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, + this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, host_cg_type,host_lj1,host_lj2); lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); @@ -126,7 +126,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,7 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_cg_cmm.cu index 8f89f74d22..70d2ab6092 100644 --- a/lib/gpu/lal_cg_cmm.cu +++ b/lib/gpu/lal_cg_cmm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -24,15 
+24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii class CGCMM : public BaseAtomic { public: CGCMM(); - ~CGCMM(); + ~CGCMM(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class CGCMM : public BaseAtomic { int init(const int ntypes, double **host_cutsq, int **host_cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -66,7 +66,7 @@ class CGCMM : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _cmm_types; private: diff --git a/lib/gpu/lal_cg_cmm_ext.cpp b/lib/gpu/lal_cg_cmm_ext.cpp index 0d2c3d8fbf..2a00271736 100644 --- a/lib/gpu/lal_cg_cmm_ext.cpp +++ b/lib/gpu/lal_cg_cmm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static CGCMM CMMMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { CMMMF.clear(); @@ -55,7 +55,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, int init_ok=0; if (world_me==0) - init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,7 +78,7 @@ int cmm_gpu_init(const int ntypes, double 
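/* Host side, every loop() method in these files sizes its launch the same
   way: t_per_atom threads cooperate on each atom, a block of BX threads
   therefore covers BX/t_per_atom atoms, and GX is the ceiling of inum over
   that count -- which is all the static_cast/ceil expression above computes.
   The same computation in integer arithmetic (grid_size is an illustrative
   name): */
static int grid_size(int inum, int BX, int t_per_atom) {
  int atoms_per_block = BX / t_per_atom;                  // threads sharing one atom
  return (inum + atoms_per_block - 1) / atoms_per_block;  // ceil(inum/atoms_per_block)
}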
**cutsq, int **cg_types, maxspecial, cell_size, gpu_split, screen); CMMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -103,7 +103,7 @@ int** cmm_gpu_compute_n(const int ago, const int inum_full, return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void cmm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_cg_cmm_long.cpp b/lib/gpu/lal_cg_cmm_long.cpp index 92e6bd04b5..14b5b7622c 100644 --- a/lib/gpu/lal_cg_cmm_long.cpp +++ b/lib/gpu/lal_cg_cmm_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,22 +37,22 @@ template CGCMMLongT::~CGCMMLong() { clear(); } - + template int CGCMMLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CGCMMLongT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int CGCMMLongT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, - double **host_cut_ljsq, + double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { @@ -137,7 +137,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,13 +149,13 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, &this->atom->q, - &_cut_coulsq, &_qqrd2e, &_g_ewald, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_cg_cmm_long.cu index ae8b6cda47..f6942d1809 100644 --- a/lib/gpu/lal_cg_cmm_long.cu +++ b/lib/gpu/lal_cg_cmm_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -29,12 +29,12 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, 
+__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -70,7 +70,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; @@ -136,7 +136,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].y) { energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- lj3[mtype].w; - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -154,17 +154,17 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, + const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -179,7 +179,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, lj1[tid]=lj1_in[tid]; lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -187,16 +187,16 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_cg_cmm_long.h b/lib/gpu/lal_cg_cmm_long.h index bde5c79c74..aa0cbfbaf0 100644 --- a/lib/gpu/lal_cg_cmm_long.h +++ b/lib/gpu/lal_cg_cmm_long.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class CGCMMLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class CGCMMLong : public BaseCharge { int init(const int ntypes, double **host_cutsq, int ** cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE 
*screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -58,7 +58,7 @@ class CGCMMLong : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, + /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, UCL_D_Vec lj1; /// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset UCL_D_Vec lj3; @@ -68,7 +68,7 @@ class CGCMMLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_cg_cmm_long_ext.cpp b/lib/gpu/lal_cg_cmm_long_ext.cpp index 966588bf9b..2fa3f2aead 100644 --- a/lib/gpu/lal_cg_cmm_long_ext.cpp +++ b/lib/gpu/lal_cg_cmm_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static CGCMMLong CMMLMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, @@ -58,7 +58,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, int init_ok=0; if (world_me==0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); @@ -82,7 +82,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CMMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -99,7 +99,7 @@ void cmml_gpu_clear() { int** cmml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -109,7 +109,7 @@ int** cmml_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q,boxlo,prd); -} +} void cmml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index 157072dc22..9cd032b3c6 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS 
Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template CHARMMLongT::~CHARMMLong() { clear(); } - + template int CHARMMLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const { template int CHARMMLongT::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -144,7 +144,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -153,17 +153,17 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, - &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index dde50da300..244131f833 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -31,14 +31,14 @@ texture q_tex; __kernel void k_charmm_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_lj, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, @@ -61,7 +61,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; @@ -93,7 +93,7 @@ 
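/* The pos_tex/q_tex declarations and the fetch4()/fetch() calls seen in
   these .cu files are the library's dual-target indirection: built for CUDA,
   positions and charges are read through texture references for cached
   loads, while the "#define pos_tex x_" fallback seen earlier aliases the
   fetch straight to the global array for the other build paths. A condensed
   sketch of the CUDA-side idea using the legacy texture-reference API
   (pos_sketch, fetch_pos, and USE_TEX are illustrative names, not the
   library's): */
texture<float4, 1, cudaReadModeElementType> pos_sketch;  // bound to x_ at setup
__device__ float4 fetch_pos(int i, const float4 *x_) {
#ifdef USE_TEX
  return tex1Dfetch(pos_sketch, i);   // read through the texture cache
#else
  return x_[i];                       // plain global-memory load
#endif
}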
__kernel void k_charmm_long(const __global numtyp4 *restrict x_, force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); if (rsq > cut_lj_innersq) { switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ denom_lj; switch1 *= switch1; switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ @@ -130,7 +130,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, if (rsq > cut_lj_innersq) e *= switch1; energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -148,19 +148,19 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, const __global numtyp2 *restrict ljd_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, - const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_bothsq, const numtyp cut_ljsq, const numtyp cut_lj_innersq, const int t_per_atom) { int tid, ii, offset; @@ -174,7 +174,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, ljd[tid]=ljd_in[tid]; if (tid+BLOCK_BIO_PAIR cut_lj_innersq) { switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ denom_lj; switch1 *= switch1; switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ diff --git a/lib/gpu/lal_charmm_long.h b/lib/gpu/lal_charmm_long.h index 201a5c3694..011083db13 100644 --- a/lib/gpu/lal_charmm_long.h +++ b/lib/gpu/lal_charmm_long.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class CHARMMLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,12 +40,12 @@ class CHARMMLong : public BaseCharge { int init(const int ntypes, double host_cut_bothsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald, - const double cut_lj_innersq, const double denom_lj, + const double cut_lj_innersq, const double denom_lj, double **epsilon, double **sigma, const 
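/* The switch1/switch2 arithmetic above is the standard CHARMM smoothing of
   the LJ term between the inner cutoff r_in and the outer cutoff r_c.
   Assuming denom_lj carries its usual definition (r_c^2 - r_in^2)^3, the two
   quantities computed are

     S(r)    = (r_c^2 - r^2)^2 (r_c^2 + 2 r^2 - 3 r_in^2) / (r_c^2 - r_in^2)^3
     switch2 = 12 r^2 (r_c^2 - r^2) (r^2 - r_in^2) / (r_c^2 - r_in^2)^3

   S falls smoothly from 1 at r_in to 0 at r_c (both limits follow by direct
   substitution); the energy is scaled by S, as the "e *= switch1" line
   shows, and the force picks up the corresponding dS/dr contribution through
   switch2. */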
bool mix_arithmetic); /// Clear all host and device data @@ -70,7 +70,7 @@ class CHARMMLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e, _g_ewald, _denom_lj; diff --git a/lib/gpu/lal_charmm_long_ext.cpp b/lib/gpu/lal_charmm_long_ext.cpp index 807988a3e8..3f7445f306 100644 --- a/lib/gpu/lal_charmm_long_ext.cpp +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -87,7 +87,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, sigma, mix_arithmetic); CRMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -104,7 +104,7 @@ void crml_gpu_clear() { int** crml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -114,14 +114,14 @@ int** crml_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void crml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, + bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, diff --git a/lib/gpu/lal_colloid.cpp b/lib/gpu/lal_colloid.cpp index 28045217d3..fb2b643e5e 100644 --- a/lib/gpu/lal_colloid.cpp +++ b/lib/gpu/lal_colloid.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,23 +33,23 @@ ColloidT::Colloid() : BaseAtomic(), _allocated(false) { } template -ColloidT::~Colloid() { +ColloidT::~Colloid() { clear(); } - + template int ColloidT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int ColloidT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, double **host_a12, - double **host_a1, double **host_a2, - double **host_d1, double **host_d2, +int ColloidT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, double **host_a12, + double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int nlocal, const int nall, const int 
max_nbors, @@ -97,7 +97,7 @@ int ColloidT::init(const int ntypes, UCL_H_Vec dview_form(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); for (int i=0; iucl_device),UCL_READ_ONLY); for (int i=0; i(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -170,9 +170,9 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &colloid1, &colloid2, &form, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, + &colloid1, &colloid2, &form, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index a4d6c8bf33..89ba71deef 100644 --- a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,18 +24,18 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_colloid(const __global numtyp4 *restrict x_, +__kernel void k_colloid(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global numtyp4 *restrict colloid1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global numtyp4 *restrict colloid1, const __global numtyp4 *restrict colloid2, - const __global int *form, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *form, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -53,20 +53,20 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -176,22 +176,22 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, +__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global numtyp4 *restrict colloid1_in, + const __global numtyp4 *restrict colloid1_in, const __global numtyp4 *restrict colloid2_in, - const __global int *form_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global int *form_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 
lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; @@ -208,7 +208,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -217,7 +217,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_colloid.h b/lib/gpu/lal_colloid.h index 416beabcdf..dfbd4dbadd 100644 --- a/lib/gpu/lal_colloid.h +++ b/lib/gpu/lal_colloid.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Colloid : public BaseAtomic { public: Colloid(); - ~Colloid(); + ~Colloid(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class Colloid : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - double **host_a12, double **host_a1, double **host_a2, - double **host_d1, double **host_d2, double **host_sigma3, - double **host_sigma6, int **host_form, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -65,7 +65,7 @@ class Colloid : public BaseAtomic { UCL_D_Vec lj3; /// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2 UCL_D_Vec colloid1; - /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, + /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, /// colloid2.w = sigma6 UCL_D_Vec colloid2; /// form @@ -76,7 +76,7 @@ class Colloid : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp index ea83cb6417..f88ced8443 100644 --- a/lib/gpu/lal_colloid_ext.cpp +++ b/lib/gpu/lal_colloid_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -29,9 +29,9 @@ static Colloid COLLMF; // --------------------------------------------------------------------------- int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, - double **host_a12, double **host_a1, double **host_a2, - double **host_d1, double 
**host_d2, double **host_sigma3, + double **offset, double *special_lj, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -57,9 +57,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) - init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, - host_a2, host_d1, host_d2, host_sigma3, + host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,13 +78,13 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, host_a12, host_a1, host_a2, - host_d1, host_d2, host_sigma3, host_sigma6, host_form, + offset, special_lj, host_a12, host_a1, host_a2, + host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); COLLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -109,7 +109,7 @@ int ** colloid_gpu_compute_n(const int ago, const int inum_full, return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void colloid_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_coul.cpp b/lib/gpu/lal_coul.cpp index 53fb3dae82..a06a29e610 100644 --- a/lib/gpu/lal_coul.cpp +++ b/lib/gpu/lal_coul.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -37,7 +37,7 @@ template CoulT::~Coul() { clear(); } - + template int CoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -75,7 +75,7 @@ int CoulT::init(const int ntypes, double **host_scale, double **host_cutsq, scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); - + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); @@ -97,10 +97,10 @@ void CoulT::reinit(const int ntypes, double **host_scale) { // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); } @@ -138,7 +138,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,14 +149,14 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - 
&this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu index e955922a7c..503e674c81 100644 --- a/lib/gpu/lal_coul.cu +++ b/lib/gpu/lal_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndtrung@umich.edu // ***************************************************************************/ @@ -33,14 +33,14 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -50,7 +50,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -58,13 +58,13 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,13 +39,13 @@ class Coul : public BaseCharge { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_scale, double **host_cutsq, double *host_special_coul, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double qqrd2e); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Coul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_coul_debye.cpp b/lib/gpu/lal_coul_debye.cpp index 990dff6db9..9098aeacb1 100644 --- a/lib/gpu/lal_coul_debye.cpp +++ b/lib/gpu/lal_coul_debye.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
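/* The reinit() path documented above exists so that "fix adapt" can push new
   per-type coefficients to a running pair style: the host zeroes a padded
   write buffer and type_pack1 repacks the ntypes x ntypes table over the
   existing device vector. A rough host-side sketch of that repack under two
   assumptions -- LAMMPS-style 1-based type indexing and a row-major
   flattened layout (both illustrative, not a statement of type_pack1's
   exact contract): */
void repack_scale(int ntypes, int pad, double **host_scale, double *buf) {
  for (int i = 0; i < pad * pad; i++)
    buf[i] = 0.0;                                     // clear padding entries
  for (int i = 1; i <= ntypes; i++)
    for (int j = 1; j <= ntypes; j++)
      buf[(i - 1) * pad + (j - 1)] = host_scale[i][j];
}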
__________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -37,7 +37,7 @@ template CoulDebyeT::~CoulDebye() { clear(); } - + template int CoulDebyeT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -87,7 +87,7 @@ int CoulDebyeT::init(const int ntypes, double **host_scale, _qqrd2e=qqrd2e; _kappa=kappa; - + _allocated=true; this->_max_bytes=cutsq.row_bytes()+scale.row_bytes()+sp_cl.row_bytes(); return 0; @@ -98,10 +98,10 @@ void CoulDebyeT::reinit(const int ntypes, double **host_scale) { // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); } @@ -139,7 +139,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -156,9 +156,9 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu index 0e4c0ea2d0..464a1b18de 100644 --- a/lib/gpu/lal_coul_debye.cu +++ b/lib/gpu/lal_coul_debye.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndtrung@umich.edu // ***************************************************************************/ @@ -31,16 +31,16 @@ texture q_tex; __kernel void k_coul_debye(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const numtyp kappa, const int t_per_atom) { int tid, ii, offset; @@ -59,27 +59,27 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,14 +39,14 @@ class CoulDebye : public BaseCharge { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_scale, double **host_cutsq, double *host_special_coul, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, 
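/* k_coul_debye, whose preamble appears above, screens the bare Coulomb
   interaction with a Debye factor. In the standard coul/debye form this
   kernel family implements,

     E_ij = qqrd2e * scale_ij * q_i q_j * exp(-kappa*r) / r
     F(r) = E_ij * (kappa + 1/r)

   so the exponential is shared between energy and force, and kappa -> 0
   recovers the plain cutoff Coulomb result. The kappa argument threaded
   through the kernel signature and the _kappa member stored by init() above
   are this screening constant. */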
const double cell_size, const double gpu_split, FILE *screen, const double qqrd2e, const double kappa); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -69,7 +69,7 @@ class CoulDebye : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e,_kappa; diff --git a/lib/gpu/lal_coul_debye_ext.cpp b/lib/gpu/lal_coul_debye_ext.cpp index ced08b63e4..f205cd6adf 100644 --- a/lib/gpu/lal_coul_debye_ext.cpp +++ b/lib/gpu/lal_coul_debye_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -75,7 +75,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -93,16 +93,16 @@ void cdebye_gpu_reinit(const int ntypes, double **host_scale) { int world_me=CDEMF.device->world_me(); int gpu_rank=CDEMF.device->gpu_rank(); int procs_per_gpu=CDEMF.device->procs_per_gpu(); - + if (world_me==0) CDEMF.reinit(ntypes, host_scale); - + CDEMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -123,7 +123,7 @@ int** cdebye_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void cdebye_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp index ca81d32b2d..32c4342fbe 100644 --- a/lib/gpu/lal_coul_dsf.cpp +++ b/lib/gpu/lal_coul_dsf.cpp @@ -37,18 +37,18 @@ template CoulDSFT::~CoulDSF() { clear(); } - + template int CoulDSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, - const int max_nbors, const int maxspecial, +int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -123,7 +123,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,15 +134,15 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, 
&this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu index fc5bf5f138..82c44cd382 100644 --- a/lib/gpu/lal_coul_dsf.cu +++ b/lib/gpu/lal_coul_dsf.cu @@ -31,18 +31,18 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, +__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -60,19 +60,19 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -102,9 +102,9 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, numtyp erfcd = ucl_exp(-alpha*alpha*rsq); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r); erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd; - forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + + forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + rsq*f_shift-factor_coul); - + force = forcecoul * r2inv; f.x+=delx*force; @@ -131,17 +131,17 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, +__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -149,7 +149,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; if (tid<4) sp_lj[tid]=sp_lj_in[tid]; - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -157,25 
+157,25 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } - + for ( ; nbor { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, - const int max_nbors, const int maxspecial, + int init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha); /// Clear all host and device data @@ -62,7 +62,7 @@ class CoulDSF : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_coul_dsf_ext.cpp b/lib/gpu/lal_coul_dsf_ext.cpp index e65a090a16..174ec0d839 100644 --- a/lib/gpu/lal_coul_dsf_ext.cpp +++ b/lib/gpu/lal_coul_dsf_ext.cpp @@ -27,11 +27,11 @@ static CoulDSF CDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int cdsf_gpu_init(const int ntypes, const int inum, const int nall, +int cdsf_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { CDMF.clear(); gpu_mode=CDMF.device->gpu_mode(); @@ -55,8 +55,8 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_coulsq, host_special_coul, + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->world_barrier(); @@ -73,12 +73,12 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_coulsq, host_special_coul, + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -95,7 +95,7 @@ void cdsf_gpu_clear() { int** cdsf_gpu_compute_n(const int ago, const int 
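/* Both k_coul_dsf kernels evaluate erfc via the Abramowitz & Stegun 7.1.26
   rational approximation rather than a library call: with
   t = 1/(1 + p*alpha*r),

     erfc(alpha*r) ~= t*(a1 + t*(a2 + t*(a3 + t*(a4 + t*a5)))) * exp(-(alpha*r)^2),

   which is precisely the erfcc/erfcd pair computed from EWALD_P and A1..A5.
   The self-energy block guarded by "if (eflag>0)" adds the damped-shifted-
   force constant term e_self = -(e_shift/2 + alpha/sqrt(pi)) q_i^2 qqrd2e;
   it is divided by t_per_atom so the cooperating threads contribute it once
   in total, and the factor of two in "e_coul += 2*e_self" compensates for
   the halving later applied to energies accumulated over the full,
   double-counted neighbor list. */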
inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -105,7 +105,7 @@ int** cdsf_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void cdsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_coul_ext.cpp b/lib/gpu/lal_coul_ext.cpp index 291546d5b1..c124622cee 100644 --- a/lib/gpu/lal_coul_ext.cpp +++ b/lib/gpu/lal_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -75,7 +75,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -93,16 +93,16 @@ void coul_gpu_reinit(const int ntypes, double **host_scale) { int world_me=COULMF.device->world_me(); int gpu_rank=COULMF.device->gpu_rank(); int procs_per_gpu=COULMF.device->procs_per_gpu(); - + if (world_me==0) COULMF.reinit(ntypes, host_scale); - + COULMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -113,7 +113,7 @@ void coul_gpu_clear() { int** coul_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -123,7 +123,7 @@ int** coul_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void coul_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp index d6e16a9668..513e6d074d 100644 --- a/lib/gpu/lal_coul_long.cpp +++ b/lib/gpu/lal_coul_long.cpp @@ -36,7 +36,7 @@ template CoulLongT::~CoulLong() { clear(); } - + template int CoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -67,13 +67,13 @@ int CoulLongT::init(const int ntypes, double **host_scale, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; iucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); - + sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_coul[i]; @@ -129,7 +129,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -141,13 +141,13 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { 
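
[Annotation, not part of the patch.] A worked instance of the launch-size arithmetic in CoulLongT::loop above, with assumed values: for inum = 10000 atoms, block size BX = 128 and _threads_per_atom = 4, each block covers 128/4 = 32 atoms, so GX = ceil(10000/32) = 313 thread blocks, which is what k_pair_fast.set_size(GX,BX) then launches.
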
this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 12bbbee7d2..365195e00c 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -123,16 +123,16 @@ texture q_tex; #endif -__kernel void k_coul_long(const __global numtyp4 *restrict x_, +__kernel void k_coul_long(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, - const __global numtyp *restrict sp_cl_in, + const __global numtyp *restrict sp_cl_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { @@ -216,15 +216,15 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale_in, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h index 52ed60111b..6ed9c1a018 100644 --- a/lib/gpu/lal_coul_long.h +++ b/lib/gpu/lal_coul_long.h @@ -30,7 +30,7 @@ class CoulLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -43,10 +43,10 @@ class CoulLong : public BaseCharge { const double gpu_split, FILE *screen, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index 5552dc2437..2bc2af082e 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -95,16 +95,16 @@ void cl_gpu_reinit(const int ntypes, double **host_scale) { int world_me=CLMF.device->world_me(); int gpu_rank=CLMF.device->gpu_rank(); int 
procs_per_gpu=CLMF.device->procs_per_gpu(); - + if (world_me==0) CLMF.reinit(ntypes, host_scale); - + CLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index f326657e31..1943de64c6 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -45,8 +45,8 @@ DeviceT::~Device() { template int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double p_split, const int nthreads, + const int last_gpu, const int gpu_mode, + const double p_split, const int nthreads, const int t_per_atom, const double cell_size, char *ocl_vendor, const int block_pair) { _nthreads=nthreads; @@ -83,8 +83,8 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); std::string node_string=std::string(node_name); - - // Get the number of procs per node + + // Get the number of procs per node std::map name_map; std::map::iterator np; for (int i=0; i<_world_size; i++) { @@ -104,12 +104,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, split_id=split_num; split_num++; } - + // Set up a per node communicator and find rank within MPI_Comm node_comm; - MPI_Comm_split(_comm_world, split_id, 0, &node_comm); + MPI_Comm_split(_comm_world, split_id, 0, &node_comm); int node_rank; - MPI_Comm_rank(node_comm,&node_rank); + MPI_Comm_rank(node_comm,&node_rank); // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ @@ -120,7 +120,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, _time_device=true; if (_procs_per_gpu>1) _time_device=false; - + // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); @@ -128,12 +128,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) return -2; - + #ifndef CUDA_PROXY if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; #endif - + if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; @@ -144,7 +144,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, if (set_ocl_params(ocl_vendor)!=0) return -11; - + int flag=0; for (int i=0; i<_procs_per_gpu; i++) { if (_gpu_rank==i) @@ -162,7 +162,7 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { s_vendor=ocl_vendor; if (s_vendor=="none") s_vendor="generic"; - + if (s_vendor=="kepler") { _ocl_vendor_name="NVIDIA Kepler"; #if defined (__APPLE__) || defined(MACOSX) @@ -170,19 +170,19 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { #else _ocl_vendor_string="-DKEPLER_OCL"; #endif - } else if (s_vendor=="fermi") { + } else if (s_vendor=="fermi") { _ocl_vendor_name="NVIDIA Fermi"; _ocl_vendor_string="-DFERMI_OCL"; - } else if (s_vendor=="cypress") { + } else if (s_vendor=="cypress") { _ocl_vendor_name="AMD Cypress"; _ocl_vendor_string="-DCYPRESS_OCL"; - } else if (s_vendor=="phi") { + } else if (s_vendor=="phi") { _ocl_vendor_name="Intel Phi"; _ocl_vendor_string="-DPHI_OCL"; - } else if (s_vendor=="intel") { + } 
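
[Annotation, not part of the patch.] The node-detection logic in Device::init_device above (gather MPI processor names, assign each distinct name an integer color, split the world communicator by that color) is worth seeing in isolation. A self-contained C++ sketch under the same assumptions; node_rank_of and its details are illustrative, not the library's code:

  #include <mpi.h>
  #include <map>
  #include <string>

  // Group ranks by physical node: every rank on the same host gets the
  // same color, so node_comm contains exactly the ranks sharing that node.
  static int node_rank_of(MPI_Comm world, MPI_Comm *node_comm) {
    char name[MPI_MAX_PROCESSOR_NAME] = {0};
    int len, size;
    MPI_Get_processor_name(name, &len);
    MPI_Comm_size(world, &size);

    // Gather all names so every rank derives the same coloring.
    std::string all((size_t)size * MPI_MAX_PROCESSOR_NAME, '\0');
    MPI_Allgather(name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
                  &all[0], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, world);

    // First-seen order assigns each distinct node name a small integer.
    std::map<std::string, int> color_of;
    int next = 0;
    for (int i = 0; i < size; i++) {
      std::string n(all.c_str() + (size_t)i * MPI_MAX_PROCESSOR_NAME);
      if (color_of.insert(std::make_pair(n, next)).second) next++;
    }

    MPI_Comm_split(world, color_of[std::string(name)], 0, node_comm);
    int node_rank;
    MPI_Comm_rank(*node_comm, &node_rank);
    return node_rank;
  }
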
else if (s_vendor=="intel") { _ocl_vendor_name="Intel CPU"; _ocl_vendor_string="-DINTEL_OCL"; - } else if (s_vendor=="generic") { + } else if (s_vendor=="generic") { _ocl_vendor_name="GENERIC"; _ocl_vendor_string="-DGENERIC_OCL"; } else { @@ -220,10 +220,10 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { template int DeviceT::init(Answer &ans, const bool charge, - const bool rot, const int nlocal, + const bool rot, const int nlocal, const int host_nlocal, const int nall, Neighbor *nbor, const int maxspecial, - const int gpu_host, const int max_nbors, + const int gpu_host, const int max_nbors, const double cell_size, const bool pre_cut, const int threads_per_atom, const bool vel) { if (!_device_init) @@ -254,7 +254,7 @@ int DeviceT::init(Answer &ans, const bool charge, // Initialize atom and nbor data if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) return -3; - + _data_in_estimate++; if (charge) _data_in_estimate++; @@ -272,12 +272,12 @@ int DeviceT::init(Answer &ans, const bool charge, if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) return -3; } - + if (!ans.init(ef_nlocal,charge,rot,*gpu)) return -3; if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, - *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, + *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, _warp_size, _time_device, compile_string())) return -3; @@ -294,7 +294,7 @@ template int DeviceT::init(Answer &ans, const int nlocal, const int nall) { if (!_device_init) - return -1; + return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) return -5; @@ -361,7 +361,7 @@ void DeviceT::init_message(FILE *screen, const char *name, if (i==first_gpu) sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ ("; - else + else sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+ toa(gpu->clock_rate(i))+" GHZ ("; if (sizeof(PRECISION)==4) { @@ -381,7 +381,7 @@ void DeviceT::init_message(FILE *screen, const char *name, } template -void DeviceT::estimate_gpu_overhead(const int kernel_calls, +void DeviceT::estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, double &gpu_driver_overhead) { UCL_H_Vec *host_data_in=NULL, *host_data_out=NULL; @@ -394,38 +394,38 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, dev_data_in=new UCL_D_Vec[_data_in_estimate]; timers_in=new UCL_Timer[_data_in_estimate]; } - + if (_data_out_estimate>0) { host_data_out=new UCL_H_Vec[_data_out_estimate]; dev_data_out=new UCL_D_Vec[_data_out_estimate]; timers_out=new UCL_Timer[_data_out_estimate]; } - + if (kernel_calls>0) { kernel_data=new UCL_D_Vec[kernel_calls]; timers_kernel=new UCL_Timer[kernel_calls]; } - + for (int i=0; i<_data_in_estimate; i++) { host_data_in[i].alloc(1,*gpu); dev_data_in[i].alloc(1,*gpu); timers_in[i].init(*gpu); - } - + } + for (int i=0; i<_data_out_estimate; i++) { host_data_out[i].alloc(1,*gpu); dev_data_out[i].alloc(1,*gpu); timers_out[i].init(*gpu); - } - + } + for (int i=0; isync(); gpu_barrier(); @@ -439,7 +439,7 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, ucl_copy(dev_data_in[i],host_data_in[i],true); timers_in[i].stop(); } - + for (int i=0; i0) { delete [] host_data_out; delete [] dev_data_out; delete [] timers_out; } - + if (kernel_calls>0) { delete [] kernel_data; delete [] timers_kernel; } -} +} template -void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, - Neighbor &nbor, 
const double avg_split, +void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, + Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, + const double driver_overhead, const int threads_per_atom, FILE *screen) { double single[9], times[9]; int post_final=0; @@ -557,14 +557,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, } template -void DeviceT::output_kspace_times(UCL_Timer &time_in, +void DeviceT::output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer &time_map, UCL_Timer &time_rho, UCL_Timer &time_interp, - Answer &ans, - const double max_bytes, - const double cpu_time, + Answer &ans, + const double max_bytes, + const double cpu_time, const double idle_time, FILE *screen) { double single[8], times[8]; @@ -664,7 +664,7 @@ int DeviceT::compile_kernels() { k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); - + _ptx_arch=static_cast(gpu_lib_data[0])/100.0; #ifndef USE_OPENCL if (_ptx_arch>gpu->arch() || floor(_ptx_arch)arch())) @@ -705,7 +705,7 @@ int DeviceT::compile_kernels() { if (_threads_per_charge & (_threads_per_charge - 1)) _threads_per_charge=1; - return flag; + return flag; } template @@ -718,12 +718,12 @@ template class Device; Device global_device; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, + const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, + const int t_per_atom, const double cell_size, char *opencl_vendor, const int block_pair) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, - particle_split,nthreads,t_per_atom, + particle_split,nthreads,t_per_atom, cell_size,opencl_vendor,block_pair); } diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu index 28b58f7760..6761b23fbb 100644 --- a/lib/gpu/lal_device.cu +++ b/lib/gpu/lal_device.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,10 +17,10 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_zero(__global int *restrict mem, +__kernel void kernel_zero(__global int *restrict mem, int numel) { int ii=GLOBAL_ID_X; - + if (ii class PPPM; template class Device { public: Device(); - ~Device(); - + ~Device(); + /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes - * the device (>=first_gpu and <=last_gpu) that this proc will be using + * the device (>=first_gpu and <=last_gpu) that this proc will be using * Returns: * - 0 if successfull * - -2 if GPU not found * - -4 if GPU library not compiled for GPU * - -6 if GPU could not be initialized for use - * - -7 if accelerator sharing is not currently allowed on system + * - -7 if accelerator sharing is not currently allowed on system * - -11 if vendor_string has the wrong number of parameters **/ - int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, + int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, + const 
int t_per_atom, const double cell_size, char *vendor_string, const int block_pair); /// Initialize the device for Atom and Neighbor storage @@ -62,9 +62,9 @@ class Device { * 1 if gpu_nbor is true, and host needs a half nbor list, * 2 if gpu_nbor is true, and host needs a full nbor list * \param max_nbors Initial number of rows in the neighbor matrix - * \param cell_size cutoff+skin + * \param cell_size cutoff+skin * \param pre_cut True if cutoff test will be performed in separate kernel - * than the force kernel + * than the force kernel * \param threads_per_atom value to be used by the neighbor list only * * Returns: @@ -113,25 +113,25 @@ class Device { /// Returns true if double precision is supported on card inline bool double_precision() { return gpu->double_precision(); } - + /// Output a message with timing information - void output_times(UCL_Timer &time_pair, Answer &ans, - Neighbor &nbor, const double avg_split, + void output_times(UCL_Timer &time_pair, Answer &ans, + Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, + const double driver_overhead, const int threads_per_atom, FILE *screen); /// Output a message with timing information void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer & time_map, UCL_Timer & time_rho, - UCL_Timer &time_interp, - Answer &ans, + UCL_Timer &time_interp, + Answer &ans, const double max_bytes, const double cpu_time, const double cpu_idle_time, FILE *screen); /// Clear all memory on host and device associated with atom and nbor data void clear(); - + /// Clear all memory on host and device void clear_device(); @@ -149,24 +149,24 @@ class Device { while (ans_queue.empty()==false) { evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul); ans_queue.pop(); - } + } return evdw; } return 0.0; } /// Start timer on host - inline void start_host_timer() + inline void start_host_timer() { _cpu_full=MPI_Wtime(); _host_timer_started=true; } - + /// Stop timer on host - inline void stop_host_timer() { + inline void stop_host_timer() { if (_host_timer_started) { - _cpu_full=MPI_Wtime()-_cpu_full; + _cpu_full=MPI_Wtime()-_cpu_full; _host_timer_started=false; } } - + /// Return host time inline double host_time() { return _cpu_full; } @@ -239,8 +239,8 @@ class Device { /// Number of threads executing concurrently on same multiproc inline int warp_size() const { return _warp_size; } - // -------------------- SHARED DEVICE ROUTINES -------------------- - // Perform asynchronous zero of integer array + // -------------------- SHARED DEVICE ROUTINES -------------------- + // Perform asynchronous zero of integer array void zero(UCL_D_Vec &mem, const int numel) { int num_blocks=static_cast(ceil(static_cast(numel)/ _block_pair)); @@ -248,25 +248,25 @@ class Device { k_zero.run(&mem,&numel); } - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Geryon Device UCL_Device *gpu; enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH}; - // --------------------------- ATOM DATA -------------------------- + // --------------------------- ATOM DATA -------------------------- /// Atom Data Atom atom; // --------------------------- NBOR DATA ---------------------------- - + /// Neighbor Data NeighborShared _neighbor_shared; // ------------------------ LONG RANGE DATA ------------------------- - + // Long Range Data int _long_range_precompute; PPPM *pppm_single; @@ -282,7 +282,7 @@ class Device { 
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, boxlo,prd); } - + inline std::string compile_string() { return _ocl_compile_string; } private: @@ -290,7 +290,7 @@ class Device { int _init_count; bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; - int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, + int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, _replica_size; int _gpu_mode, _first_device, _last_device, _nthreads; double _particle_split; @@ -310,10 +310,10 @@ class Device { int compile_kernels(); int _data_in_estimate, _data_out_estimate; - + std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string; int set_ocl_params(char *); - + template inline std::string toa(const t& in) { std::ostringstream o; diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp index e96e15eaf9..c97b76c820 100644 --- a/lib/gpu/lal_dipole_lj.cpp +++ b/lib/gpu/lal_dipole_lj.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template DipoleLJT::~DipoleLJ() { clear(); } - + template int DipoleLJT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int DipoleLJT::bytes_per_atom(const int max_nbors) const { template int DipoleLJT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, @@ -138,7 +138,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,7 +151,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, + &ainum, &nbor_pitch, &this->atom->q, &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { @@ -160,8 +160,8 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->q, - &this->atom->quat, &cutsq, + &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu index b6483d1ef8..42c2bde144 100644 --- a/lib/gpu/lal_dipole_lj.cu +++ b/lib/gpu/lal_dipole_lj.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -132,17 +132,17 @@ texture mu_tex; #endif -__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global 
numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, const __global numtyp *restrict cutsq, @@ -171,14 +171,14 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; @@ -251,7 +251,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; - + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); @@ -263,12 +263,12 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; - + forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; @@ -276,7 +276,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -284,7 +284,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -306,12 +306,12 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; @@ -322,7 +322,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); - } + } } if (vflag>0) { virial[0] += delx*force.x; @@ -340,19 +340,19 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global 
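
[Annotation, not part of the patch.] A compact restatement of the dipole algebra accumulated above, since the kernel spreads it across many lines: with p_i = mu_i, p_j = mu_j and r the separation vector, the energy line e += r3inv*pdotp - 3.0*r5inv*pidotr*pjdotr is the point-dipole interaction U/qqrd2e = (p_i.p_j)/r^3 - 3(p_i.r)(p_j.r)/r^5. The pre1..pre3 prefactors are the gradient of that expression (the force), and pre2/pre4 scale the mu_i x r and mu_i x mu_j cross products that build the torque T_i = mu_i x E_j. This is standard electrostatics, restated here for review rather than taken from the patch.
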
int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -369,7 +369,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -381,16 +381,16 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; @@ -463,7 +463,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; - + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); @@ -474,13 +474,13 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, } // dipole-charge - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; - + forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; @@ -488,7 +488,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -496,7 +496,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -519,12 +519,12 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0; + acctyp e = (acctyp)0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; diff --git a/lib/gpu/lal_dipole_lj.h b/lib/gpu/lal_dipole_lj.h index b08b7a8669..615784ee8b 100644 --- a/lib/gpu/lal_dipole_lj.h +++ b/lib/gpu/lal_dipole_lj.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class 
DipoleLJ : public BaseDipole { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class DipoleLJ : public BaseDipole { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class DipoleLJ : public BaseDipole { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp index 55bbe0b804..2591d3c0ed 100644 --- a/lib/gpu/lal_dipole_lj_ext.cpp +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); DPLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,17 +98,17 @@ void dpl_gpu_clear() { int** dpl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double **host_mu, + bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); -} +} void dpl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_dipole_lj_sf.cpp b/lib/gpu/lal_dipole_lj_sf.cpp index 5a145dc762..a33f38084f 100644 --- a/lib/gpu/lal_dipole_lj_sf.cpp +++ b/lib/gpu/lal_dipole_lj_sf.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template DipoleLJSFT::~DipoleLJSF() { clear(); } - + template int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { template int DipoleLJSFT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double 
*host_special_lj, const int nlocal, const int nall, const int max_nbors, @@ -138,7 +138,7 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,17 +151,17 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, + &ainum, &nbor_pitch, &this->atom->q, &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, - &this->atom->quat, &cutsq, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu index 8469ed9ac9..5769c3a1a1 100644 --- a/lib/gpu/lal_dipole_lj_sf.cu +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -133,20 +133,20 @@ texture mu_tex; #endif -__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const __global numtyp4 *restrict mu_, - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -172,14 +172,14 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; - + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); aforcecoul.x = pre1*delx; aforcecoul.y = pre1*dely; aforcecoul.z = pre1*delz; - + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; presf = (numtyp)2.0*r2inv*pidotr*pjdotr; bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); bforcecoul.z = bfac * 
(pjdotr*mui.z+pidotr*muj.z-presf*delz); - + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); - + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; pre4 = -bfac*r3inv; numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); - + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); @@ -285,12 +285,12 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; rcutcoul2inv=ucl_recip(lj1[mtype].w); pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); - pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qj*r3inv * pqfac; @@ -301,7 +301,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -309,10 +309,10 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; rcutcoul2inv=ucl_recip(lj1[mtype].w); pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); - qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qtmp*r3inv * qpfac; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -334,13 +334,13 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); e = qtmp*qj*rinv*fac*fac; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr * pqfac; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr * qpfac; @@ -350,12 +350,12 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + - rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + - rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + (numtyp)4.0*lj3[mtype].y); energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*force.x; @@ -372,19 +372,19 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict 
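
[Annotation, not part of the patch.] The shifted-force damping factors in the two dipole/sf kernels above have a compact closed form that is easy to check against the code: with rcutcoul2inv = 1/rc^2, afac = 1 - (r/rc)^4 and bfac = 1 - 4(r/rc)^3 + 3(r/rc)^4. Both vanish at r = rc (bfac: 1 - 4 + 3 = 0), which is exactly what makes the coulombic forces and torques go continuously to zero at the cutoff instead of jumping.
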
sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; @@ -402,7 +402,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -414,16 +414,16 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; - + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); aforcecoul.x = pre1*delx; aforcecoul.y = pre1*dely; aforcecoul.z = pre1*delz; - + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; presf = (numtyp)2.0*r2inv*pidotr*pjdotr; bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); - + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); - + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; pre4 = -bfac*r3inv; @@ -529,11 +529,11 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); - pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qj*r3inv * pqfac; @@ -544,7 +544,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -552,10 +552,10 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); - qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qtmp*r3inv * qpfac; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -577,13 +577,13 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - 
acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); e = qtmp*qj*rinv*fac*fac; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr * pqfac; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr * qpfac; @@ -593,12 +593,12 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + - rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + - rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + (numtyp)4.0*lj3[mtype].y); energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*force.x; diff --git a/lib/gpu/lal_dipole_lj_sf.h b/lib/gpu/lal_dipole_lj_sf.h index 83cea4c2a4..20357385a2 100644 --- a/lib/gpu/lal_dipole_lj_sf.h +++ b/lib/gpu/lal_dipole_lj_sf.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class DipoleLJSF : public BaseDipole { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class DipoleLJSF : public BaseDipole { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class DipoleLJSF : public BaseDipole { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp index 8abf78c903..840afbe1c2 100644 --- a/lib/gpu/lal_dipole_lj_sf_ext.cpp +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); DPLSFMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,17 +98,17 @@ void dplsf_gpu_clear() { int** dplsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, 
const double cpu_time, - bool &success, double *host_q, double **host_mu, + bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); -} +} void dplsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_dpd.cpp b/lib/gpu/lal_dpd.cpp index 3736f89323..f05707ef1d 100644 --- a/lib/gpu/lal_dpd.cpp +++ b/lib/gpu/lal_dpd.cpp @@ -33,23 +33,23 @@ DPDT::DPD() : BaseDPD(), _allocated(false) { } template -DPDT::~DPD() { +DPDT::~DPD() { clear(); } - + template int DPDT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int DPDT::init(const int ntypes, - double **host_cutsq, double **host_a0, - double **host_gamma, double **host_sigma, +int DPDT::init(const int ntypes, + double **host_cutsq, double **host_a0, + double **host_gamma, double **host_sigma, double **host_cut, double *host_special_lj, - const bool tstat_only, - const int nlocal, const int nall, - const int max_nbors, const int maxspecial, + const bool tstat_only, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; @@ -90,7 +90,7 @@ int DPDT::init(const int ntypes, _tstat_only = 0; if (tstat_only) _tstat_only=1; - + _allocated=true; this->_max_bytes=coeff.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes(); return 0; @@ -130,7 +130,7 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +147,8 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { &this->_tstat_only, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, &this->_timestep, &this->_tstat_only, @@ -166,5 +166,5 @@ void DPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma, this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma, host_sigma,host_cut); } - + template class DPD; diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu index 209bc0233e..e32404ff5c 100644 --- a/lib/gpu/lal_dpd.cu +++ b/lib/gpu/lal_dpd.cu @@ -37,7 +37,7 @@ texture vel_tex; #define _USE_UNIFORM_SARU_LCG #endif -// References: +// References: // 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. // 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. // PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 @@ -49,9 +49,9 @@ texture vel_tex; #define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns uniformly distributed random numbers u in [-1.0;1.0] -// using the inherent LCG, then multiply u with sqrt(3) to "match" -// with a normal random distribution. 
+// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. // Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) // Curly brackets to make variables local to the scope. #ifdef _USE_UNIFORM_SARU_LCG @@ -80,8 +80,8 @@ texture vel_tex; #endif // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 -// then multiply u with sqrt(3) to "match" with a normal random distribution +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution // Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) #ifdef _USE_UNIFORM_SARU_TEA8 #define SQRT3 (numtyp)1.7320508075688772935274463 @@ -119,7 +119,7 @@ texture vel_tex; #endif // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], // and uses the polar method (Marsaglia's) to transform to a normal random value // This is used to compared with CPU DPD using RandMars::gaussian() #ifdef _USE_GAUSSIAN_SARU_LCG @@ -160,20 +160,20 @@ texture vel_tex; randnum = r2*fac; \ } #endif - -__kernel void k_dpd(const __global numtyp4 *restrict x_, + +__kernel void k_dpd(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp4 *restrict v_, const __global numtyp *restrict cutsq, - const numtyp dtinvsqrt, const int seed, + const numtyp dtinvsqrt, const int seed, const int timestep, const int tstat_only, const int t_per_atom) { int tid, ii, offset; @@ -185,13 +185,13 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii tag2) { tag1 = jtag; tag2 = itag; } - - numtyp randnum = (numtyp)0.0; + + numtyp randnum = (numtyp)0.0; saru(tag1, tag2, seed, timestep, randnum); // conservative force = a0 * wd, or 0 if tstat only @@ -244,7 +244,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, force -= coeff[mtype].y*wd*wd*dot*rinv; force += coeff[mtype].z*wd*randnum*dtinvsqrt; force*=factor_dpd*rinv; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; @@ -254,7 +254,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; - energy+=factor_dpd*e; + energy+=factor_dpd*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -272,23 +272,23 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dpd_fast(const __global numtyp4 *restrict x_, +__kernel void k_dpd_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const 
__global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp4 *restrict v_, const __global numtyp *restrict cutsq, - const numtyp dtinvsqrt, const int seed, + const numtyp dtinvsqrt, const int seed, const int timestep, const int tstat_only, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -296,7 +296,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, if (tid tag2) { tag1 = jtag; tag2 = itag; } - - numtyp randnum = (numtyp)0.0; + + numtyp randnum = (numtyp)0.0; saru(tag1, tag2, seed, timestep, randnum); // conservative force = a0 * wd, or 0 if tstat only @@ -364,7 +364,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, force -= coeff[mtype].y*wd*wd*dot*rinv; force += coeff[mtype].z*wd*randnum*dtinvsqrt; force*=factor_dpd*rinv; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; @@ -374,7 +374,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; - energy+=factor_dpd*e; + energy+=factor_dpd*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_dpd.h b/lib/gpu/lal_dpd.h index 449d7b1d8c..42ef854522 100644 --- a/lib/gpu/lal_dpd.h +++ b/lib/gpu/lal_dpd.h @@ -24,23 +24,23 @@ template class DPD : public BaseDPD { public: DPD(); - ~DPD(); + ~DPD(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_a0, + int init(const int ntypes, double **host_cutsq, double **host_a0, double **host_gamma, double **host_sigma, double **host_cut, double *host_special_lj, bool tstat_only, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, + const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -52,11 +52,11 @@ class DPD : public BaseDPD { /// Total host memory used by library for pair style double host_memory_usage() const; - + /// Update coeff if needed (tstat only) void update_coeff(int ntypes, double **host_a0, double **host_gamma, double **host_sigma, double **host_cut); - + // --------------------------- TYPE DATA -------------------------- /// coeff.x = a0, coeff.y = gamma, coeff.z = sigma, coeff.w = cut @@ -70,12 +70,12 @@ class DPD : public BaseDPD { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// Only used for thermostat int _tstat_only; - + private: bool _allocated; void loop(const bool _eflag, const bool _vflag); 
diff --git a/lib/gpu/lal_dpd_ext.cpp b/lib/gpu/lal_dpd_ext.cpp index 327074d087..792f638cd8 100644 --- a/lib/gpu/lal_dpd_ext.cpp +++ b/lib/gpu/lal_dpd_ext.cpp @@ -54,7 +54,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, int init_ok=0; if (world_me==0) - init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, + init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, host_cut, special_lj, tstat_only, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -72,12 +72,12 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, + init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, host_cut, special_lj, tstat_only, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); DPDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -95,25 +95,25 @@ void dpd_gpu_clear() { int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd) { return DPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, + vatom, host_start, ilist, jnum, cpu_time, success, host_v, dtinvsqrt, seed, timestep, boxlo, prd); -} +} void dpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, tagint *tag, - double **host_v, const double dtinvsqrt, - const int seed, const int timestep, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { DPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index c856a8e667..b83972f4db 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -9,10 +9,10 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ - + #if defined(USE_OPENCL) #include "eam_cl.h" #elif defined(USE_CUDART) @@ -33,7 +33,7 @@ using namespace LAMMPS_AL; extern Device device; template -EAMT::EAM() : BaseAtomic(), +EAMT::EAM() : BaseAtomic(), _compiled_energy(false), _allocated(false) { } @@ -41,46 +41,46 @@ template EAMT::~EAM() { clear(); } - + template int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, - double ***host_frho_spline, double rdr, double rdrho, + double 
***host_frho_spline, double rdr, double rdrho, double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,eam,"k_eam"); - + if (success!=0) return success; - + // allocate fp - + int ef_nall=nall; if (ef_nall==0) ef_nall=2000; _max_fp_size=static_cast(static_cast(ef_nall)*1.10); _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - + k_energy.set_function(*(this->pair_program),"k_energy"); k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); fp_tex.bind_float(_fp,1); _compiled_energy = true; - + // Initialize timers for selected GPU time_pair2.init(*(this->ucl_device)); time_pair2.zero(); - + time_fp1.init(*(this->ucl_device)); time_fp1.zero(); - + time_fp2.init(*(this->ucl_device)); time_fp2.zero(); @@ -93,7 +93,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, lj_types=max_shared_types; shared_types=true; } - + _ntypes=lj_types; _cutforcesq=host_cutforcesq; _rdr=rdr; @@ -104,26 +104,26 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, _nz2r=nz2r; _nfrho=nfrho; _nr=nr; - + UCL_H_Vec dview_type(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; iucl_device),UCL_READ_ONLY); - + for (int i=0; i dview_type2frho(lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -136,7 +136,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, // pack frho_spline UCL_H_Vec dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ix dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ix dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline1,dview_z2r_spline,false); z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex"); z2r_spline1_tex.bind_float(z2r_spline1,4); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline2,dview_z2r_spline,false); z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex"); @@ -241,7 +241,7 @@ void EAMT::clear() { if (!_allocated) return; _allocated=false; - + type2rhor_z2r.clear(); type2frho.clear(); rhor_spline1.clear(); @@ -250,13 +250,13 @@ void EAMT::clear() { frho_spline2.clear(); z2r_spline1.clear(); z2r_spline2.clear(); - + _fp.clear(); - + time_pair2.clear(); time_fp1.clear(); time_fp2.clear(); - + if (_compiled_energy) { k_energy_fast.clear(); k_energy.clear(); @@ -283,20 +283,20 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); - + if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); - + // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); - + // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } - + // ------------------- Resize FP Array for EAM -------------------- - + if (nall>_max_fp_size) { _max_fp_size=static_cast(static_cast(nall)*1.10); 
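// grow with ~10% headroom over nall so this reallocation of the per-atom
// fp array (synced host<->device between the two EAM passes, cf. the
// time_fp1/time_fp2 transfer timers above) stays rare as nall fluctuates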
_fp.resize(_max_fp_size); @@ -313,7 +313,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, this->zero_timers(); return; } - + int ago=this->hd_balancer.ago_first(f_ago); int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); this->ans->inum(inum); @@ -326,7 +326,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, if (!success) return; } - + this->atom->cast_x_data(host_x,host_type); this->atom->add_x_data(host_x,host_type); @@ -345,36 +345,36 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, // --------------------------------------------------------------------------- template int** EAMT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, + double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, int &inum, + const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); - + if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); - + // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); - + // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } // ------------------- Resize FP Array for EAM -------------------- - + if (nall>_max_fp_size) { _max_fp_size=static_cast(static_cast(nall)*1.10); _fp.resize(_max_fp_size); fp_tex.bind_float(_fp,1); - } - *fp_ptr=_fp.host.begin(); + } + *fp_ptr=_fp.host.begin(); // ----------------------------------------------------------------- - + if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -382,14 +382,14 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, this->zero_timers(); return NULL; } - + // load balance, returning the atom count on the device (inum) this->hd_balancer.balance(cpu_time); inum=this->hd_balancer.get_gpu_count(ago,inum_full); this->ans->inum(inum); host_start=inum; - - // Build neighbor list on GPU if necessary + + // Build neighbor list on GPU if necessary if (ago==0) { this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); @@ -403,14 +403,14 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, *jnum=this->nbor->host_acc.begin(); loop(eflag,vflag); - + // copy fp from device to host for comm _nlocal=inum_full; time_fp1.start(); _fp.update_host(inum_full,true); time_fp1.stop(); time_fp1.sync_stop(); - + return this->nbor->host_jlist.begin()-host_start; } @@ -420,20 +420,20 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, template void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { - if (this->ans->inum()==0) + if (this->ans->inum()==0) return; - + this->hd_balancer.start_timer(); time_fp2.start(); this->add_fp_data(); time_fp2.stop(); - + loop2(eflag,vflag); if (ilist == NULL) this->ans->copy_answers(eflag,vflag,eatom,vatom); else this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist); - + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -455,27 +455,27 @@ void EAMT::loop(const bool _eflag, const bool 
_vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); - + if (shared_types) { this->k_energy_fast.set_size(GX,BX); this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1,&frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &rhor_spline2, &frho_spline1,&frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv, &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1, &frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, + &rhor_spline2, &frho_spline1, &frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv,&eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); @@ -501,25 +501,25 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair2.start(); - + if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r, - &rhor_spline1, &z2r_spline1, &z2r_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &rhor_spline1, &z2r_spline1, &z2r_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, + this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, &z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index 054b3ca6db..13440b7d45 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ @@ -82,7 +82,7 @@ texture z2r_sp2_tex; engv[ii]=energy; \ } \ } - + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, elag, vflag, ans, engv) \ if (t_per_atom>1) { \ @@ -188,37 +188,37 @@ texture z2r_sp2_tex; #endif -__kernel void k_energy(const __global numtyp4 *restrict x_, +__kernel void k_energy(const __global numtyp4 *restrict x_, const __global int2 *restrict type2rhor_z2r, - const __global int *restrict type2frho, - const __global numtyp4 *restrict rhor_spline2, + const __global int *restrict type2frho, + const __global numtyp4 *restrict rhor_spline2, const __global numtyp4 *restrict frho_spline1, const __global numtyp4 *restrict frho_spline2, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global numtyp *restrict fp_, - __global acctyp *restrict engv, + __global numtyp *restrict fp_, + 
__global acctyp *restrict engv, const int eflag, const int inum, const int nbor_pitch, - const int ntypes, const numtyp cutforcesq, - const numtyp rdr, const numtyp rdrho, + const int ntypes, const numtyp cutforcesq, + const numtyp rdr, const numtyp rdrho, const numtyp rhomax, const int nrho, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + acctyp rho = (acctyp)0; acctyp energy = (acctyp)0; - + if (ii { public: EAM(); ~EAM(); - + /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,11 +41,11 @@ class EAM : public BaseAtomic { int init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, - double rdrho, double rhomax, int nrhor, int nrho, int nz2r, - int nfrho, int nr, const int nlocal, const int nall, + double rdrho, double rhomax, int nrhor, int nrho, int nz2r, + int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen); - + // Copy charges to device asynchronously inline void add_fp_data() { int nghost=this->atom->nall()-_nlocal; @@ -57,7 +57,7 @@ class EAM : public BaseAtomic { ucl_copy(dev_view,host_view,nghost,true); } } - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -67,7 +67,7 @@ class EAM : public BaseAtomic { /// Total host memory used by library for pair style double host_memory_usage() const; - + /// Pair loop with host neighboring void compute(const int f_ago, const int inum_full, const int, const int nall, double **host_x, int *host_type, int *ilist, int *numj, @@ -75,23 +75,23 @@ class EAM : public BaseAtomic { const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr); - + /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, int &inum, void **fp_ptr); /// Pair loop with host neighboring - void compute2(int *ilist, const bool eflag, const bool vflag, + void compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom); - + // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_energy, k_energy_fast; - + // --------------------------- TEXTURES ----------------------------- UCL_Texture fp_tex; UCL_Texture rhor_spline1_tex, rhor_spline2_tex; @@ -99,37 +99,37 @@ class EAM : public BaseAtomic { UCL_Texture z2r_spline1_tex, z2r_spline2_tex; // --------------------------- DEVICE DATA -------------------------- - + /// Device Timers UCL_Timer time_pair2, time_fp1, time_fp2; - + // --------------------------- TYPE DATA -------------------------- - + UCL_D_Vec type2rhor_z2r; UCL_D_Vec type2frho; - + UCL_D_Vec z2r_spline1, z2r_spline2; UCL_D_Vec 
frho_spline1, frho_spline2; UCL_D_Vec rhor_spline1, rhor_spline2; - + numtyp _cutforcesq,_rdr,_rdrho, _rhomax; - + int _nfrho,_nrhor,_nrho,_nz2r,_nr; - + /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - - /// Number of atom types + + /// Number of atom types int _ntypes; - + int _max_fp_size; - + /// True of energy kernels are compiled bool _compiled_energy; - + /// Per-atom arrays UCL_Vector _fp; - + protected: bool _allocated; int _nlocal; diff --git a/lib/gpu/lal_eam_alloy_ext.cpp b/lib/gpu/lal_eam_alloy_ext.cpp index 282f93afeb..9209ed5c26 100644 --- a/lib/gpu/lal_eam_alloy_ext.cpp +++ b/lib/gpu/lal_eam_alloy_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMALMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, +int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMALMF.clear(); gpu_mode=EAMALMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMALMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMALMF.device->init_message(screen,"eam/alloy",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMALMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_alloy_gpu_clear() { int ** eam_alloy_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, 
double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_alloy_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index d56f750e2f..1b5602f808 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_gpu_init(const int ntypes, double host_cutforcesq, +int eam_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMMF.clear(); gpu_mode=EAMMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); - if (message) + if 
(message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_gpu_clear() { int ** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_eam_fs_ext.cpp b/lib/gpu/lal_eam_fs_ext.cpp index 4992f3ab98..b9e25466aa 100644 --- a/lib/gpu/lal_eam_fs_ext.cpp +++ b/lib/gpu/lal_eam_fs_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMFSMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, +int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMFSMF.clear(); gpu_mode=EAMFSMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMFSMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMFSMF.device->init_message(screen,"eam/fs",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, 
host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_fs_gpu_clear() { int ** eam_fs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_fs_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index b33f087212..51f785b905 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -245,8 +245,8 @@ ucl_inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans) ucl_inline numtyp gpu_det3(const numtyp m[9]) { - numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - - m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + + numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - + m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; return ans; }; @@ -255,7 +255,7 @@ ucl_inline numtyp gpu_det3(const numtyp m[9]) diagonal matrix times a full matrix ------------------------------------------------------------------------- */ -ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], +ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], numtyp ans[9]) { ans[0] = shape.x*m[0]; @@ -421,7 +421,7 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, t = aug[9]/aug[5]; aug[10]-=t*aug[6]; aug[11]-=t*aug[7]; - + if (aug[10] == (numtyp)0.0) *error_flag=2; @@ -440,11 +440,11 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, quat = [w i j k] ------------------------------------------------------------------------- */ -ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, +ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, numtyp mat[9]) { numtyp4 q; fetch4(q,qi,quat_tex); - + numtyp w2 = q.x*q.x; numtyp i2 = q.y*q.y; numtyp j2 = q.z*q.z; @@ -561,7 +561,7 @@ ucl_inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9]) ------------------------------------------------------------------------- */ ucl_inline void gpu_times_column3(const numtyp m[9], const numtyp v[3], - numtyp ans[3]) + numtyp ans[3]) { ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2]; ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2]; diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 30d864aecc..cac77f5dd3 100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ 
b/lib/gpu/lal_ellipsoid_nbor.cu @@ -29,14 +29,14 @@ texture pos_tex; // -- Only unpack neighbors matching the specified inclusive range of forms // -- Only unpack neighbors within cutoff // --------------------------------------------------------------------------- -__kernel void kernel_nbor(const __global numtyp4 *restrict x_, - const __global numtyp2 *restrict cut_form, - const int ntypes, +__kernel void kernel_nbor(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict cut_form, + const int ntypes, __global int *dev_nbor, - const int nbor_pitch, const int start, const int inum, - const __global int *dev_ij, + const int nbor_pitch, const int start, const int inum, + const __global int *dev_ij, const int form_low, const int form_high) { - + // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X+start; @@ -47,11 +47,11 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, nbor+=nbor_pitch; int nbor_end=nbor+fast_mul(numj,nbor_pitch); int packed=ii+nbor_pitch+nbor_pitch; - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul(iw,ntypes); - int newj=0; + int newj=0; for ( ; nbor=form_low && form[mtype]<=form_high) { // Compute r12; numtyp rsq=jx.x-ix.x; diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp index 342ec4ecda..ef1559c5b6 100644 --- a/lib/gpu/lal_gauss.cpp +++ b/lib/gpu/lal_gauss.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,19 +33,19 @@ GaussT::Gauss() : BaseAtomic(), _allocated(false) { } template -GaussT::~Gauss() { +GaussT::~Gauss() { clear(); } - + template int GaussT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int GaussT::init(const int ntypes, - double **host_cutsq, double **host_a, - double **host_b, double **host_offset, +int GaussT::init(const int ntypes, + double **host_cutsq, double **host_a, + double **host_b, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -94,10 +94,10 @@ void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b, host_cutsq,host_offset); } @@ -135,7 +135,7 @@ void GaussT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu index 6accf36a06..98e71ea413 100644 --- a/lib/gpu/lal_gauss.cu +++ b/lib/gpu/lal_gauss.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,14 +24,14 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_gauss(const __global numtyp4 *restrict x_, +__kernel void k_gauss(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1, - 
const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -49,20 +49,20 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -108,18 +108,18 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, +__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1_in, - const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -127,7 +127,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, if (tid0) { - numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_gauss.h b/lib/gpu/lal_gauss.h index 1fd58adae5..d023310c6d 100644 --- a/lib/gpu/lal_gauss.h +++ b/lib/gpu/lal_gauss.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Gauss : public BaseAtomic { public: Gauss(); - ~Gauss(); + ~Gauss(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,16 +38,16 @@ class Gauss : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_a, double **host_b, double **host_offset, + double **host_a, double **host_b, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void 
reinit(const int ntypes, double **host_cutsq, double **host_a, double **host_b, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Gauss : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index 7c15a12591..834c03cf64 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -27,9 +27,9 @@ static Gauss GLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **host_b, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, +int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, + double **host_b, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GLMF.clear(); @@ -54,7 +54,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) - init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, + init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -77,7 +77,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, cell_size, gpu_split, screen); GLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,16 +96,16 @@ void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a, int world_me=GLMF.device->world_me(); int gpu_rank=GLMF.device->gpu_rank(); int procs_per_gpu=GLMF.device->procs_per_gpu(); - + if (world_me==0) GLMF.reinit(ntypes, cutsq, host_a, host_b, offset); - + GLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -124,7 +124,7 @@ int ** gauss_gpu_compute_n(const int ago, const int inum_full, return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void gauss_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_gayberne.cpp b/lib/gpu/lal_gayberne.cpp index 1d38810ae8..5abef659b6 100644 --- a/lib/gpu/lal_gayberne.cpp +++ b/lib/gpu/lal_gayberne.cpp @@ -37,21 +37,21 @@ GayBerneT::GayBerne() : BaseEllipsoid(), } template -GayBerneT::~GayBerne() { +GayBerneT::~GayBerne() { clear(); } - + template int GayBerneT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom(max_nbors); } template -int GayBerneT::init(const int ntypes, const double gamma, - const double upsilon, const double mu, - double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, +int 
GayBerneT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, @@ -100,11 +100,11 @@ int GayBerneT::init(const int ntypes, const double gamma, dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); - + // Allocate, cast and asynchronous memcpy of constant data // Copy data for bonded interactions gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY); - host_write[0]=static_cast(gamma); + host_write[0]=static_cast(gamma); host_write[1]=static_cast(upsilon); host_write[2]=static_cast(mu); host_write[3]=static_cast(host_special_lj[0]); @@ -117,7 +117,7 @@ int GayBerneT::init(const int ntypes, const double gamma, UCL_H_Vec d_view; d_view.view(host_lshape,lshape.numel(),*(this->ucl_device)); ucl_copy(lshape,d_view,false); - + // Copy shape, well, sigma, epsilon, and cutsq onto GPU // - cast if necessary shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -138,7 +138,7 @@ int GayBerneT::init(const int ntypes, const double gamma, } view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); ucl_copy(well,view4,false); - + _allocated=true; this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+ @@ -155,7 +155,7 @@ void GayBerneT::clear() { UCL_H_Vec err_flag(1,*(this->ucl_device)); ucl_copy(err_flag,dev_error,false); if (err_flag[0] == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; err_flag.clear(); _allocated=false; @@ -170,7 +170,7 @@ void GayBerneT::clear() { well.clear(); lshape.clear(); gamma_upsilon_mu.clear(); - + this->clear_base(); } @@ -196,7 +196,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -214,12 +214,12 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, - &this->sigma_epsilon, &this->_lj_types, - &this->lshape, &this->nbor->dev_nbor, &stride, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, &ainum, &this->ans->engv, - &this->dev_error, &eflag, &vflag, + &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); @@ -248,12 +248,12 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid2.start(); this->k_sphere_ellipsoid.set_size(GX,BX); this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, - &this->gamma_upsilon_mu, - &this->sigma_epsilon, &this->_lj_types, - &this->lshape, &this->nbor->dev_nbor, - &stride, &this->ans->force, - &this->ans->engv, &this->dev_error, + &this->shape, &this->well, + &this->gamma_upsilon_mu, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, + &stride, &this->ans->force, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, 
&this->_threads_per_atom); this->time_ellipsoid2.stop(); @@ -264,28 +264,28 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->time_ellipsoid.stop(); this->time_nbor2.start(); this->time_nbor2.stop(); this->time_ellipsoid2.start(); this->time_ellipsoid2.stop(); } - + // ------------ LJ --------------- this->time_lj.start(); if (this->_last_ellipseans->inum()) { if (this->_shared_types) { this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, - &this->gamma_upsilon_mu, &stride, + this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, - &this->ans->engv, &this->dev_error, &eflag, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); } else { this->k_lj.set_size(GX,BX); - this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, + this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, &this->_lj_types, &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, @@ -302,10 +302,10 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->gamma_upsilon_mu, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, &ainum, &this->ans->engv, &this->dev_error, diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index 1a7e69eeba..71f29c2742 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -17,62 +17,62 @@ #include "lal_ellipsoid_extra.h" #endif -ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, +ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, numtyp ans[9]) { numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; den = ucl_recip(den); - + ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; - + ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; - + ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; - + ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; - + ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- 
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; - + ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; - + ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; - + ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; - + ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ @@ -82,28 +82,28 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape __kernel void k_gayberne(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict gum, - const __global numtyp2 *restrict sig_eps, - const int ntypes, - const __global numtyp *restrict lshape, - const __global int *dev_nbor, - const int stride, - __global acctyp4 *restrict ans, - const int astride, - __global acctyp *restrict engv, - __global int *restrict err_flag, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict gum, + const __global numtyp2 *restrict sig_eps, + const int ntypes, + const __global numtyp *restrict lshape, + const __global int *dev_nbor, + const int stride, + __global acctyp4 *restrict ans, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, const int eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=gum[3]; - sp_lj[1]=gum[4]; - sp_lj[2]=gum[5]; - sp_lj[3]=gum[6]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; acctyp energy=(acctyp)0; acctyp4 f; @@ -124,7 +124,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp a1[9], b1[9], g1[9]; @@ -159,7 +159,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, numtyp a2[9]; gpu_quat_to_mat_trans(q,j,a2); - + numtyp u_r, dUr[3], tUr[3], eta, teta[3]; { // Compute U_r, dUr, eta, and teta // Compute g12 @@ -173,7 +173,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, } { // Compute U_r and dUr - + // Compute kappa numtyp kappa[3]; gpu_mldivide3(g12,r12,kappa,err_flag); @@ -189,7 +189,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, kappa[2]*=ir; // energy - + // compute u_r and dUr numtyp uslj_rsq; { @@ -203,7 +203,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, kappa[0]*=r; kappa[1]*=r; kappa[2]*=r; - + int mtype=fast_mul(ntypes,itype)+jtype; numtyp sigma = sig_eps[mtype].x; numtyp epsilon = sig_eps[mtype].y; @@ -235,14 +235,14 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, } } } - + // Compute eta { eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; numtyp det_g12 = gpu_det3(g12); eta = ucl_powr(eta/det_g12,gum[1]); } - + // Compute teta numtyp temp[9], tempv[3], tempv2[3]; compute_eta_torque(g12,a1,ishape,temp); @@ 
-255,7 +255,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, teta[0] = tempv2[0]; teta[1] = tempv2[1]; teta[2] = tempv2[2]; - + tempv[0] = temp1*temp[3]; tempv[1] = temp1*temp[4]; tempv[2] = temp1*temp[5]; @@ -272,7 +272,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, teta[1] += tempv2[1]; teta[2] += tempv2[2]; } - + numtyp chi, dchi[3], tchi[3]; { // Compute chi and dchi @@ -355,7 +355,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - + } // for nbor store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, vflag,ans,engv); diff --git a/lib/gpu/lal_gayberne.h b/lib/gpu/lal_gayberne.h index dacaf74282..8792f1f1db 100644 --- a/lib/gpu/lal_gayberne.h +++ b/lib/gpu/lal_gayberne.h @@ -25,14 +25,14 @@ template class GayBerne : public BaseEllipsoid { public: GayBerne(); - ~GayBerne(); + ~GayBerne(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device + * \param gpu_split fraction of particles handled by device * \return false if there is not sufficient memory or device init prob - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,18 +41,18 @@ class GayBerne : public BaseEllipsoid { * - -5 Double precision is not supported on card **/ int init(const int ntypes, const double gamma, const double upsilon, const double mu, double **host_shape, - double **host_well, double **host_cutsq, double **host_sigma, + double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, double *host_lshape, int **h_form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - const double *host_special_lj, const int nlocal, const int nall, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); - + /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; @@ -61,8 +61,8 @@ class GayBerne : public BaseEllipsoid { /// Device Error Flag - Set if a bad matrix inversion occurs UCL_D_Vec dev_error; - - // --------------------------- TYPE DATA -------------------------- + + // --------------------------- TYPE DATA -------------------------- /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form UCL_D_Vec lj1; @@ -72,12 +72,12 @@ class GayBerne : public BaseEllipsoid { UCL_D_Vec sigma_epsilon; // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ... 
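// (seven values packed into one read-only device buffer at init; the
// kernels copy entries 3..6 into their __local sp_lj array)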
UCL_D_Vec gamma_upsilon_mu; - + /// If atom type constants fit in shared memory, use fast kernels bool _shared_types; int _lj_types; - - // --------------------------- ATOM DATA -------------------------- + + // --------------------------- ATOM DATA -------------------------- /// Aspherical Const Data for Atoms UCL_D_Vec shape, well; diff --git a/lib/gpu/lal_gayberne_ext.cpp b/lib/gpu/lal_gayberne_ext.cpp index e674fb376b..451550e7ef 100644 --- a/lib/gpu/lal_gayberne_ext.cpp +++ b/lib/gpu/lal_gayberne_ext.cpp @@ -33,7 +33,7 @@ int gb_gpu_init(const int ntypes, const double gamma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); @@ -58,16 +58,16 @@ int gb_gpu_init(const int ntypes, const double gamma, int init_ok=0; if (world_me==0) - init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -105,8 +105,8 @@ void gb_gpu_clear() { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); @@ -117,8 +117,8 @@ int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { - return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, - tag, nspecial, special, eflag, vflag, eatom, vatom, + return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index 9b33b5f7f3..7925b72784 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -18,30 +18,30 @@ #endif __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict q, + const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict gum, + const __global numtyp4 *restrict well, + const __global numtyp *restrict gum, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global numtyp *restrict lshape, - const __global int *dev_nbor, + const __global int *dev_nbor, const int stride, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, - __global int 
*restrict err_flag, + __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=gum[3]; - sp_lj[1]=gum[4]; - sp_lj[2]=gum[5]; - sp_lj[3]=gum[6]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; acctyp energy=(acctyp)0; acctyp4 f; @@ -58,16 +58,16 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; - + numtyp oner=shape[itype].x; numtyp one_well=well[itype].x; - + numtyp factor_lj; for ( ; nbor0) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); - energy+=factor_lj*(e-lj3[ii].z); + energy+=factor_lj*(e-lj3[ii].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -332,33 +332,33 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict gum, - const int stride, +__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict gum, + const int stride, const __global int *dev_ij, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, - __global int *restrict err_flag, - const int eflag, const int vflag, - const int start, const int inum, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; - __local numtyp sp_lj[4]; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (tid<4) - sp_lj[tid]=gum[tid+3]; + sp_lj[tid]=gum[tid+3]; if (tid0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; @@ -367,9 +367,9 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp index 6c6e145319..2190e40516 100644 --- a/lib/gpu/lal_lj.cpp +++ b/lib/gpu/lal_lj.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,20 +33,20 @@ LJT::LJ() : BaseAtomic(), _allocated(false) { } template -LJT::~LJ() { +LJT::~LJ() { clear(); } - + template int LJT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int LJT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int LJT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double 
**host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -99,10 +99,10 @@ void LJT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq); this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, @@ -143,7 +143,7 @@ void LJT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -155,12 +155,12 @@ void LJT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu index 9569cb0fd7..5838ac95cf 100644 --- a/lib/gpu/lal_lj.cu +++ b/lib/gpu/lal_lj.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_lj(const __global numtyp4 *restrict x_, +__kernel void k_lj(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -44,19 +44,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -101,19 +101,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - 
__global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -124,7 +124,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -133,7 +133,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h index 63a3e8a6c9..01ce85c8ea 100644 --- a/lib/gpu/lal_lj.h +++ b/lib/gpu/lal_lj.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class LJ : public BaseAtomic { public: LJ(); - ~LJ(); + ~LJ(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,15 +40,15 @@ class LJ : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -71,7 +71,7 @@ class LJ : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp index 70e46b9fe1..b59495c41a 100644 --- a/lib/gpu/lal_lj96.cpp +++ b/lib/gpu/lal_lj96.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -36,7 +36,7 @@ template LJ96T::~LJ96() { clear(); } - + template int LJ96T::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,9 +44,9 @@ int 
LJ96T::bytes_per_atom(const int max_nbors) const { template int LJ96T::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -126,7 +126,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,7 +138,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu index b219b8bf0d..3bb7750022 100644 --- a/lib/gpu/lal_lj96.cu +++ b/lib/gpu/lal_lj96.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -26,13 +26,13 @@ texture pos_tex; __kernel void k_lj96(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -109,15 +109,15 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj96_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj96_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -132,30 +132,30 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp 
e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj96.h b/lib/gpu/lal_lj96.h index 7d51e287d3..3fdea5265e 100644 --- a/lib/gpu/lal_lj96.h +++ b/lib/gpu/lal_lj96.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class LJ96 : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class LJ96 : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -66,7 +66,7 @@ class LJ96 : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp index 14c32ef95e..c7ec9f4448 100644 --- a/lib/gpu/lal_lj96_ext.cpp +++ b/lib/gpu/lal_lj96_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, cell_size, gpu_split, screen); LJ96MF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ int** lj96_gpu_compute_n(const int ago, const int inum_full, return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void lj96_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp index ef59843c4a..0109446b95 100644 --- a/lib/gpu/lal_lj_class2_long.cpp +++ b/lib/gpu/lal_lj_class2_long.cpp @@ -38,7 +38,7 @@ template LJClass2LongT::~LJClass2Long() { clear(); } - + template int LJClass2LongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -46,8 +46,8 @@ int LJClass2LongT::bytes_per_atom(const int max_nbors) const { template int LJClass2LongT::init(const int ntypes, double **host_cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -136,7 +136,7 @@ void LJClass2LongT::loop(const bool 
_eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -145,11 +145,11 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &_cut_coulsq, &_qqrd2e, &_g_ewald, + &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu index e16de3a327..41ceca35d7 100644 --- a/lib/gpu/lal_lj_class2_long.cu +++ b/lib/gpu/lal_lj_class2_long.cu @@ -32,15 +32,15 @@ texture q_tex; __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj3, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -63,14 +63,14 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -147,20 +147,20 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, - const numtyp cut_coulsq, + const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, + const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -175,7 +175,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -183,16 +183,16 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class LJClass2Long : 
public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -68,7 +68,7 @@ class LJClass2Long : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp index 4bb3aad7ad..fa3e95f1f2 100644 --- a/lib/gpu/lal_lj_class2_long_ext.cpp +++ b/lib/gpu/lal_lj_class2_long_ext.cpp @@ -82,7 +82,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); C2CLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -99,7 +99,7 @@ void c2cl_gpu_clear() { int** c2cl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -109,7 +109,7 @@ int** c2cl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void c2cl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_coul.cpp b/lib/gpu/lal_lj_coul.cpp index 8030f3cfc2..00a5c108d9 100644 --- a/lib/gpu/lal_lj_coul.cpp +++ b/lib/gpu/lal_lj_coul.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template LJCoulT::~LJCoul() { clear(); } - + template int LJCoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int LJCoulT::bytes_per_atom(const int max_nbors) const { template int LJCoulT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -138,7 +138,7 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,14 +149,14 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, 
&this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu index 364203db22..5c7f0da46f 100644 --- a/lib/gpu/lal_lj_coul.cu +++ b/lib/gpu/lal_lj_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_lj_coul(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -63,14 +63,14 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -140,16 +140,16 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -166,7 +166,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -174,16 +174,16 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** 
\param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class LJCoul : public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class LJCoul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_lj_coul_debye.cpp b/lib/gpu/lal_lj_coul_debye.cpp index 135a4dfd9d..1b230096a4 100644 --- a/lib/gpu/lal_lj_coul_debye.cpp +++ b/lib/gpu/lal_lj_coul_debye.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template LJCoulDebyeT::~LJCoulDebye() { clear(); } - + template int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const { template int LJCoulDebyeT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -98,7 +98,7 @@ int LJCoulDebyeT::init(const int ntypes, _qqrd2e=qqrd2e; _kappa=kappa; - + _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ sp_lj.row_bytes(); @@ -140,7 +140,7 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,9 +157,9 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu index 308504c6c8..91b105b3da 100644 --- a/lib/gpu/lal_lj_coul_debye.cu +++ b/lib/gpu/lal_lj_coul_debye.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif 
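/* For reference while reading the Debye kernels that follow -- a minimal
   sketch assuming the standard coul/debye form from the LAMMPS pair_style
   docs, E(r) = qqrd2e*qi*qj*exp(-kappa*r)/r; the helper names below are
   illustrative and not part of this library: */
#include <math.h>
static double debye_energy(double qqrd2e, double qi, double qj,
                           double kappa, double r) {
  return qqrd2e*qi*qj*exp(-kappa*r)/r;      /* screened Coulomb energy */
}
static double debye_forcecoul(double qqrd2e, double qi, double qj,
                              double kappa, double r) {
  /* equals -dE/dr * r; the kernels multiply by r2inv afterwards for fpair */
  return qqrd2e*qi*qj*exp(-kappa*r)*(kappa + 1.0/r);
}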
-__kernel void k_lj_debye(const __global numtyp4 *restrict x_, +__kernel void k_lj_debye(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const numtyp kappa, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -147,15 +147,15 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const numtyp kappa, const int t_per_atom) { @@ -174,7 +174,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -182,16 +182,16 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class LJCoulDebye : public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class LJCoulDebye : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e,_kappa; diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp index 67f5a0075f..8ec189a764 100644 --- a/lib/gpu/lal_lj_coul_debye_ext.cpp +++ b/lib/gpu/lal_lj_coul_debye_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - 
begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,7 +33,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e, + double *host_special_coul, const double qqrd2e, const double kappa) { LJCDMF.clear(); gpu_mode=LJCDMF.device->gpu_mode(); @@ -82,7 +82,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e, kappa); LJCDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -99,7 +99,7 @@ void ljcd_gpu_clear() { int** ljcd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -109,7 +109,7 @@ int** ljcd_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljcd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp index 3b5cc09805..297ac7414e 100644 --- a/lib/gpu/lal_lj_coul_ext.cpp +++ b/lib/gpu/lal_lj_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); LJCMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,7 +98,7 @@ void ljc_gpu_clear() { int** ljc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -108,7 +108,7 @@ int** ljc_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljc_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp index 03f32a5fd0..71205af0ea 100644 --- a/lib/gpu/lal_lj_coul_long.cpp +++ b/lib/gpu/lal_lj_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template LJCoulLongT::~LJCoulLong() { clear(); } - + template int 
LJCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int LJCoulLongT::bytes_per_atom(const int max_nbors) const { template int LJCoulLongT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -109,10 +109,10 @@ void LJCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq, host_cut_ljsq); this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, @@ -153,7 +153,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -162,7 +162,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu index e0aa2e8a58..0e25bb2dbc 100644 --- a/lib/gpu/lal_lj_coul_long.cu +++ b/lib/gpu/lal_lj_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -29,17 +29,17 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { @@ -63,14 +63,14 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -145,14 +145,14 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global 
numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_, @@ -171,7 +171,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -179,16 +179,16 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class LJCoulLong : public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -73,7 +73,7 @@ class LJCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp index dc93365f22..95bd369336 100644 --- a/lib/gpu/lal_lj_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -82,7 +82,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); LJCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,15 +102,15 @@ void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJCLMF.device->world_me(); int gpu_rank=LJCLMF.device->gpu_rank(); int procs_per_gpu=LJCLMF.device->procs_per_gpu(); - + if (world_me==0) - LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_cut_ljsq); LJCLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } @@ -122,7 +122,7 @@ void ljcl_gpu_clear() { int** ljcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -132,7 +132,7 @@ int** ljcl_gpu_compute_n(const int 
ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_coul_msm.cpp b/lib/gpu/lal_lj_coul_msm.cpp index dd045b7970..7559a93b90 100644 --- a/lib/gpu/lal_lj_coul_msm.cpp +++ b/lib/gpu/lal_lj_coul_msm.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template LJCoulMSMT::~LJCoulMSM() { clear(); } - + template int LJCoulMSMT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int LJCoulMSMT::bytes_per_atom(const int max_nbors) const { template int LJCoulMSMT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double **host_gcons, double **host_dgcons, double **host_offset, double *host_special_lj, const int nlocal, @@ -93,11 +93,11 @@ int LJCoulMSMT::init(const int ntypes, ncols = 7; UCL_H_Vec dview_gcons(nrows*ncols,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(gcons,dview_gcons,false); gcons_tex.get_texture(*(this->pair_program),"gcons_tex"); @@ -107,11 +107,11 @@ int LJCoulMSMT::init(const int ntypes, ncols = 6; UCL_H_Vec dview_dgcons(nrows*ncols,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(dgcons,dview_dgcons,false); dgcons_tex.get_texture(*(this->pair_program),"dgcons_tex"); @@ -170,7 +170,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -179,7 +179,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu index 0c7c3cdace..3f73c6f47d 100644 --- a/lib/gpu/lal_lj_coul_msm.cu +++ b/lib/gpu/lal_lj_coul_msm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -80,19 +80,19 @@ ucl_inline numtyp dgamma(const numtyp rho, const int order, return ((numtyp)-1.0/rho/rho); } -__kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj3, const __global numtyp *restrict gcons, const __global numtyp *restrict dgcons, const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global 
numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const int order, const int t_per_atom) { @@ -116,20 +116,20 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -199,7 +199,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict gcons, @@ -227,7 +227,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -235,16 +235,16 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class LJCoulMSM : public BaseCharge { double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_gcons, double **host_dgcons, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const int order, const double qqrd2e); @@ -65,14 +65,14 @@ class LJCoulMSM : public BaseCharge { UCL_D_Vec lj3; /// Special LJ values [0-3] and Special Coul values [4-7] UCL_D_Vec sp_lj; - + UCL_D_Vec gcons, dgcons; UCL_Texture gcons_tex, dgcons_tex; - + /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e; diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp index ecf3254cf9..ceff1f7c66 100644 --- a/lib/gpu/lal_lj_coul_msm_ext.cpp +++ b/lib/gpu/lal_lj_coul_msm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -84,7 +84,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, order, qqrd2e); LJCMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -101,7 +101,7 @@ void ljcm_gpu_clear() { int** ljcm_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, 
const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -111,7 +111,7 @@ int** ljcm_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljcm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_cubic.cpp b/lib/gpu/lal_lj_cubic.cpp index 25f83166e1..933795a8f6 100644 --- a/lib/gpu/lal_lj_cubic.cpp +++ b/lib/gpu/lal_lj_cubic.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndactrung@gmail.com ***************************************************************************/ @@ -33,21 +33,21 @@ LJCubicT::LJCubic() : BaseAtomic(), _allocated(false) { } template -LJCubicT::~LJCubic() { +LJCubicT::~LJCubic() { clear(); } - + template int LJCubicT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int LJCubicT::init(const int ntypes, +int LJCubicT::init(const int ntypes, double **host_cutsq, double **host_cut_inner_sq, - double **host_cut_inner, double **host_sigma, - double **host_epsilon, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, + double **host_cut_inner, double **host_sigma, + double **host_epsilon, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -132,7 +132,7 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -144,12 +144,12 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj2, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj2, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj_cubic.cu b/lib/gpu/lal_lj_cubic.cu index 420689383f..a4b1992f33 100644 --- a/lib/gpu/lal_lj_cubic.cu +++ b/lib/gpu/lal_lj_cubic.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndactrung@gmail.com // ***************************************************************************/ @@ -31,16 +31,16 @@ texture pos_tex; #define _DPHIDS (numtyp)2.6899009 // gradient at s #define _A3 (numtyp)27.93357 // cubic coefficient -__kernel void k_lj_cubic(const __global numtyp4 *restrict x_, +__kernel void k_lj_cubic(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj2, - const __global numtyp2 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj, - 
const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const __global numtyp2 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -52,19 +52,19 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e; - if (rsq <= lj2[mtype].x) + if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); else e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -122,20 +122,20 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj2_in, - const __global numtyp2 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp2 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp2 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; @@ -148,7 +148,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -157,7 +157,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e; - if (rsq <= lj2[mtype].x) + if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); else e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj_cubic.h b/lib/gpu/lal_lj_cubic.h index 0fefc727eb..818fb3581b 100644 --- a/lib/gpu/lal_lj_cubic.h +++ b/lib/gpu/lal_lj_cubic.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndactrung@gmail.com ***************************************************************************/ @@ -24,13 +24,13 @@ template class LJCubic : public BaseAtomic { public: LJCubic(); - ~LJCubic(); + ~LJCubic(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,11 +39,11 @@ class LJCubic : public BaseAtomic { * 
- -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double **host_cut_inner_sq, double **host_cut_inner, double **host_sigma, double **host_epsilon, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class LJCubic : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_cubic_ext.cpp b/lib/gpu/lal_lj_cubic_ext.cpp index 518f706781..a45d02a8ca 100644 --- a/lib/gpu/lal_lj_cubic_ext.cpp +++ b/lib/gpu/lal_lj_cubic_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndactrung@gmail.com ***************************************************************************/ @@ -27,11 +27,11 @@ static LJCubic LJCubicLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, +int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, double **cut_inner, double **sigma, double **epsilon, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double *special_lj, - const int inum, const int nall, const int max_nbors, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double *special_lj, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJCubicLMF.clear(); @@ -81,7 +81,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, cell_size, gpu_split, screen); LJCubicLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -106,7 +106,7 @@ int ** ljcb_gpu_compute_n(const int ago, const int inum_full, return LJCubicLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void ljcb_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_dsf.cpp b/lib/gpu/lal_lj_dsf.cpp index 1b8fdeabb0..384cf75d1f 100644 --- a/lib/gpu/lal_lj_dsf.cpp +++ b/lib/gpu/lal_lj_dsf.cpp @@ -37,22 +37,22 @@ template LJDSFT::~LJDSF() { clear(); } - + template int LJDSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1, +int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double **host_offset, double *host_special_lj, + double 
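// Illustrative host-side sketch (not the library's API) of the rule behind
// the k_*_fast kernels in this patch: the fast variants cache the per-type
// coefficient tables in __local tiles of MAX_SHARED_TYPES*MAX_SHARED_TYPES
// entries, so they can only be selected when every atom type fits that tile.
// The init() routines also check the block size, omitted here for brevity.
#include <cstdio>
#include <initializer_list>

static const int MAX_SHARED_TYPES = 8;  // typical value from lal_preprocessor.h

static bool use_fast_kernel(int ntypes) {
  return ntypes <= MAX_SHARED_TYPES;    // simplified shared_types condition
}

int main() {
  for (int n : {4, 8, 16})
    std::printf("ntypes=%d -> %s\n", n,
                use_fast_kernel(n) ? "k_pair_fast" : "k_pair");
  return 0;
}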
**host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, - const double e_shift, const double f_shift, + const double e_shift, const double f_shift, const double alpha) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -138,7 +138,7 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,15 +149,15 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu index 5e0cd4aca9..323576fe77 100644 --- a/lib/gpu/lal_lj_dsf.cu +++ b/lib/gpu/lal_lj_dsf.cu @@ -31,20 +31,20 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_lj_dsf(const __global numtyp4 *restrict x_, +__kernel void k_lj_dsf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -66,20 +66,20 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -119,7 +119,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, numtyp erfcd = ucl_exp(-alpha*alpha*rsq); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r); erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd; - forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + + forcecoul = prefactor * (erfcc + 
(numtyp)2.0*alpha/MY_PIS*r*erfcd + rsq*f_shift-factor_coul); } else forcecoul = (numtyp)0.0; @@ -156,19 +156,19 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -183,7 +183,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -191,23 +191,23 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -246,7 +246,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, numtyp erfcd = ucl_exp(-alpha*alpha*rsq); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r); erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd; - forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + + forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + rsq*f_shift-factor_coul); } else forcecoul = (numtyp)0.0; diff --git a/lib/gpu/lal_lj_dsf.h b/lib/gpu/lal_lj_dsf.h index 5badf543c4..0195898ca4 100644 --- a/lib/gpu/lal_lj_dsf.h +++ b/lib/gpu/lal_lj_dsf.h @@ -30,7 +30,7 @@ class LJDSF : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class LJDSF : public BaseCharge { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha); /// Clear all host and device data @@ -69,7 +69,7 @@ class LJDSF : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp index 719a792d7f..f516da6622 100644 --- a/lib/gpu/lal_lj_dsf_ext.cpp +++ b/lib/gpu/lal_lj_dsf_ext.cpp @@ -34,7 +34,7 @@ int ljd_gpu_init(const int 
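// CPU reference sketch of the damped-shifted-force Coulomb term evaluated in
// k_lj_dsf, using std::erfc in place of the kernel's polynomial fit (the
// EWALD_P / A1..A5 constants). Names follow the kernel; factor_coul is
// assumed to be 1 - special_coul, matching the kernel's convention.
#include <cmath>

double dsf_coul_fpair(double qqrd2e, double qi, double qj, double rsq,
                      double alpha, double f_shift, double factor_coul) {
  const double MY_PIS = 1.77245385090551602729;   // sqrt(pi), as in the kernel
  double r = std::sqrt(rsq);
  double prefactor = qqrd2e * qi * qj / r;
  double erfcd = std::exp(-alpha * alpha * rsq);  // exp(-a^2 r^2) factor
  double erfcc = std::erfc(alpha * r);            // kernel approximates this
  double forcecoul = prefactor * (erfcc + 2.0 * alpha / MY_PIS * r * erfcd
                                  + rsq * f_shift - factor_coul);
  return forcecoul / rsq;  // the kernel applies r2inv when summing forces
}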
ntypes, double **cutsq, double **host_lj1, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, - const double e_shift, const double f_shift, + const double e_shift, const double f_shift, const double alpha) { LJDMF.clear(); gpu_mode=LJDMF.device->gpu_mode(); @@ -85,7 +85,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, f_shift, alpha); LJDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ void ljd_gpu_clear() { int** ljd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -112,7 +112,7 @@ int** ljd_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} +} void ljd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp index 03526bc095..c6d8a92e96 100644 --- a/lib/gpu/lal_lj_expand.cpp +++ b/lib/gpu/lal_lj_expand.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -36,7 +36,7 @@ template LJExpandT::~LJExpand() { clear(); } - + template int LJExpandT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -97,17 +97,17 @@ void LJExpandT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq, host_shift); - + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, host_offset); } @@ -146,7 +146,7 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -155,15 +155,15 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, 
&this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu index 6b79db2323..a951b4107a 100644 --- a/lib/gpu/lal_lj_expand.cu +++ b/lib/gpu/lal_lj_expand.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ibains@nvidia.com // ***************************************************************************/ @@ -26,15 +26,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_lj_expand(const __global numtyp4 *restrict x_, +__kernel void k_lj_expand(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -52,20 +52,20 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -113,15 +113,15 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -136,30 +136,30 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(numtyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h index 0d0ae0b2e6..a732a3a686 100644 --- a/lib/gpu/lal_lj_expand.h +++ b/lib/gpu/lal_lj_expand.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -30,7 +30,7 @@ class 
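// Illustrative CPU form of the lj/expand interaction handled by the kernels
// above (assumed standard LAMMPS definition): a plain 12-6 LJ evaluated at
// the shifted separation r - delta, where delta is the per-pair "shift"
// coefficient that reinit() packs alongside the lj1 table.
#include <cmath>

double lj_expand_energy(double r, double eps, double sigma, double delta) {
  double rs = r - delta;                // shifted separation r'
  double s6 = std::pow(sigma / rs, 6);  // (sigma/r')^6
  return 4.0 * eps * (s6 * s6 - s6);    // 4*eps*[(sigma/r')^12 - (sigma/r')^6]
}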
LJExpand : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,15 +40,15 @@ class LJExpand : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -71,7 +71,7 @@ class LJExpand : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 5303149d1f..d6ea4a9200 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -30,7 +30,7 @@ static LJExpand LJEMF; int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double **shift, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJEMF.clear(); @@ -78,7 +78,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, cell_size, gpu_split,screen); LJEMF.device->world_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,12 +98,12 @@ int lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJEMF.device->world_me(); int gpu_rank=LJEMF.device->gpu_rank(); int procs_per_gpu=LJEMF.device->procs_per_gpu(); - + if (world_me==0) LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, shift); LJEMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -97,11 +97,11 @@ void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJLMF.device->world_me(); int gpu_rank=LJLMF.device->gpu_rank(); int procs_per_gpu=LJLMF.device->procs_per_gpu(); - + if (world_me==0) LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset); LJLMF.device->world_barrier(); - + for (int i=0; i LJGROMACST::~LJGROMACS() { clear(); } - + template int LJGROMACST::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -47,11 +47,11 @@ template int LJGROMACST::init(const int ntypes, double **host_cutsq, double **host_lj1, 
double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -134,7 +134,7 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,16 +146,16 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu index f20d8634a5..93dc3d9456 100644 --- a/lib/gpu/lal_lj_gromacs.cu +++ b/lib/gpu/lal_lj_gromacs.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -35,8 +35,8 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -59,7 +59,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; @@ -83,7 +83,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, if (rsq lj1[mtype].w) { @@ -91,7 +91,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, t = r - lj3[mtype].z; numtyp fswitch = r*t*t*(ljsw[mtype].x + ljsw[mtype].y*t); force_lj += fswitch; - } + } force = factor_lj*force_lj * r2inv; @@ -149,22 +149,22 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, lj3[tid]=lj3_in[tid]; ljsw[tid]=ljsw_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii lj1[mtype].w) { @@ -196,7 +196,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 
*restrict x_, t = r - lj3[mtype].z; numtyp fswitch = r*t*t*(ljsw[mtype].x + ljsw[mtype].y*t); force_lj += fswitch; - } + } force = factor_lj*force_lj * r2inv; diff --git a/lib/gpu/lal_lj_gromacs.h b/lib/gpu/lal_lj_gromacs.h index dc949be4a9..1e0f72dafc 100644 --- a/lib/gpu/lal_lj_gromacs.h +++ b/lib/gpu/lal_lj_gromacs.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class LJGROMACS : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class LJGROMACS : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq); /// Clear all host and device data @@ -71,7 +71,7 @@ class LJGROMACS : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_gromacs_ext.cpp b/lib/gpu/lal_lj_gromacs_ext.cpp index b5eb0038b7..83f0ffc403 100644 --- a/lib/gpu/lal_lj_gromacs_ext.cpp +++ b/lib/gpu/lal_lj_gromacs_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,7 +33,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq) { LJGRMMF.clear(); gpu_mode=LJGRMMF.device->gpu_mode(); @@ -59,7 +59,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (world_me==0) LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, + gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); LJGRMMF.device->world_barrier(); @@ -78,11 +78,11 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (gpu_rank==i && world_me!=0) init_ok=LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, + gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); 
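// Standalone restatement of the force switching applied in k_lj_gromacs:
// beyond the inner cutoff a cubic polynomial in t = r - r_inner is added so
// the force decays smoothly to zero at the outer cutoff. ljsw1/ljsw2 stand
// for the precomputed coefficients packed into ljsw.x and ljsw.y.
#include <cmath>

double gromacs_switched_force(double rsq, double force_lj, double cut_inner_sq,
                              double r_inner, double ljsw1, double ljsw2) {
  if (rsq > cut_inner_sq) {             // same test as the kernel
    double r = std::sqrt(rsq);
    double t = r - r_inner;
    force_lj += r * t * t * (ljsw1 + ljsw2 * t);  // fswitch term
  }
  return force_lj;
}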
LJGRMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -107,7 +107,7 @@ int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, return LJGRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void ljgrm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_mie.cpp b/lib/gpu/lal_mie.cpp index 2ab7cb8d14..a87771e9bb 100644 --- a/lib/gpu/lal_mie.cpp +++ b/lib/gpu/lal_mie.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,17 +33,17 @@ MieT::Mie() : BaseAtomic(), _allocated(false) { } template -MieT::~Mie() { +MieT::~Mie() { clear(); } - + template int MieT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int MieT::init(const int ntypes, double **host_cutsq, +int MieT::init(const int ntypes, double **host_cutsq, double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, @@ -81,7 +81,7 @@ int MieT::init(const int ntypes, double **host_cutsq, mie3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,mie3,host_write,host_mie3,host_mie4, host_offset,host_cutsq); - + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -126,7 +126,7 @@ void MieT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu index 4d718897eb..33018566eb 100644 --- a/lib/gpu/lal_mie.cu +++ b/lib/gpu/lal_mie.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_mie(const __global numtyp4 *restrict x_, +__kernel void k_mie(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mie1, const __global numtyp4 *restrict mie3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_mie(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii class Mie : public BaseAtomic { public: Mie(); - ~Mie(); + ~Mie(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by 
device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class Mie : public BaseAtomic { double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -67,7 +67,7 @@ class Mie : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp index d7c4187a42..f43cde2650 100644 --- a/lib/gpu/lal_mie_ext.cpp +++ b/lib/gpu/lal_mie_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, cell_size, gpu_split, screen); MLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -106,7 +106,7 @@ int ** mie_gpu_compute_n(const int ago, const int inum_full, return MLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void mie_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp index ddf7d843e6..cbdf928863 100644 --- a/lib/gpu/lal_morse.cpp +++ b/lib/gpu/lal_morse.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,20 +33,20 @@ MorseT::Morse() : BaseAtomic(), _allocated(false) { } template -MorseT::~Morse() { +MorseT::~Morse() { clear(); } - + template int MorseT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int MorseT::init(const int ntypes, - double **host_cutsq, double **host_morse1, - double **host_r0, double **host_alpha, - double **host_d0, double **host_offset, +int MorseT::init(const int ntypes, + double **host_cutsq, double **host_morse1, + double **host_r0, double **host_alpha, + double **host_d0, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -125,7 +125,7 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -135,14 +135,14 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + 
&vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu index 2015c71cb2..0a14071d19 100644 --- a/lib/gpu/lal_morse.cu +++ b/lib/gpu/lal_morse.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -26,13 +26,13 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_morse(const __global numtyp4 *restrict x_, +__kernel void k_morse(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mor1, - const __global numtyp2 *restrict mor2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp2 *restrict mor2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, @@ -59,13 +59,13 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; for ( ; nbor0) { numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y; - energy+=e*factor_lj; + energy+=e*factor_lj; } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,15 +111,15 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_morse_fast(const __global numtyp4 *restrict x_, +__kernel void k_morse_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mor1_in, - const __global numtyp2 *restrict mor2_in, + const __global numtyp2 *restrict mor2_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -134,30 +134,30 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_, if (eflag>0) mor2[tid]=mor2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y; - energy+=e*factor_lj; + energy+=e*factor_lj; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_morse.h b/lib/gpu/lal_morse.h index e64852f315..ef80fb4235 100644 --- a/lib/gpu/lal_morse.h +++ b/lib/gpu/lal_morse.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : 
brownw@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Morse : public BaseAtomic { public: Morse(); - ~Morse(); + ~Morse(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class Morse : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_morse1, double **host_r0, double **host_alpha, double **host_d0, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -66,7 +66,7 @@ class Morse : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _types; private: diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp index 3994473fd3..d07a83cd34 100644 --- a/lib/gpu/lal_morse_ext.cpp +++ b/lib/gpu/lal_morse_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Morse MORMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int mor_gpu_init(const int ntypes, double **cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { MORMF.clear(); @@ -55,7 +55,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,7 +78,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, cell_size, gpu_split, screen); MORMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -103,7 +103,7 @@ int** mor_gpu_compute_n(const int ago, const int inum_full, return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void mor_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu index 384b88d9de..d005eb9f97 100644 --- a/lib/gpu/lal_neighbor_cpu.cu +++ b/lib/gpu/lal_neighbor_cpu.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : 
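// CPU sketch of the Morse evaluation mirrored by k_morse: with
// dexp = exp(-alpha*(r - r0)), the pair energy is d0*(dexp^2 - 2*dexp) and
// the force divided by r is 2*d0*alpha*(dexp^2 - dexp)/r. The kernels fold
// these constants into the packed mor1/mor2 coefficient vectors.
#include <cmath>

void morse_pair(double r, double d0, double alpha, double r0,
                double &energy, double &fpair) {
  double dexp = std::exp(-alpha * (r - r0));
  energy = d0 * (dexp * dexp - 2.0 * dexp);        // matches the eflag branch
  fpair = 2.0 * d0 * alpha * (dexp * dexp - dexp) / r;
}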
brownw@ornl.gov // ***************************************************************************/ @@ -17,7 +17,7 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_unpack(__global int *dev_nbor, +__kernel void kernel_unpack(__global int *dev_nbor, const __global int *dev_ij, const int inum, const int t_per_atom) { int tid=THREAD_ID_X; @@ -33,7 +33,7 @@ __kernel void kernel_unpack(__global int *dev_nbor, list+=offset; nbor+=fast_mul(ii,t_per_atom-1)+offset; int stride=fast_mul(t_per_atom,inum); - + for ( ; list pos_tex; texture pos_tex; #endif -__kernel void calc_cell_id(const numtyp4 *restrict pos, - unsigned *restrict cell_id, +__kernel void calc_cell_id(const numtyp4 *restrict pos, + unsigned *restrict cell_id, int *restrict particle_id, - numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, - numtyp i_cell_size, int ncellx, int ncelly, - int ncellz, int inum, int nall, + numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, + numtyp i_cell_size, int ncellx, int ncelly, + int ncellz, int inum, int nall, int cells_in_cutoff) { int i = threadIdx.x + blockIdx.x*blockDim.x; @@ -48,11 +48,11 @@ __kernel void calc_cell_id(const numtyp4 *restrict pos, p.x -= boxlo0; p.y -= boxlo1; p.z -= boxlo2; - + int ix = int(p.x*i_cell_size+cells_in_cutoff); int iy = int(p.y*i_cell_size+cells_in_cutoff); int iz = int(p.z*i_cell_size+cells_in_cutoff); - + int offset_lo, offset_hi; if (i 0 && idx < nall) { int id_l = cell_id[idx-1]; if (id != id_l) { - for (int i = id_l+1; i <= id; i++) + for (int i = id_l+1; i <= id; i++) cell_counts[i] = idx; } } @@ -114,8 +114,8 @@ __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, #endif #endif -__kernel void transpose(__global tagint *restrict out, - const __global tagint *restrict in, +__kernel void transpose(__global tagint *restrict out, + const __global tagint *restrict in, int columns_in, int rows_in) { __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; @@ -138,12 +138,12 @@ __kernel void transpose(__global tagint *restrict out, out[j*rows_in+i] = block[ti][tj]; } -__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, - const __global int *restrict cell_particle_id, - const __global int *restrict cell_counts, +__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, + const __global int *restrict cell_particle_id, + const __global int *restrict cell_counts, __global int *nbor_list, - __global int *host_nbor_list, - __global int *host_numj, + __global int *host_nbor_list, + __global int *host_numj, int neigh_bin_size, numtyp cell_size, int ncellx, int ncelly, int ncellz, int inum, int nt, int nall, int t_per_atom, @@ -154,7 +154,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, int iy = BLOCK_ID_Y % (ncelly - cells_in_cutoff*2) + cells_in_cutoff; int iz = BLOCK_ID_Y / (ncelly - cells_in_cutoff*2) + cells_in_cutoff; int bsx = BLOCK_SIZE_X; - + int icell = ix + iy*ncellx + iz*ncellx*ncelly; __local int cell_list_sh[BLOCK_NBOR_BUILD]; @@ -163,7 +163,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, int icell_begin = cell_counts[icell]; int icell_end = cell_counts[icell+1]; - int nborz0 = iz-cells_in_cutoff, nborz1 = iz+cells_in_cutoff, + int nborz0 = iz-cells_in_cutoff, nborz1 = iz+cells_in_cutoff, nbory0 = iy-cells_in_cutoff, nbory1 = iy+cells_in_cutoff, nborx0 = ix-cells_in_cutoff, nborx1 = ix+cells_in_cutoff; @@ -174,9 +174,9 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, int i = icell_begin + tid + ii*bsx; int pid_i = nall, pid_j, 
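// Host-side restatement of the binning rule in calc_cell_id above: positions
// are shifted to the subdomain origin, scaled by the inverse cell size, and
// offset by the ghost-cell border before being flattened to a 1D cell index.
// The kernel's clamping of out-of-range ghost atoms is omitted here.
int cell_id_of(double px, double py, double pz,
               double boxlo0, double boxlo1, double boxlo2,
               double i_cell_size, int ncellx, int ncelly,
               int cells_in_cutoff) {
  int ix = static_cast<int>((px - boxlo0) * i_cell_size + cells_in_cutoff);
  int iy = static_cast<int>((py - boxlo1) * i_cell_size + cells_in_cutoff);
  int iz = static_cast<int>((pz - boxlo2) * i_cell_size + cells_in_cutoff);
  return ix + iy * ncellx + iz * ncellx * ncelly;  // flattened cell id
}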
stride; numtyp4 atom_i, atom_j; - int cnt = 0; + int cnt = 0; __global int *neigh_counts, *neigh_list; - + if (i < icell_end) pid_i = cell_particle_id[i]; @@ -194,7 +194,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, neigh_counts=host_numj+pid_i-inum; neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size; } - + // loop through neighbors for (int nborz = nborz0; nborz <= nborz1; nborz++) { @@ -206,13 +206,13 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, int jcell_begin = cell_counts[jcell]; int jcell_end = cell_counts[jcell+1]; int num_atom_cell = jcell_end - jcell_begin; - + // load jcell to shared memory int num_iter = ucl_ceil((numtyp)num_atom_cell/bsx); for (int k = 0; k < num_iter; k++) { int end_idx = min(bsx, num_atom_cell-k*bsx); - + if (tid < end_idx) { pid_j = cell_particle_id[tid+k*bsx+jcell_begin]; cell_list_sh[tid] = pid_j; @@ -222,9 +222,9 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, pos_sh[tid].z = atom_j.z; } __syncthreads(); - + if (pid_i < nt) { - + for (int j = 0; j < end_idx; j++) { int pid_j = cell_list_sh[j]; // gather from shared memory diff.x = atom_i.x - pos_sh[j].x; @@ -253,11 +253,11 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, } // for (i) } -__kernel void kernel_special(__global int *dev_nbor, - __global int *host_nbor_list, - const __global int *host_numj, +__kernel void kernel_special(__global int *dev_nbor, + __global int *host_nbor_list, + const __global int *host_numj, const __global tagint *restrict tag, - const __global int *restrict nspecial, + const __global int *restrict nspecial, const __global tagint *restrict special, int inum, int nt, int max_nbors, int t_per_atom) { int tid=THREAD_ID_X; @@ -268,7 +268,7 @@ __kernel void kernel_special(__global int *dev_nbor, if (iigpu_bytes(); - + _order=order; _order_m_1=order-1; _order2=_order_m_1*_order; @@ -130,7 +130,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, view.view(rho_coeff[0]+n2lo,numel,*ucl_device); ucl_copy(d_rho_coeff,view,true); _max_bytes+=d_rho_coeff.row_bytes(); - + // Allocate storage for grid _npts_x=nxhi_out-nxlo_out+1; _npts_y=nyhi_out-nylo_out+1; @@ -165,10 +165,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, flag=-3; return 0; } - + error_flag.device.zero(); _max_bytes+=1; - + _cpu_idle_time=0.0; return brick.host.begin(); @@ -180,13 +180,13 @@ void PPPMT::clear(const double cpu_time) { return; _allocated=false; _precompute_done=false; - + brick.clear(); vd_brick.clear(); d_brick_counts.clear(); error_flag.clear(); d_brick_atoms.clear(); - + acc_timers(); device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp, *ans,_max_bytes+_max_an_bytes,cpu_time, @@ -216,7 +216,7 @@ void PPPMT::clear(const double cpu_time) { template void PPPMT::_precompute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, + double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { acc_timers(); @@ -224,7 +224,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, zero_timers(); return; } - + ans->inum(nlocal); if (ago==0) { @@ -250,7 +250,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); int ainum=this->ans->inum(); - + // Boxlo adjusted to be upper left brick and shift for even spline order 
double shift=0.0; if (_order % 2) @@ -258,7 +258,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv; _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv; _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv; - + _delxinv=delxinv; _delyinv=delyinv; _delzinv=delzinv; @@ -268,7 +268,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, device->zero(d_brick_counts,d_brick_counts.numel()); k_particle_map.set_size(GX,BX); k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum, - &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, + &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, &error_flag); @@ -299,7 +299,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, template int PPPMT::spread(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, + double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { if (_precompute_done==false) { @@ -309,10 +309,10 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, } device->stop_host_timer(); - + if (!success || nlocal==0) return 0; - + double t=MPI_Wtime(); time_out.sync_stop(); _cpu_idle_time+=MPI_Wtime()-t; @@ -325,10 +325,10 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, error_flag.device.zero(); d_brick_atoms.resize(_atom_stride*_max_brick_atoms); _max_bytes+=d_brick_atoms.row_bytes(); - return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, + return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } - + return error_flag[0]; } @@ -340,18 +340,18 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) { time_in.start(); vd_brick.update_device(true); time_in.stop(); - + time_interp.start(); // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); int ainum=this->ans->inum(); - + k_interp.set_size(GX,BX); k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff, &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv, - &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, + &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, &ans->force); time_interp.stop(); @@ -381,7 +381,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) { #endif pppm_program=new UCL_Program(dev); - + #ifdef USE_OPENCL pppm_program->load_string(pppm,flags.c_str()); #else diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index 99fe655dfd..11703d6d2a 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -48,17 +48,17 @@ texture q_tex; // Number of pencils per block for charge spread #define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE) -__kernel void particle_map(const __global numtyp4 *restrict x_, +__kernel void particle_map(const __global numtyp4 *restrict x_, const __global numtyp *restrict q_, - const grdtyp delvolinv, const int nlocal, - __global int *restrict counts, - __global grdtyp4 *restrict ans, + const grdtyp 
delvolinv, const int nlocal, + __global int *restrict counts, + __global grdtyp4 *restrict ans, const grdtyp b_lo_x, const grdtyp b_lo_y, const grdtyp b_lo_z, const grdtyp delxinv, const grdtyp delyinv, const grdtyp delzinv, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int atom_stride, - const int max_atoms, + const int max_atoms, __global int *restrict error) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; @@ -76,7 +76,7 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, grdtyp4 delta; fetch(delta.w,ii,q_tex); delta.w*=delvolinv; - + if (delta.w!=(grdtyp)0.0) { delta.x=(p.x-b_lo_x)*delxinv; nx=delta.x; @@ -85,14 +85,14 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, delta.z=(p.z-b_lo_z)*delzinv; nz=delta.z; - if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || + if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) *error=1; else { delta.x=nx+(grdtyp)0.5-delta.x; delta.y=ny+(grdtyp)0.5-delta.y; delta.z=nz+(grdtyp)0.5-delta.z; - + int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx; int old=atom_add(counts+i, 1); if (old>=max_atoms) { @@ -107,9 +107,9 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, /* --------------------------- */ -__kernel void make_rho(const __global int *restrict counts, +__kernel void make_rho(const __global int *restrict counts, const __global grdtyp4 *restrict atoms, - __global grdtyp *restrict brick, + __global grdtyp *restrict brick, const __global grdtyp *restrict _rho_coeff, const int atom_stride, const int npts_x, const int npts_y, const int npts_z, const int nlocal_x, @@ -118,15 +118,15 @@ __kernel void make_rho(const __global int *restrict counts, __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE]; __local grdtyp front[BLOCK_PENCILS][PENCIL_SIZE+PPPM_MAX_SPLINE]; __local grdtyp ans[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; - + int tid=THREAD_ID_X; if (tid -1; k-=order) { @@ -184,14 +184,14 @@ __kernel void make_rho(const __global int *restrict counts, z_pos+=z_stride; } } - + __syncthreads(); if (fid *device; @@ -142,21 +142,21 @@ class PPPM { UCL_Vector brick; UCL_Vector vd_brick; - + // Count of number of atoms assigned to each grid point UCL_D_Vec d_brick_counts; // Atoms assigned to each grid point UCL_D_Vec d_brick_atoms; - + // Error checking for out of bounds atoms UCL_Vector error_flag; - + // Number of grid points in brick (including ghost) int _npts_x, _npts_y, _npts_z, _npts_yx; - + // Number of local grid points in brick int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride; - + // -------------------------- SPLINE DATA ------------------------- UCL_D_Vec d_rho_coeff; int _order, _nlower, _nupper, _order_m_1, _order2; @@ -180,12 +180,12 @@ class PPPM { int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms; double _max_bytes, _max_an_bytes; double _cpu_idle_time; - - grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; + + grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; double _slab_volfactor; int _nx_pppm, _ny_pppm, _nz_pppm; - + void compile_kernels(UCL_Device &dev); void _precompute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp index 6e5a82af5b..7e07d6c87b 100644 --- a/lib/gpu/lal_pppm_ext.cpp +++ b/lib/gpu/lal_pppm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
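// Sketch of the bounds bookkeeping in particle_map above, reduced to one
// dimension: each charge is mapped to its grid cell in the local brick, and
// atoms falling outside the brick raise the error flag that the host checks
// (spread() also retries with a larger d_brick_atoms when a cell overflows).
int map_to_brick_x(double px, double b_lo_x, double delxinv,
                   int nlocal_x, int *error) {
  double dx = (px - b_lo_x) * delxinv;         // fractional grid coordinate
  int nx = static_cast<int>(dx);
  if (dx < 0.0 || nx >= nlocal_x) *error = 1;  // outside the local brick
  return nx;
}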
__________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ static PPPM PPPMD; // --------------------------------------------------------------------------- template grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, - FILE *screen, const int order, const int nxlo_out, + FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, grdtyp **rho_coeff, @@ -82,7 +82,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, split,success); pppm.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -91,7 +91,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, } float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, - const int order, const int nxlo_out, + const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, float **rho_coeff, @@ -102,7 +102,7 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); if (split==false && respa==false) - PPPMF.device->set_single_precompute(&PPPMF); + PPPMF.device->set_single_precompute(&PPPMF); return b; } @@ -133,20 +133,20 @@ void pppm_gpu_forces_f(double **f) { } double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, - const int order, const int nxlo_out, + const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, double **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, - const int nz_pppm, const bool split, + const int nz_pppm, const bool split, const bool respa, int &success) { double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, - split,success); + split,success); if (split==false && respa==false) - PPPMD.device->set_double_precompute(&PPPMD); + PPPMD.device->set_double_precompute(&PPPMD); return b; } diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index 810afb4c88..d5b1b9b6c0 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -49,17 +49,17 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) { out << v.x << " " << v.y; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) { out << v.x << " " << v.y << " " << v.z; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) { out << v.x << " " << v.y; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { out << v.x << " " << v.y << " " << v.z; return out; diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 9dbb3c5944..69a8e61bd4 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -9,16 +9,16 @@ 
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ //************************************************************************* // Preprocessor Definitions -// +// // Note: It is assumed that constants with the same names are defined with // the same values in all files. -// +// // ARCH // Definition: Architecture number for accelerator // MEM_THREADS @@ -35,22 +35,22 @@ // Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE // PPPM_MAX_SPLINE // Definition: Maximum order for splines in PPPM -// PPPM_BLOCK_1D +// PPPM_BLOCK_1D // Definition: Thread block size for PPPM kernels // Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE -// PPPM_BLOCK_1D%32==0 +// PPPM_BLOCK_1D%32==0 // BLOCK_PAIR // Definition: Default thread block size for pair styles // Restrictions: // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR -// BLOCK_CELL_2D +// BLOCK_CELL_2D // Definition: Default block size in each dimension for cell list builds // and matrix transpose -// BLOCK_CELL_ID +// BLOCK_CELL_ID // Definition: Default block size for binning atoms in cell list builds -// BLOCK_NBOR_BUILD +// BLOCK_NBOR_BUILD // Definition: Default block size for neighbor list builds // BLOCK_BIO_PAIR // Definition: Default thread block size for "bio" pair styles @@ -78,10 +78,10 @@ #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ -#define __global +#define __global #define restrict __restrict__ #define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ +#define ucl_inline static __inline__ __device__ #ifdef __CUDA_ARCH__ #define ARCH __CUDA_ARCH__ diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index cbf50fab7d..55034aaf03 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -37,18 +37,18 @@ RESquaredT::RESquared() : BaseEllipsoid(), } template -RESquaredT::~RESquared() { +RESquaredT::~RESquared() { clear(); } - + template int RESquaredT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom(max_nbors); } template -int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, +int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, double **host_epsilon, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, @@ -97,7 +97,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); - + // Allocate, cast and asynchronous memcpy of constant data // Copy data for bonded interactions special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -127,7 +127,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, } view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); ucl_copy(well,view4,false); - + _allocated=true; this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+ @@ -144,7 +144,7 @@ void RESquaredT::clear() { UCL_H_Vec
err_flag(1,*(this->ucl_device)); ucl_copy(err_flag,dev_error,false); if (err_flag[0] == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; err_flag.clear(); _allocated=false; @@ -158,7 +158,7 @@ void RESquaredT::clear() { shape.clear(); well.clear(); special_lj.clear(); - + this->clear_base(); } @@ -184,7 +184,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -204,10 +204,10 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->k_ellipsoid.set_size(GX,BX); this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, - &this->nbor->dev_nbor, &stride, + &this->sigma_epsilon, &this->_lj_types, + &this->nbor->dev_nbor, &stride, &this->ans->force,&ainum, &this->ans->engv, - &this->dev_error, &eflag, &vflag, + &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); @@ -219,12 +219,12 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid2.start(); this->k_ellipsoid_sphere.set_size(GX,BX); - this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, + this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, + &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, &this->ans->force,&ainum, - &this->ans->engv, &this->dev_error, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid2.stop(); @@ -251,12 +251,12 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid3.start(); this->k_sphere_ellipsoid.set_size(GX,BX); this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->special_lj, + &this->shape, &this->well, &this->special_lj, &this->sigma_epsilon, &this->_lj_types, - &this->nbor->dev_nbor, &stride, + &this->nbor->dev_nbor, &stride, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, &vflag, - &this->_last_ellipse, &ainum, + &this->_last_ellipse, &ainum, &this->_threads_per_atom); this->time_ellipsoid3.stop(); } else { @@ -266,13 +266,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.zero(); - this->time_ellipsoid.zero(); + this->time_ellipsoid.zero(); this->time_nbor2.zero(); this->time_ellipsoid2.zero(); this->time_nbor3.zero(); this->time_ellipsoid3.zero(); } - + // ------------ LJ --------------- this->time_lj.start(); if (this->_last_ellipseans->inum()) { @@ -287,7 +287,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { } else { this->k_lj.set_size(GX,BX); this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, - &this->_lj_types, &this->special_lj, &stride, + &this->_lj_types, &this->special_lj, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); @@ -302,13 +302,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); 
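// [editor's note] The set_size()/run() pairs below follow the launch-geometry
// convention used throughout lib/gpu: t_per_atom threads cooperate on each
// atom, so one block of BX threads covers BX/t_per_atom atoms. A minimal
// sketch of the arithmetic, mirroring the GX computation spelled out in the
// loop() methods of lal_soft.cpp and lal_table.cpp later in this patch
// (BX, taken here from this->block_size(), and _threads_per_atom are the
// library's own names; treat this as a sketch, not part of the patched code):
//
//   const int BX = this->block_size();
//   int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) /
//                                  (BX/this->_threads_per_atom)));
//   this->k_ellipsoid.set_size(GX,BX);  // GX thread blocks of BX threads each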
this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->special_lj, + &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, &this->ans->force, - &ainum, &this->ans->engv, &this->dev_error, + &ainum, &this->ans->engv, &this->dev_error, &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index 3a65ce14ce..e238734074 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -34,31 +34,31 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) __kernel void k_resquared(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict splj, - const __global numtyp2 *restrict sig_eps, - const int ntypes, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict splj, + const __global numtyp2 *restrict sig_eps, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - const int astride, + const int astride, __global acctyp *restrict engv, - __global int *restrict err_flag, + __global int *restrict err_flag, const int eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); acctyp energy=(acctyp)0; acctyp4 f; @@ -79,7 +79,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; @@ -91,14 +91,14 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa numtyp4 ishape; - + ishape=shape[itype]; numtyp4 ishape2; ishape2.x=ishape.x*ishape.x; ishape2.y=ishape.y*ishape.y; ishape2.z=ishape.z*ishape.z; numtyp ilshape = ishape.x*ishape.y*ishape.z; - + { numtyp aTs[9]; // A1'*S1^2 gpu_quat_to_mat_trans(q,i,a1); @@ -148,7 +148,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, numtyp a2[9]; // Rotation matrix (lab->body) numtyp gamma2[9]; // A'*S^2*A numtyp4 jshape; - + jshape=shape[jtype]; numtyp4 jshape2; jshape2.x=jshape.x*jshape.x; @@ -189,7 +189,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2; H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2; dH=gpu_det3(H12); - + numtyp sigma1p2, sigma2p2, lambda, nu; sigma1p2 = sigma1*sigma1; sigma2p2 = sigma2*sigma2; @@ -299,7 +299,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+(numtyp)3.0*sec); dspu = ucl_recip(h12)-hsec+stemp; pbsu = (numtyp)3.0*sigma*hsec; - + numtyp dspr, pbsr; stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ @@ -310,7 +310,7 @@ __kernel void k_resquared(const __global 
numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); dspr = (numtyp)7.0/h12-hsec+stemp; pbsr = b_alpha*sigma*hsec; - + numtyp dH12[9]; numtyp dUa, dUr, deta, dchi, ddH, dh12; numtyp dsigma1, dsigma2; diff --git a/lib/gpu/lal_re_squared.h b/lib/gpu/lal_re_squared.h index c7441ed83e..8dc137d829 100644 --- a/lib/gpu/lal_re_squared.h +++ b/lib/gpu/lal_re_squared.h @@ -25,14 +25,14 @@ template class RESquared : public BaseEllipsoid { public: RESquared(); - ~RESquared(); + ~RESquared(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device + * \param gpu_split fraction of particles handled by device * \return false if there is not sufficient memory or device init prob - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,7 +41,7 @@ class RESquared : public BaseEllipsoid { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_shape, double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, - int **h_form, double **host_lj1, double **host_lj2, + int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -50,7 +50,7 @@ class RESquared : public BaseEllipsoid { /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); - + /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; @@ -59,8 +59,8 @@ class RESquared : public BaseEllipsoid { /// Device Error Flag - Set if a bad matrix inversion occurs UCL_D_Vec dev_error; - - // --------------------------- TYPE DATA -------------------------- + + // --------------------------- TYPE DATA -------------------------- /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form UCL_D_Vec lj1; @@ -70,12 +70,12 @@ class RESquared : public BaseEllipsoid { UCL_D_Vec sigma_epsilon; /// special lj 0-4 UCL_D_Vec special_lj; - + /// If atom type constants fit in shared memory, use fast kernels bool _shared_types; int _lj_types; - - // --------------------------- ATOM DATA -------------------------- + + // --------------------------- ATOM DATA -------------------------- /// Aspherical Const Data for Atoms UCL_D_Vec shape, well; diff --git a/lib/gpu/lal_re_squared_ext.cpp b/lib/gpu/lal_re_squared_ext.cpp index e1d8fffb8f..b719dfe05f 100644 --- a/lib/gpu/lal_re_squared_ext.cpp +++ b/lib/gpu/lal_re_squared_ext.cpp @@ -28,8 +28,8 @@ static RESquared REMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, - double **sigma, double **epsilon, - int **form, double **host_lj1, double **host_lj2, + double **sigma, double **epsilon, + int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -56,7 +56,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, + init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, 
epsilon, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); @@ -64,7 +64,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, REMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ void re_gpu_clear() { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); @@ -114,8 +114,8 @@ int** re_gpu_compute_n(const int ago, const int inum_full, const int nall, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { - return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, - tag, nspecial, special, eflag, vflag, eatom, vatom, + return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index 4742e5bd8e..d69dae2461 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -129,32 +129,32 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict shape, const __global numtyp4 *restrict well, - const __global numtyp *restrict splj, + const __global numtyp *restrict splj, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - const int astride, - __global acctyp *restrict engv, - __global int *restrict err_flag, - const int eflag, const int vflag, - const int inum, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); @@ -177,7 +177,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; @@ -223,7 +223,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, sigma = sig_eps[mtype].x; epsilon = sig_eps[mtype].y*factor_lj; - numtyp aTs[9]; + numtyp aTs[9]; numtyp4 scorrect; numtyp half_sigma=sigma*(numtyp)0.5; scorrect.x = ishape.x+half_sigma; @@ -260,7 
+260,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0; Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; Ua = epsilon*Ua*sigmap3*solv_f_a; - + stemp = h12/cr60; Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ (numtyp)60.0; @@ -290,7 +290,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); numtyp dspu = ucl_recip(h12)-hsec+stemp; numtyp pbsu = (numtyp)3.0*sigma*hsec; - + stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ ucl_recip(ishape.z*cr60+h12)+ @@ -298,7 +298,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); numtyp dspr = (numtyp)7.0/h12-hsec+stemp; numtyp pbsr = b_alpha*sigma*hsec; - + #pragma unroll for (int i=0; i<3; i++) { numtyp u[3]; @@ -334,7 +334,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } } - + // torque on i numtyp fwae[3]; gpu_row_times3(fourw,aTe,fwae); @@ -384,33 +384,33 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict q, + const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, const __global numtyp4 *restrict well, const __global numtyp *restrict splj, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); @@ -429,7 +429,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj, n_stride,nbor_end,nbor); - + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; @@ -445,7 +445,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp a[9]; // Rotation matrix (lab->body) numtyp aTe[9]; // A'*E numtyp4 ishape; - + ishape=shape[itype]; gpu_quat_to_mat_trans(q,i,a); gpu_transpose_times_diag3(a,well[itype],aTe); @@ -467,7 +467,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, sigma = sig_eps[mtype].x; epsilon = sig_eps[mtype].y*factor_lj; - numtyp aTs[9]; + numtyp aTs[9]; numtyp4 scorrect; numtyp half_sigma=sigma * (numtyp)0.5; scorrect.x = ishape.x+half_sigma; @@ -477,7 +477,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5; scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5; gpu_transpose_times_diag3(a,scorrect,aTs); - + // energy numtyp gamma[9], s[3]; 
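// [editor's note] Every kernel in lal_re_squared_lj.cu stages the four
// special-neighbor LJ scale factors into local (shared) memory before its
// pair loop; the two variants below are copied from the kernels in this
// file for easy comparison. The plain kernels let every thread redundantly
// store all four values:
//
//   __local numtyp sp_lj[4];
//   sp_lj[0]=splj[0]; sp_lj[1]=splj[1];
//   sp_lj[2]=splj[2]; sp_lj[3]=splj[3];
//
// while the *_fast kernels load cooperatively and synchronize once before
// any thread reads the shared copies:
//
//   if (tid<4)
//     sp_lj[tid]=gum[tid];
//   __syncthreads();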
@@ -505,7 +505,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp ilshape=ishape.x*ishape.y*ishape.z; Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; Ua = epsilon*Ua*sigmap3*solv_f_a; - + stemp = h12/cr60; Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ (numtyp)60.0; @@ -535,7 +535,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); numtyp dspu = ucl_recip(h12)-hsec+stemp; numtyp pbsu = (numtyp)3.0*sigma*hsec; - + stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ ucl_recip(ishape.z*cr60+h12)+ @@ -543,7 +543,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); numtyp dspr = (numtyp)7.0/h12-hsec+stemp; numtyp pbsr = b_alpha*sigma*hsec; - + #pragma unroll for (int i=0; i<3; i++) { numtyp u[3]; @@ -584,15 +584,15 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict gum, - const int stride, - const __global int *dev_ij, +__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict gum, + const int stride, + const __global int *dev_ij, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, const int start, const int inum, const int t_per_atom) { @@ -601,10 +601,10 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=gum[0]; - sp_lj[1]=gum[1]; - sp_lj[2]=gum[2]; - sp_lj[3]=gum[3]; + sp_lj[0]=gum[0]; + sp_lj[1]=gum[1]; + sp_lj[2]=gum[2]; + sp_lj[3]=gum[3]; acctyp energy=(acctyp)0; acctyp4 f; @@ -614,20 +614,20 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); - energy+=factor_lj*(e-lj3[ii].z); + energy+=factor_lj*(e-lj3[ii].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -671,33 +671,33 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict gum, +__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict gum, const int stride, const __global int *dev_ij, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; - __local numtyp sp_lj[4]; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (tid<4) - sp_lj[tid]=gum[tid]; + sp_lj[tid]=gum[tid]; if (tid0) lj3[tid]=lj3_in[tid]; } - + acctyp 
energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; @@ -706,9 +706,9 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_soft.cpp b/lib/gpu/lal_soft.cpp index c206a997a9..337bdd6738 100644 --- a/lib/gpu/lal_soft.cpp +++ b/lib/gpu/lal_soft.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ SoftT::Soft() : BaseAtomic(), _allocated(false) { } template -SoftT::~Soft() { +SoftT::~Soft() { clear(); } - + template int SoftT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -89,14 +89,14 @@ int SoftT::init(const int ntypes, double **host_cutsq, template void SoftT::reinit(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor, host_cut,host_cutsq); } @@ -134,7 +134,7 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu index b7c32b6879..831b986725 100644 --- a/lib/gpu/lal_soft.cu +++ b/lib/gpu/lal_soft.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -26,7 +26,7 @@ texture pos_tex; #define MY_PI (acctyp)3.14159265358979323846 -__kernel void k_soft(const __global numtyp4 *restrict x_, +__kernel void k_soft(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, const int lj_types, const __global numtyp *restrict sp_lj_in, @@ -51,20 +51,20 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0) force = factor_lj * coeff[mtype].x * sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r); else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,7 +111,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_soft_fast(const __global numtyp4 *restrict x_, +__kernel void k_soft_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, @@ -122,7 +122,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 
coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -130,7 +130,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, if (tid (numtyp)0.0) force = factor_lj * coeff[mtype].x * sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r); else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_soft.h b/lib/gpu/lal_soft.h index 7fa529c4f5..e72673248c 100644 --- a/lib/gpu/lal_soft.h +++ b/lib/gpu/lal_soft.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Soft : public BaseAtomic { public: Soft(); - ~Soft(); + ~Soft(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successful * - -1 if fix gpu not found @@ -40,14 +40,14 @@ class Soft : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Soft : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 9591923965..441fe35839 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, cell_size, gpu_split, screen); SLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,16 +96,16 @@ void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor, int world_me=SLMF.device->world_me(); int gpu_rank=SLMF.device->gpu_rank(); int procs_per_gpu=SLMF.device->procs_per_gpu(); - + if (world_me==0) SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut); - + SLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -124,7 +124,7 @@ int ** soft_gpu_compute_n(const int ago, const int inum_full, return SLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} void
soft_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index e2d1b5e4dd..8cb51307a1 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -27,14 +27,14 @@ static SW SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, +int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const double cell_size, int &gpu_mode, FILE *screen, int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* sw_epsilon, const double* sw_sigma, const double* sw_lambda, const double* sw_gamma, const double* sw_costheta, const double* sw_biga, const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, + const double* sw_powerq, const double* sw_cut, const double* sw_cutsq) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); @@ -46,7 +46,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ int procs_per_gpu=SWMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; SWMF.device->init_message(screen,"sw/gpu",first_gpu,last_gpu); @@ -64,7 +64,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ if (world_me==0) init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, + sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->world_barrier(); @@ -83,12 +83,12 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ if (gpu_rank==i && world_me!=0) init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, + sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, + sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -113,12 +113,12 @@ int ** sw_gpu_compute_n(const int ago, const int inum_full, return SWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} +} -void sw_gpu_compute(const int ago, const int nlocal, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, +void sw_gpu_compute(const int ago, const int nlocal, const int nall, + const int nlist, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { SWMF.compute(ago,nlocal,nall,nlist,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); diff --git a/lib/gpu/lal_table.cpp b/lib/gpu/lal_table.cpp index 
c99bf85815..0de59c84b2 100644 --- a/lib/gpu/lal_table.cpp +++ b/lib/gpu/lal_table.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -34,35 +34,35 @@ using namespace LAMMPS_AL; extern Device device; template -TableT::Table() : BaseAtomic(), +TableT::Table() : BaseAtomic(), _allocated(false), _compiled_styles(false) { } template -TableT::~Table() { +TableT::~Table() { clear(); } - + template int TableT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int TableT::init(const int ntypes, +int TableT::init(const int ntypes, double **host_cutsq, double ***host_table_coeffs, double **host_table_data, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, + const double gpu_split, FILE *_screen, int tabstyle, int ntables, int tablength) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,table,"k_table"); if (success!=0) return success; - + k_pair_linear.set_function(*(this->pair_program),"k_table_linear"); k_pair_linear_fast.set_function(*(this->pair_program),"k_table_linear_fast"); k_pair_spline.set_function(*(this->pair_program),"k_table_spline"); @@ -80,38 +80,38 @@ int TableT::init(const int ntypes, shared_types=true; } _lj_types=lj_types; - + _tabstyle = tabstyle; _ntables = ntables; if (tabstyle != BITMAP) _tablength = tablength; else _tablength = 1 << tablength; - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write_int(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; iucl_device),UCL_READ_ONLY); nshiftbits.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); nmask.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int ix=1; ix host_write(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -151,7 +151,7 @@ int TableT::init(const int ntypes, host_write2[n*_tablength+k].z = host_table_data[n][6*k+2]; // f host_write2[n*_tablength+k].w = (numtyp)0; } - } + } } ucl_copy(coeff3,host_write2,false); @@ -166,21 +166,21 @@ int TableT::init(const int ntypes, for (int n=0; n<_ntables; n++) { if (tabstyle == LINEAR) { for (int k=0; k<_tablength-1; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // de host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // df host_write2[n*_tablength+k].w = (numtyp)0; } } else if (tabstyle == SPLINE) { for (int k=0; k<_tablength; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // e2 host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // f2 host_write2[n*_tablength+k].w = (numtyp)0; } } else if (tabstyle == BITMAP) { for (int k=0; k<_tablength; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // de host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // df host_write2[n*_tablength+k].w = host_table_data[n][6*k+5]; // drsq @@ -188,12 +188,12 @@ int TableT::init(const int ntypes, } } 
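// [editor's note] For orientation while reading the packing loops above:
// each table knot becomes one numtyp4, with per-table constants in coeff2
// (x=innersq, y=invdelta, z=deltasq6, per the comments in lal_table.h) and
// per-knot data in coeff3 (rsq/e/f) and coeff4 (de/df, or e2/f2 for SPLINE).
// A sketch of how a LINEAR lookup consumes them on the device; idx, the knot
// index within this pair type's slice of the table, is computed as in
// k_table_linear in lal_table.cu:
//
//   int itable = (rsq - coeff2[mtype].x)*coeff2[mtype].y; // (rsq-innersq)*invdelta
//   numtyp fraction = (rsq - coeff3[idx].x)*coeff2[mtype].y;
//   numtyp value = coeff3[idx].z + fraction*coeff4[idx].z; // f + fraction*df
//   numtyp force = factor_lj*value;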
ucl_copy(coeff4,host_write2,false); - + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); - + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -220,7 +220,7 @@ void TableT::clear() { coeff3.clear(); coeff4.clear(); sp_lj.clear(); - + if (_compiled_styles) { k_pair_linear_fast.clear(); k_pair_linear.clear(); @@ -230,7 +230,7 @@ void TableT::clear() { k_pair_bitmap.clear(); _compiled_styles=false; } - + this->clear_atomic(); } @@ -256,7 +256,7 @@ void TableT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -269,67 +269,67 @@ void TableT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, - &this->ans->engv, &eflag, &vflag, &ainum, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { this->k_pair_linear_fast.set_size(GX,BX); - this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, + this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline_fast.set_size(GX,BX); - this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, + this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap_fast.set_size(GX,BX); this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits, &nmask, &coeff2, &coeff3, &coeff4, &cutsq, - &sp_lj, &this->nbor->dev_nbor, + &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, + &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); - } + } } else { if (_tabstyle == LOOKUP) { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, + this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { this->k_pair_linear.set_size(GX,BX); this->k_pair_linear.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, + &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline.set_size(GX,BX); 
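// [editor's note] The SPLINE kernels launched here evaluate the standard
// natural cubic-spline interpolant between knots i and i+1, with b = 1 - a,
// deltasq6 = delta^2/6 precomputed in coeff2.z, and e2/f2 the
// second-derivative tables packed into coeff4 above:
//
//   e = a*e_i + b*e_{i+1}
//       + ((a*a*a - a)*e2_i + (b*b*b - b)*e2_{i+1})*deltasq6;
//
// which matches the expression visible in k_table_spline in lal_table.cu.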
this->k_pair_spline.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &coeff4, &_lj_types, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap.set_size(GX,BX); - this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, + this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, &nmask, &coeff2, &coeff3, &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, + &nbor_pitch, &this->_threads_per_atom, &_tablength); } } diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index 1033b7fbb8..971b56d96e 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -39,39 +39,39 @@ typedef union { /// ---------------- LOOKUP ------------------------------------------------- -__kernel void k_table(const __global numtyp4 *restrict x_, +__kernel void k_table(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } @@ -136,21 +136,21 @@ __kernel void k_table(const __global numtyp4 *restrict x_, __kernel void k_table_fast(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const __global numtyp *restrict cutsq_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global 
numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -158,18 +158,18 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } @@ -235,24 +235,24 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, /// ---------------- LINEAR ------------------------------------------------- -__kernel void k_table_linear(const __global numtyp4 *restrict x_, +__kernel void k_table_linear(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -265,9 +265,9 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -334,23 +334,23 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, +__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const __global numtyp *restrict cutsq_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -358,7 +358,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e = (numtyp)0.0; - 
if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -439,39 +439,39 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, /// ---------------- SPLINE ------------------------------------------------- -__kernel void k_table_spline(const __global numtyp4 *restrict x_, +__kernel void k_table_spline(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; if (itable < tlm1) { - e = a * coeff3[idx].y + b * coeff3[idx+1].y + - ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * + e = a * coeff3[idx].y + b * coeff3[idx+1].y + + ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * coeff2[mtype].z; - } + } energy+=factor_lj*e; } if (vflag>0) { @@ -545,23 +545,23 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_table_spline_fast(const __global numtyp4 *x_, +__kernel void k_table_spline_fast(const __global numtyp4 *x_, const __global int *tabindex, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const __global numtyp *cutsq_in, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -569,7 +569,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, if (tid0) { numtyp e = (numtyp)0.0; if (itable < tlm1) { - e = a * coeff3[idx].y + b * coeff3[idx+1].y + - ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * + e = a * coeff3[idx].y + b * coeff3[idx+1].y + + ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * coeff2[mtype].z; - } + } energy+=factor_lj*e; } if (vflag>0) { @@ -657,41 +657,41 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, /// ---------------- BITMAP 
------------------------------------------------- -__kernel void k_table_bitmap(const __global numtyp4 *x_, +__kernel void k_table_bitmap(const __global numtyp4 *x_, const __global int *tabindex, - const __global int *nshiftbits, + const __global int *nshiftbits, const __global int *nmask, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const int lj_types, const __global numtyp *cutsq, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii>= nshiftbits[mtype]; @@ -734,14 +734,14 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, value = coeff3[idx].z + fraction*coeff4[idx].z; force = factor_lj * value; } else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e = (numtyp)0.0; - if (itable <= tlm1) + if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -761,25 +761,25 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, } // if ii } -__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, +__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, const __global int *tabindex, - const __global int *nshiftbits, + const __global int *nshiftbits, const __global int *nmask, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const __global numtyp *cutsq_in, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -787,18 +787,18 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, if (tid>= nshiftbits[mtype]; @@ -842,14 +842,14 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, value = coeff3[idx].z + fraction*coeff4[idx].z; force = factor_lj * value; } else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e = (numtyp)0.0; - if (itable <= tlm1) + if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } diff --git a/lib/gpu/lal_table.h b/lib/gpu/lal_table.h index 0e04737d27..f667336679 100644 --- a/lib/gpu/lal_table.h +++ 
b/lib/gpu/lal_table.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Table : public BaseAtomic { public: Table(); - ~Table(); + ~Table(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,10 +38,10 @@ class Table : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double** cutsq, double ***host_table_coeffs, - double **host_table_data, + double **host_table_data, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, int tabstyle, int ntables, int tablength); @@ -54,42 +54,42 @@ class Table : public BaseAtomic { /// Total host memory used by library for pair style double host_memory_usage() const; - + // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_pair_linear, k_pair_linear_fast; UCL_Kernel k_pair_spline, k_pair_spline_fast; UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast; - + // --------------------------- TYPE DATA -------------------------- UCL_D_Vec tabindex, nshiftbits, nmask; - - /// coeff2.x = innersq, coeff2.y = invdelta, coeff2.z = deltasq6, + + /// coeff2.x = innersq, coeff2.y = invdelta, coeff2.z = deltasq6, UCL_D_Vec coeff2; - + /// coeff3.x = rsq, coeff3.y = e, coeff3.z = f UCL_D_Vec coeff3; - + /// coeff4.x = de, coeff4.y = df UCL_D_Vec coeff4; - + UCL_D_Vec cutsq; - + /// Special LJ values UCL_D_Vec sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// Table style, length and number of tables int _tabstyle,_tablength,_ntables; - + private: bool _allocated, _compiled_styles; - + void loop(const bool _eflag, const bool _vflag); }; diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp index 172acb7d39..4eb7e0ce1b 100644 --- a/lib/gpu/lal_table_ext.cpp +++ b/lib/gpu/lal_table_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ static Table TBMF; int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, double **table_data, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, + const double cell_size, int &gpu_mode, FILE *screen, int tabstyle, int ntables, int tablength) { TBMF.clear(); gpu_mode=TBMF.device->gpu_mode(); @@ -55,7 +55,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, int init_ok=0; if (world_me==0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, 
+                      special_lj, inum, nall, 300, maxspecial, cell_size,
                       gpu_split, screen, tabstyle, ntables, tablength);

   TBMF.device->world_barrier();
@@ -73,11 +73,11 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
     }
     if (gpu_rank==i && world_me!=0)
       init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data,
-                        special_lj, inum, nall, 300, maxspecial, cell_size, 
+                        special_lj, inum, nall, 300, maxspecial, cell_size,
                         gpu_split, screen, tabstyle, ntables, tablength);

     TBMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -102,7 +102,7 @@ int ** table_gpu_compute_n(const int ago, const int inum_full,
   return TBMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, ilist, jnum, cpu_time, success);
-} 
+}

 void table_gpu_compute(const int ago, const int inum_full, const int nall,
                        double **host_x, int *host_type, int *ilist, int *numj,
diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp
index 585dc069a0..88cb8cdb3c 100644
--- a/lib/gpu/lal_yukawa.cpp
+++ b/lib/gpu/lal_yukawa.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -33,19 +33,19 @@ YukawaT::Yukawa() : BaseAtomic<numtyp, acctyp>(), _allocated(false) {
 }

 template <class numtyp, class acctyp>
-YukawaT::~Yukawa() { 
+YukawaT::~Yukawa() {
   clear();
 }
- 
+
 template <class numtyp, class acctyp>
 int YukawaT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }

 template <class numtyp, class acctyp>
-int YukawaT::init(const int ntypes, 
+int YukawaT::init(const int ntypes,
                   double **host_cutsq, double kappa,
-                  double **host_a, double **host_offset, 
+                  double **host_a, double **host_offset,
                   double *host_special_lj, const int nlocal,
                   const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
@@ -83,7 +83,7 @@ int YukawaT::init(const int ntypes,
   ucl_copy(sp_lj,dview,false);

   _kappa = kappa;
-  
+
   _allocated=true;
   this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
   return 0;
@@ -122,7 +122,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-  
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
@@ -134,7 +134,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
-                          &vflag, &ainum, &nbor_pitch, 
+                          &vflag, &ainum, &nbor_pitch,
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu
index b0c3b9978d..a8d637ec97 100644
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@@ -9,7 +9,7 @@
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 //  __________________________________________________________________________
 //
-//    begin                : 
+//    begin                :
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/
@@ -24,14 +24,14 @@ texture pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_yukawa(const __global numtyp4 *restrict x_, 
+__kernel void k_yukawa(const __global numtyp4 *restrict x_,
                        const __global numtyp4 *restrict coeff,
                        const numtyp kappa, const int lj_types,
-                       const __global numtyp *restrict sp_lj_in, 
-                       const __global int *dev_nbor, 
-                       const __global int *dev_packed, 
+                       const __global numtyp *restrict sp_lj_in,
+                       const __global int *dev_nbor,
+                       const __global int *dev_packed,
                        __global acctyp4 *restrict ans,
-                       __global acctyp *restrict engv, 
+                       __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -49,20 +49,20 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-  
+
   if (ii0) {
       numtyp e=coeff[mtype].x*screening*rinv;
-      energy+=factor_lj*(e-coeff[mtype].y); 
+      energy+=factor_lj*(e-coeff[mtype].y);
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -109,19 +109,19 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
   } // if ii
 }

-__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, 
+__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict coeff_in,
-                            const numtyp kappa, 
-                            const __global numtyp *restrict sp_lj_in, 
-                            const __global int *dev_nbor, 
-                            const __global int *dev_packed, 
-                            __global acctyp4 *restrict ans, 
-                            __global acctyp *restrict engv, 
-                            const int eflag, const int vflag, const int inum, 
+                            const numtyp kappa,
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv,
+                            const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
-  
+
   __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
   if (tid<4)
@@ -129,7 +129,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
   if (tid0) {
       numtyp e=coeff[mtype].x*screening*rinv;
-      energy+=factor_lj*(e-coeff[mtype].y); 
+      energy+=factor_lj*(e-coeff[mtype].y);
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_yukawa.h b/lib/gpu/lal_yukawa.h
index 720dc903d0..4cc23c03e9 100644
--- a/lib/gpu/lal_yukawa.h
+++ b/lib/gpu/lal_yukawa.h
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
 class Yukawa : public BaseAtomic<numtyp, acctyp> {
  public:
   Yukawa();
-  ~Yukawa(); 
+  ~Yukawa();

   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    * 
+    *
     * Returns:
     * - 0 if successfull
     * - -1 if fix gpu not found
@@ -39,8 +39,8 @@ class Yukawa : public BaseAtomic<numtyp, acctyp> {
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq, double kappa,
            double **host_a, double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors, 
-           const int maxspecial, const double cell_size, 
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);

   /// Clear all host and device data
@@ -57,16 +57,16 @@ class Yukawa : public BaseAtomic<numtyp, acctyp> {

   /// coeff.x = a, coeff.y = offset, coeff.z = cutsq
   UCL_D_Vec<numtyp4> coeff;
-  
+
   /// Special LJ values
   UCL_D_Vec<numtyp> sp_lj;

   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;

-  /// Number of atom types 
+  /// Number of atom types
   int _lj_types;
-  
+
   /// kappa
   numtyp _kappa;
diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp
index 70282a7117..bfe398c62e 100644
--- a/lib/gpu/lal_yukawa_colloid.cpp
+++ b/lib/gpu/lal_yukawa_colloid.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -29,23 +29,23 @@ using namespace LAMMPS_AL;
 extern Device<PRECISION,ACC_PRECISION> device;

 template <class numtyp, class acctyp>
-YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp, acctyp>(), 
+YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp, acctyp>(),
                                   _max_rad_size(0), _allocated(false) {
 }

 template <class numtyp, class acctyp>
-YukawaColloidT::~YukawaColloid() { 
+YukawaColloidT::~YukawaColloid() {
   clear();
 }
- 
+
 template <class numtyp, class acctyp>
 int YukawaColloidT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }

 template <class numtyp, class acctyp>
-int YukawaColloidT::init(const int ntypes, 
-                         double **host_cutsq, double **host_a, 
+int YukawaColloidT::init(const int ntypes,
+                         double **host_cutsq, double **host_a,
                          double **host_offset, double *host_special_lj,
                          const int nlocal, const int nall, const int max_nbors,
                          const int maxspecial, const double cell_size,
@@ -62,16 +62,16 @@ int YukawaColloidT::init(const int ntypes,
     _shared_view=false;

   // allocate rad
-  
+
   int ef_nall=nall;
   if (ef_nall==0)
     ef_nall=2000;
-  
+
   _max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  
+
   if (_shared_view==false)
     c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
-  
+
   rad_tex.get_texture(*(this->pair_program),"rad_tex");
   rad_tex.bind_float(c_rad,1);
@@ -102,7 +102,7 @@ int YukawaColloidT::init(const int ntypes,
   sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
   dview.view(host_special_lj,4,*(this->ucl_device));
   ucl_copy(sp_lj,dview,false);
-  
+
   _allocated=true;
   this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
   return 0;
@@ -131,15 +131,15 @@ double YukawaColloidT::host_memory_usage() const {
 // Copy nbor list from host if necessary and then compute atom energies/forces
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void YukawaColloidT::compute(const int f_ago, const int inum_full, 
-                             const int nall, double **host_x, int *host_type, int *ilist, 
+void YukawaColloidT::compute(const int f_ago, const int inum_full,
+                             const int nall, double **host_x, int *host_type, int *ilist,
                              int *numj, int **firstneigh, const bool eflag,
                              const bool vflag, const bool eatom,
                              const bool vatom, int &host_start,
                              const double cpu_time, bool &success, double *rad) {
   this->acc_timers();
-  
+
   // ------------------- Resize rad array --------------------------
-  
+
   if (nall>_max_rad_size) {
     _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
     if (_shared_view==false) {
@@ -157,7 +157,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full,
     this->zero_timers();
     return;
   }
-  
+
   int ago=this->hd_balancer.ago_first(f_ago);
   int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
   this->ans->inum(inum);
@@ -170,7 +170,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full,
     if (!success)
       return;
   }
-  
+
   this->atom->cast_x_data(host_x,host_type);
   this->cast_rad_data(rad);
   this->hd_balancer.start_timer();
@@ -182,7 +182,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full,
   this->device->add_ans_object(this->ans);
   this->hd_balancer.stop_timer();
 }
-  
+
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU and then compute per-atom densities
 // ---------------------------------------------------------------------------
@@ -190,24 +190,24 @@ template <class numtyp, class acctyp>
 int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall,
                               double **host_x, int *host_type, double *sublo,
                               double *subhi, tagint *tag, int **nspecial,
-                              tagint **special, const bool eflag, const bool vflag, 
-                              const bool eatom, const bool vatom, int &host_start, 
+                              tagint **special, const bool eflag, const bool vflag,
+                              const bool eatom, const bool vatom, int &host_start,
                               int **ilist, int **jnum, const double cpu_time,
                               bool &success, double *rad) {
   this->acc_timers();
-  
+
   // ------------------- Resize rad array ----------------------------
-  
+
   if (nall>_max_rad_size) {
     _max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
     if (_shared_view==false) {
       c_rad.resize(_max_rad_size);
       rad_tex.bind_float(c_rad,1);
     }
-  } 
+  }
   // -----------------------------------------------------------------
-  
+
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
@@ -215,21 +215,21 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall
     this->zero_timers();
     return NULL;
   }
-  
+
   // load balance, returning the atom count on the device (inum)
   this->hd_balancer.balance(cpu_time);
   int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
   this->ans->inum(inum);
   host_start=inum;
-  
-  // Build neighbor list on GPU if necessary 
+
+  // Build neighbor list on GPU if necessary
   if (ago==0) {
     this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                           sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     this->cast_rad_data(rad);
-    this->hd_balancer.start_timer(); 
+    this->hd_balancer.start_timer();
   } else {
     this->atom->cast_x_data(host_x,host_type);
     this->cast_rad_data(rad);
@@ -265,7 +265,7 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-  
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
@@ -280,8 +280,8 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) {
                           &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj, 
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+    this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
                      &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
   }
diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu
index f9f4767123..ad02f202a3 100644
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@@ -9,7 +9,7 @@
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 //  __________________________________________________________________________
 //
-//    begin                : 
+//    begin                :
 //    email                : nguyentd@ornl.gov
 // ***************************************************************************/
@@ -29,15 +29,15 @@ texture rad_tex;
 #define rad_tex rad_
 #endif

-__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, 
+__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict rad_,
-                               const __global numtyp4 *restrict coeff, 
-                               const int lj_types, 
-                               const __global numtyp *restrict sp_lj_in, 
-                               const __global int *dev_nbor, 
-                               const __global int *dev_packed, 
+                               const __global numtyp4 *restrict coeff,
+                               const int lj_types,
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor,
+                               const __global int *dev_packed,
                                __global acctyp4 *restrict ans,
-                               __global acctyp *restrict engv, 
+                               __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch, const int t_per_atom,
                                const numtyp kappa) {
@@ -56,21 +56,21 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-  
+
   if (ii0) {
       numtyp e=coeff[mtype].x/kappa * screening;
-      energy+=factor_lj*(e-coeff[mtype].y); 
+      energy+=factor_lj*(e-coeff[mtype].y);
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -118,20 +118,20 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
   } // if ii
 }

-__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, 
+__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
                                     const __global numtyp *restrict rad_,
-                                    const __global numtyp4 *restrict coeff_in, 
+                                    const __global numtyp4 *restrict coeff_in,
                                     const __global numtyp *restrict sp_lj_in,
-                                    const __global int *dev_nbor, 
-                                    const __global int *dev_packed, 
-                                    __global acctyp4 *restrict ans, 
-                                    __global acctyp *restrict engv, 
-                                    const int eflag, const int vflag, 
-                                    const int inum, const int nbor_pitch, 
+                                    const __global int *dev_nbor,
+                                    const __global int *dev_packed,
+                                    __global acctyp4 *restrict ans,
+                                    __global acctyp *restrict engv,
+                                    const int eflag, const int vflag,
+                                    const int inum, const int nbor_pitch,
                                     const int t_per_atom, const numtyp kappa) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
-  
+
   __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
   if (tid<4)
@@ -139,7 +139,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
   if (tid0) {
       numtyp e=coeff[mtype].x/kappa * screening;
-      energy+=factor_lj*(e-coeff[mtype].y); 
+      energy+=factor_lj*(e-coeff[mtype].y);
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_yukawa_colloid.h b/lib/gpu/lal_yukawa_colloid.h
index 5a9ee7ae6e..ba69bc4bae 100644
--- a/lib/gpu/lal_yukawa_colloid.h
+++ b/lib/gpu/lal_yukawa_colloid.h
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
 class YukawaColloid : public BaseAtomic<numtyp, acctyp> {
  public:
   YukawaColloid();
-  ~YukawaColloid(); 
+  ~YukawaColloid();

   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    * 
+    *
     * Returns:
     * - 0 if successfull
     * - -1 if fix gpu not found
@@ -39,8 +39,8 @@ class YukawaColloid : public BaseAtomic<numtyp, acctyp> {
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq, double **host_a,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors, 
-           const int maxspecial, const double cell_size, 
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, const double kappa);

   inline void cast_rad_data(double* rad) {
@@ -70,22 +70,22 @@ class YukawaColloid : public BaseAtomic<numtyp, acctyp> {

   /// Total host memory used by library for pair style
   double host_memory_usage() const;
-  
+
   /// Pair loop with host neighboring
-  void compute(const int f_ago, const int inum_full, 
-               const int nall, double **host_x, int *host_type, 
-               int *ilist, int *numj, int **firstneigh, 
+  void compute(const int f_ago, const int inum_full,
+               const int nall, double **host_x, int *host_type,
+               int *ilist, int *numj, int **firstneigh,
                const bool eflag, const bool vflag, const bool eatom,
                const bool vatom, int &host_start, const double cpu_time,
                bool &success, double *rad);
-  
+
   /// Pair loop with device neighboring
   int** compute(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, double *sublo,
                 double *subhi, tagint *tag, int **nspecial,
-                tagint **special, const bool eflag, const bool vflag, 
-                const bool eatom, const bool vatom, int &host_start, 
-                int **ilist, int **jnum, const double cpu_time, 
+                tagint **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **jnum, const double cpu_time,
                 bool &success, double *rad);

   // --------------------------- TEXTURES -----------------------------
@@ -101,7 +101,7 @@ class YukawaColloid : public BaseAtomic<numtyp, acctyp> {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;

-  /// Number of atom types 
+  /// Number of atom types
   int _lj_types;

   int _max_rad_size;
diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp
index 0e3c653e06..b9ce51e522 100644
--- a/lib/gpu/lal_yukawa_colloid_ext.cpp
+++ b/lib/gpu/lal_yukawa_colloid_ext.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -27,10 +27,10 @@ static YukawaColloid<PRECISION,ACC_PRECISION> YKCOLLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, 
+int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
                        double **host_offset, double *special_lj, const int inum,
                        const int nall, const int max_nbors, const int maxspecial,
-                       const double cell_size, int &gpu_mode, FILE *screen, 
+                       const double cell_size, int &gpu_mode, FILE *screen,
                        const double kappa) {
   YKCOLLMF.clear();
   gpu_mode=YKCOLLMF.device->gpu_mode();
@@ -54,8 +54,8 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
   int init_ok=0;
   if (world_me==0)
-    init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, 
-                          inum, nall, 300, maxspecial, cell_size, gpu_split, 
+    init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
+                          inum, nall, 300, maxspecial, cell_size, gpu_split,
                           screen, kappa);

   YKCOLLMF.device->world_barrier();
@@ -72,12 +72,12 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, 
-                            inum, nall, 300, maxspecial, cell_size, gpu_split, 
+      init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
+                            inum, nall, 300, maxspecial, cell_size, gpu_split,
                             screen, kappa);

     YKCOLLMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -103,11 +103,11 @@ int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full,
                                subhi, tag, nspecial, special, eflag, vflag, eatom,
                                vatom, host_start, ilist, jnum, cpu_time, success, host_rad);
-} 
+}

-void ykcolloid_gpu_compute(const int ago, const int inum_full, 
-                           const int nall, double **host_x, int *host_type, 
-                           int *ilist, int *numj, int **firstneigh, 
+void ykcolloid_gpu_compute(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           int *ilist, int *numj, int **firstneigh,
                            const bool eflag, const bool vflag, const bool eatom,
                            const bool vatom, int &host_start,
                            const double cpu_time, bool &success, double *host_rad) {
diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp
index 1cc89885aa..5136e3ea53 100644
--- a/lib/gpu/lal_yukawa_ext.cpp
+++ b/lib/gpu/lal_yukawa_ext.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : nguyentd@ornl.gov
  ***************************************************************************/
@@ -28,9 +28,9 @@ static Yukawa<PRECISION,ACC_PRECISION> YKMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
-                    double **host_a, double **offset, double *special_lj, 
-                    const int inum, const int nall, const int max_nbors, 
-                    const int maxspecial, const double cell_size, 
+                    double **host_a, double **offset, double *special_lj,
+                    const int inum, const int nall, const int max_nbors,
+                    const int maxspecial, const double cell_size,
                     int &gpu_mode, FILE *screen) {
   YKMF.clear();
   gpu_mode=YKMF.device->gpu_mode();
@@ -54,8 +54,8 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
   int init_ok=0;
   if (world_me==0)
-    init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, 
-                      inum, nall, 300, maxspecial, cell_size, 
+    init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj,
+                      inum, nall, 300, maxspecial, cell_size,
                       gpu_split, screen);

   YKMF.device->world_barrier();
@@ -72,12 +72,12 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, 
-                        inum, nall, 300, maxspecial, cell_size, 
+      init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj,
+                        inum, nall, 300, maxspecial, cell_size,
                         gpu_split, screen);

     YKMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -102,7 +102,7 @@ int ** yukawa_gpu_compute_n(const int ago, const int inum_full,
   return YKMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, ilist, jnum, cpu_time, success);
-} 
+}

 void yukawa_gpu_compute(const int ago, const int inum_full, const int nall,
                         double **host_x, int *host_type, int *ilist, int *numj,
diff --git a/lib/gpu/lal_zbl.cpp b/lib/gpu/lal_zbl.cpp
index e172d48b33..a45faf01c3 100644
--- a/lib/gpu/lal_zbl.cpp
+++ b/lib/gpu/lal_zbl.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
  ***************************************************************************/
@@ -33,10 +33,10 @@ ZBLT::ZBL() : BaseAtomic<numtyp, acctyp>(), _allocated(false) {
 }

 template <class numtyp, class acctyp>
-ZBLT::~ZBL() { 
+ZBLT::~ZBL() {
   clear();
 }
- 
+
 template <class numtyp, class acctyp>
 int ZBLT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -44,15 +44,15 @@ int ZBLT::bytes_per_atom(const int max_nbors) const {

 template <class numtyp, class acctyp>
 int ZBLT::init(const int ntypes, double **host_cutsq,
-               double **host_sw1, double **host_sw2, 
-               double **host_sw3, double **host_sw4, 
+               double **host_sw1, double **host_sw2,
+               double **host_sw3, double **host_sw4,
                double **host_sw5,
-               double **host_d1a, double **host_d2a, 
-               double **host_d3a, double **host_d4a, 
-               double **host_zze, double cut_globalsq, 
+               double **host_d1a, double **host_d2a,
+               double **host_d3a, double **host_d4a,
+               double **host_zze, double cut_globalsq,
                double cut_innersq, double cut_inner,
-               const int nlocal, const int nall, const int max_nbors, 
-               const int maxspecial, const double cell_size, 
+               const int nlocal, const int nall, const int max_nbors,
+               const int maxspecial, const double cell_size,
                const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@@ -88,7 +88,7 @@ int ZBLT::init(const int ntypes, double **host_cutsq,
   coeff3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,coeff3,host_write,host_sw3,host_sw4,host_sw5);

-  _cut_globalsq = cut_globalsq; 
+  _cut_globalsq = cut_globalsq;
   _cut_innersq = cut_innersq;
   _cut_inner = cut_inner;
@@ -131,7 +131,7 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-  
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
diff --git a/lib/gpu/lal_zbl.cu b/lib/gpu/lal_zbl.cu
index b14753b5fa..30bbc8aa2e 100644
--- a/lib/gpu/lal_zbl.cu
+++ b/lib/gpu/lal_zbl.cu
@@ -9,7 +9,7 @@
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 //  __________________________________________________________________________
 //
-//    begin                : 
+//    begin                :
 //    email                : ndactrung@gmail.com
 // ***************************************************************************/
@@ -35,9 +35,9 @@ texture pos_tex;
    compute ZBL pair energy
 ------------------------------------------------------------------------- */

-ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij, 
+ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij,
                         numtyp d3aij, numtyp d4aij, numtyp zzeij) {
-  
+
   numtyp rinv = ucl_recip(r);

   numtyp sum = c1*ucl_exp(-d1aij*r);
@@ -54,7 +54,7 @@ ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij,
    compute ZBL first derivative
 ------------------------------------------------------------------------- */

-ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij, 
+ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij,
                          numtyp d3aij, numtyp d4aij, numtyp zzeij) {

   numtyp rinv = ucl_recip(r);
@@ -72,24 +72,24 @@ ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij,
   sum_p -= c2*d2aij*e2;
   sum_p -= c3*d3aij*e3;
   sum_p -= c4*d4aij*e4;
-  
+
   numtyp result = zzeij*(sum_p - sum*rinv)*rinv;
-  
+
   return result;
 };

-__kernel void k_zbl(const __global numtyp4 *restrict x_, 
+__kernel void k_zbl(const __global numtyp4 *restrict x_,
                     const __global numtyp4 *restrict coeff1,
                     const __global numtyp4 *restrict coeff2,
                     const __global numtyp4 *restrict coeff3,
-                    const double cut_globalsq, 
-                    const double cut_innersq, 
-                    const double cut_inner, 
-                    const int lj_types, 
-                    const __global int *dev_nbor, 
-                    const __global int *dev_packed, 
+                    const double cut_globalsq,
+                    const double cut_innersq,
+                    const double cut_inner,
+                    const int lj_types,
+                    const __global int *dev_nbor,
+                    const __global int *dev_packed,
                     __global acctyp4 *restrict ans,
-                    __global acctyp *restrict engv, 
+                    __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -101,19 +101,19 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-  
+
   if (iicut_innersq) {
       t = r - cut_inner;
       force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
@@ -146,14 +146,14 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
     f.z+=delz*force;

     if (eflag>0) {
-      numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, 
+      numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                      coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
       e += coeff3[mtype].z;
       if (rsq > cut_innersq) {
         e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
       }
-      energy+=e; 
+      energy+=e;
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -171,22 +171,22 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
   } // if ii
 }

-__kernel void k_zbl_fast(const __global numtyp4 *restrict x_, 
+__kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict coeff1_in,
                          const __global numtyp4 *restrict coeff2_in,
                          const __global numtyp4 *restrict coeff3_in,
-                         const double cut_globalsq, 
-                         const double cut_innersq, 
-                         const double cut_inner, 
+                         const double cut_globalsq,
+                         const double cut_innersq,
+                         const double cut_inner,
                          const __global int *dev_nbor,
-                         const __global int *dev_packed, 
+                         const __global int *dev_packed,
                          __global acctyp4 *restrict ans,
-                         __global acctyp *restrict engv, 
-                         const int eflag, const int vflag, const int inum, 
+                         __global acctyp *restrict engv,
+                         const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
-  
+
   __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -195,7 +195,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     coeff2[tid]=coeff2_in[tid];
     coeff3[tid]=coeff3_in[tid];
   }
-  
+
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -204,7 +204,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   __syncthreads();
-  
+
   if (iicut_innersq) {
       t = r - cut_inner;
       force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
@@ -249,14 +249,14 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     f.z+=delz*force;

     if (eflag>0) {
-      numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, 
+      numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                      coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
       e += coeff3[mtype].z;
       if (rsq > cut_innersq) {
         e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
       }
-      energy+=e; 
+      energy+=e;
     }
     if (vflag>0) {
       virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_zbl.h b/lib/gpu/lal_zbl.h
index 2996d90a5c..9885fcedf2 100644
--- a/lib/gpu/lal_zbl.h
+++ b/lib/gpu/lal_zbl.h
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
  ***************************************************************************/
@@ -24,27 +24,27 @@ template <class numtyp, class acctyp>
 class ZBL : public BaseAtomic<numtyp, acctyp> {
  public:
   ZBL();
-  ~ZBL(); 
+  ~ZBL();

   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    * 
+    *
     * Returns:
     * - 0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
-  int init(const int ntypes, double **host_cutsq, double **host_sw1, 
+  int init(const int ntypes, double **host_cutsq, double **host_sw1,
            double **host_sw2, double **host_sw3, double **host_sw4,
            double **host_sw5,
-           double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a, 
+           double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a,
            double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner,
-           const int nlocal, const int nall, const int max_nbors, 
-           const int maxspecial, const double cell_size, 
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);
-  
+
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
@@ -70,8 +70,8 @@ class ZBL : public BaseAtomic<numtyp, acctyp> {
   double _cut_globalsq;
   double _cut_innersq;
   double _cut_inner;
-  
-  /// Number of atom types 
+
+  /// Number of atom types
   int _lj_types;

  private:
diff --git a/lib/gpu/lal_zbl_ext.cpp b/lib/gpu/lal_zbl_ext.cpp
index ddce858076..5fd003b8ca 100644
--- a/lib/gpu/lal_zbl_ext.cpp
+++ b/lib/gpu/lal_zbl_ext.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________

-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
  ***************************************************************************/
@@ -27,11 +27,11 @@ static ZBL<PRECISION,ACC_PRECISION> ZBLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, 
+int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
                  double **host_sw2, double **host_sw3, double **host_sw4,
                  double **host_sw5,
-                 double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a, 
+                 double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a,
                  double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner,
-                 const int inum, const int nall, const int max_nbors, 
+                 const int inum, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) {
   ZBLMF.clear();
   gpu_mode=ZBLMF.device->gpu_mode();
@@ -55,7 +55,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
   int init_ok=0;
   if (world_me==0)
-    init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, 
+    init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4,
                        host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze,
                        cut_globalsq, cut_innersq, cut_inner, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen);
@@ -74,13 +74,13 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, 
+      init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4,
                          host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze,
-                         cut_globalsq, cut_innersq, cut_inner, 
+                         cut_globalsq, cut_innersq, cut_inner,
                          inum, nall, 300, maxspecial, cell_size, gpu_split, screen);

     ZBLMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -105,7 +105,7 @@ int ** zbl_gpu_compute_n(const int ago, const int inum_full,
   return ZBLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success);
-} 
+}

 void zbl_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,