diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index c44812cd87..6651a31ad8 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -57,8 +57,6 @@ OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
        $(OBJ_DIR)/cmmc_msm_gpu_memory.o $(OBJ_DIR)/cmmc_msm_gpu.o \
-       $(OBJ_DIR)/charge_gpu_memory2.o \
-       $(OBJ_DIR)/crml_gpu_memory2.o $(OBJ_DIR)/crml_gpu2.o \
        $(CUDPP)
 PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
        $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
@@ -77,8 +75,7 @@ PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
        $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
        $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h \
-       $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h \
-       $(OBJ_DIR)/crml_gpu_kernel2.ptx $(OBJ_DIR)/crml_gpu_ptx2.h
+       $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h
 
 all: $(GPU_LIB) $(EXECS)
 
@@ -301,28 +298,6 @@ $(OBJ_DIR)/cmmc_msm_gpu_memory.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu_me
 $(OBJ_DIR)/cmmc_msm_gpu.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c cmmc_msm_gpu.cpp -I$(OBJ_DIR)
 
-
-
-$(OBJ_DIR)/charge_gpu_memory2.o: $(ALL_H) charge_gpu_memory2.h charge_gpu_memory2.cpp
-	$(CUDR) -o $@ -c charge_gpu_memory2.cpp
-
-$(OBJ_DIR)/crml_gpu_kernel2.ptx: crml_gpu_kernel2.cu pair_gpu_precision.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel2.cu
-
-$(OBJ_DIR)/crml_gpu_ptx2.h: $(OBJ_DIR)/crml_gpu_kernel2.ptx $(OBJ_DIR)/crml_gpu_kernel2.ptx
-	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel2.ptx $(OBJ_DIR)/crml_gpu_ptx2.h
-
-$(OBJ_DIR)/crml_gpu_memory2.o: $(ALL_H) crml_gpu_memory2.h crml_gpu_memory2.cpp $(OBJ_DIR)/crml_gpu_ptx2.h $(OBJ_DIR)/charge_gpu_memory2.o
-	$(CUDR) -o $@ -c crml_gpu_memory2.cpp -I$(OBJ_DIR)
-
-$(OBJ_DIR)/crml_gpu2.o: $(ALL_H) crml_gpu_memory2.h crml_gpu2.cpp charge_gpu_memory2.h
-	$(CUDR) -o $@ -c crml_gpu2.cpp -I$(OBJ_DIR)
-
-
-
-
-
-
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK) 
 
diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp
index 6d3eca0996..531ea4000d 100644
--- a/lib/gpu/atomic_gpu_memory.cpp
+++ b/lib/gpu/atomic_gpu_memory.cpp
@@ -102,7 +102,7 @@ void AtomicGPUMemoryT::clear_atomic() {
   _gpu_overhead*=hd_balancer.timestep();
   _driver_overhead*=hd_balancer.timestep();
   device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
-                       _gpu_overhead,_driver_overhead,screen);
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
 
   if (_compiled) {
     k_pair_fast.clear();
diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp
index 8c8f231bf8..412596f5f2 100644
--- a/lib/gpu/charge_gpu_memory.cpp
+++ b/lib/gpu/charge_gpu_memory.cpp
@@ -57,7 +57,7 @@ int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
   if (host_nlocal>0)
     _gpu_host=1;
 
-  _threads_per_atom=device->threads_per_atom();
+  _threads_per_atom=device->threads_per_charge();
   if (_threads_per_atom>1 && gpu_nbor==false) {
     nbor->packing(true);
     _nbor_data=&(nbor->dev_packed);
@@ -104,7 +104,7 @@ void ChargeGPUMemoryT::clear_atomic() {
   _gpu_overhead*=hd_balancer.timestep();
   _driver_overhead*=hd_balancer.timestep();
   device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
-                       _gpu_overhead,_driver_overhead,screen);
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
 
   if (_compiled) {
     k_pair_fast.clear();
diff --git a/lib/gpu/charge_gpu_memory2.cpp b/lib/gpu/charge_gpu_memory2.cpp
deleted file mode 100644
index 76ef74288b..0000000000
--- a/lib/gpu/charge_gpu_memory2.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Charge/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
- 
-/* ----------------------------------------------------------------------
-   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
-------------------------------------------------------------------------- */
-
-#include "charge_gpu_memory2.h"
-#define ChargeGPUMemory2T ChargeGPUMemory2<numtyp, acctyp>
-
-extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
-
-template <class numtyp, class acctyp>
-ChargeGPUMemory2T::ChargeGPUMemory2() : _compiled(false), _max_bytes(0) {
-  device=&pair_gpu_device;
-  ans=new PairGPUAns<numtyp,acctyp>();
-  nbor=new PairGPUNbor();
-}
-
-template <class numtyp, class acctyp>
-ChargeGPUMemory2T::~ChargeGPUMemory2() {
-  delete ans;
-  delete nbor;
-}
-
-template <class numtyp, class acctyp>
-int ChargeGPUMemory2T::bytes_per_atom_atomic(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
-         nbor->bytes_per_atom(max_nbors);
-}
-
-template <class numtyp, class acctyp>
-int ChargeGPUMemory2T::init_atomic(const int nlocal, const int nall,
-                                  const int max_nbors, const int maxspecial,
-                                  const double cell_size,
-                                  const double gpu_split, FILE *_screen,
-                                  const char *pair_program) {
-  nbor_time_avail=false;
-  screen=_screen;
-
-  bool gpu_nbor=false;
-  if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
-    gpu_nbor=true;
-
-  int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
-  if (host_nlocal>0)
-    _gpu_host=1;
-
-  _threads_per_atom=device->threads_per_atom();
-  if (_threads_per_atom>1 && gpu_nbor==false) {
-    nbor->packing(true);
-    _nbor_data=&(nbor->dev_packed);
-  } else
-    _nbor_data=&(nbor->dev_nbor);
-    
-  int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false);
-  if (success!=0)
-    return success;
-
-  ucl_device=device->gpu;
-  atom=&device->atom;
-
-  _block_size=device->pair_block_size();
-  _block_bio_size=device->block_bio_pair();
-  compile_kernels(*ucl_device,pair_program);
-
-  // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_nbor,gpu_split);
-
-  // Initialize timers for the selected GPU
-  time_pair.init(*ucl_device);
-  time_pair.zero();
-
-  pos_tex.bind_float(atom->dev_x,4);
-  q_tex.bind_float(atom->dev_q,1);
-
-  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
-
-  return success;
-}
-
-template <class numtyp, class acctyp>
-void ChargeGPUMemory2T::estimate_gpu_overhead() {
-  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
-}
-
-template <class numtyp, class acctyp>
-void ChargeGPUMemory2T::clear_atomic() {
-  // Output any timing information
-  acc_timers();
-  double avg_split=hd_balancer.all_avg_split();
-  _gpu_overhead*=hd_balancer.timestep();
-  _driver_overhead*=hd_balancer.timestep();
-  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
-                       _gpu_overhead,_driver_overhead,screen);
-
-  if (_compiled) {
-    k_pair_fast.clear();
-    k_pair.clear();
-    delete pair_program;
-    _compiled=false;
-  }
-
-  time_pair.clear();
-  hd_balancer.clear();
-
-  device->clear();
-}
-
-// ---------------------------------------------------------------------------
-// Copy neighbor list from host
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int * ChargeGPUMemory2T::reset_nbors(const int nall, const int inum, int *ilist,
-                                   int *numj, int **firstneigh, bool &success) {
-  success=true;
-
-  nbor_time_avail=true;
-
-  int mn=nbor->max_nbor_loop(inum,numj,ilist);
-  resize_atom(inum,nall,success);
-  resize_local(inum,mn,success);
-  if (!success)
-    return false;
-
-  nbor->get_host(inum,ilist,numj,firstneigh,block_size());
-
-  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
-  if (bytes>_max_an_bytes)
-    _max_an_bytes=bytes;
-
-  return ilist;
-}
-
-// ---------------------------------------------------------------------------
-// Build neighbor list on device
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-inline void ChargeGPUMemory2T::build_nbor_list(const int inum,
-                                              const int host_inum,
-                                              const int nall, double **host_x,
-                                              int *host_type, double *sublo,
-                                              double *subhi, int *tag, 
-                                              int **nspecial, int **special,
-                                              bool &success) {
-  nbor_time_avail=true;
-
-  success=true;
-  resize_atom(inum,nall,success);
-  resize_local(inum,host_inum,nbor->max_nbors(),success);
-  if (!success)
-    return;
-  atom->cast_copy_x(host_x,host_type);
-
-  int mn;
-  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
-                        nspecial, special, success, mn);
-
-  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
-  if (bytes>_max_an_bytes)
-    _max_an_bytes=bytes;
-}
-
-// ---------------------------------------------------------------------------
-// Copy nbor list from host if necessary and then calculate forces, virials,..
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-void ChargeGPUMemory2T::compute(const int f_ago, const int inum_full,
-                               const int nall, double **host_x, int *host_type,
-                               int *ilist, int *numj, int **firstneigh,
-                               const bool eflag, const bool vflag,
-                               const bool eatom, const bool vatom,
-                               int &host_start, const double cpu_time,
-                               bool &success, double *host_q,
-                               const int nlocal, double *boxlo, double *prd) {
-  acc_timers();
-  if (inum_full==0) {
-    host_start=0;
-    // Make sure textures are correct if realloc by a different hybrid style
-    resize_atom(0,nall,success);
-    zero_timers();
-    return;
-  }
-  
-  int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
-  ans->inum(inum);
-  host_start=inum;
-
-  if (ago==0) {
-    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
-    if (!success)
-      return;
-  }
-
-  atom->cast_x_data(host_x,host_type);
-  atom->cast_q_data(host_q);
-  hd_balancer.start_timer();
-  atom->add_x_data(host_x,host_type);
-  atom->add_q_data();
-
-  device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
-                     boxlo, prd);
-
-  loop(eflag,vflag);
-  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
-  device->add_ans_object(ans);
-  hd_balancer.stop_timer();
-}
-
-// ---------------------------------------------------------------------------
-// Reneighbor on GPU if necessary and then compute forces, virials, energies
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int** ChargeGPUMemory2T::compute(const int ago, const int inum_full,
-                                const int nall, double **host_x, int *host_type,
-                                double *sublo, double *subhi, int *tag,
-                                int **nspecial, int **special, const bool eflag, 
-                                const bool vflag, const bool eatom,
-                                const bool vatom, int &host_start,
-                                int **ilist, int **jnum,
-                                const double cpu_time, bool &success,
-                                double *host_q, double *boxlo, double *prd) {
-  acc_timers();
-  if (inum_full==0) {
-    host_start=0;
-    // Make sure textures are correct if realloc by a different hybrid style
-    resize_atom(0,nall,success);
-    zero_timers();
-    return NULL;
-  }
-  
-  hd_balancer.balance(cpu_time);
-  int inum=hd_balancer.get_gpu_count(ago,inum_full);
-  ans->inum(inum);
-  host_start=inum;
- 
-  // Build neighbor list on GPU if necessary
-  if (ago==0) {
-    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    sublo, subhi, tag, nspecial, special, success);
-    if (!success)
-      return NULL;
-    atom->cast_q_data(host_q);
-    hd_balancer.start_timer();
-  } else {
-    atom->cast_x_data(host_x,host_type);
-    atom->cast_q_data(host_q);
-    hd_balancer.start_timer();
-    atom->add_x_data(host_x,host_type);
-  }
-  atom->add_q_data();
-  *ilist=nbor->host_ilist.begin();
-  *jnum=nbor->host_acc.begin();
-
-  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
-                     boxlo, prd);
-
-  loop(eflag,vflag);
-  ans->copy_answers(eflag,vflag,eatom,vatom);
-  device->add_ans_object(ans);
-  hd_balancer.stop_timer();
-  
-  return nbor->host_jlist.begin()-host_start;
-}
-
-template <class numtyp, class acctyp>
-double ChargeGPUMemory2T::host_memory_usage_atomic() const {
-  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
-         4*sizeof(numtyp)+sizeof(ChargeGPUMemory2<numtyp,acctyp>);
-}
-
-template <class numtyp, class acctyp>
-void ChargeGPUMemory2T::compile_kernels(UCL_Device &dev, const char *pair_str) {
-  if (_compiled)
-    return;
-
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE);
-
-  pair_program=new UCL_Program(dev);
-  pair_program->load_string(pair_str,flags.c_str());
-  k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
-  k_pair.set_function(*pair_program,"kernel_pair");
-  pos_tex.get_texture(*pair_program,"pos_tex");
-  q_tex.get_texture(*pair_program,"q_tex");
-
-  _compiled=true;
-}
-
-template class ChargeGPUMemory2<PRECISION,ACC_PRECISION>;
-
diff --git a/lib/gpu/charge_gpu_memory2.h b/lib/gpu/charge_gpu_memory2.h
deleted file mode 100644
index 0edfc7e497..0000000000
--- a/lib/gpu/charge_gpu_memory2.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Charge/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
- 
-/* ----------------------------------------------------------------------
-   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
-------------------------------------------------------------------------- */
-
-#ifndef CHARGE_GPU_MEMORY2_H
-#define CHARGE_GPU_MEMORY2_H
-
-#include "pair_gpu_device.h"
-#include "pair_gpu_balance.h"
-#include "mpi.h"
-
-#ifdef USE_OPENCL
-#include "geryon/ocl_texture.h"
-#else
-#include "geryon/nvd_texture.h"
-#endif
-
-template <class numtyp, class acctyp>
-class ChargeGPUMemory2 {
- public:
-  ChargeGPUMemory2();
-  virtual ~ChargeGPUMemory2();
-
-  /// Clear any previous data and set up for a new LAMMPS run
-  /** \param max_nbors initial number of rows in the neighbor matrix
-    * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device
-    * 
-    * Returns:
-    * -  0 if successfull
-    * - -1 if fix gpu not found
-    * - -3 if there is an out of memory error
-    * - -4 if the GPU library was not compiled for GPU
-    * - -5 Double precision is not supported on card **/
-  int init_atomic(const int nlocal, const int nall, const int max_nbors,
-                  const int maxspecial, const double cell_size,
-                  const double gpu_split, FILE *screen,
-                  const char *pair_program);
-
-  /// Estimate the overhead for GPU context changes and CPU driver
-  void estimate_gpu_overhead();
-
-  /// Check if there is enough storage for atom arrays and realloc if not
-  /** \param success set to false if insufficient memory **/
-  inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(nall, success)) {
-      pos_tex.bind_float(atom->dev_x,4);
-      q_tex.bind_float(atom->dev_q,1);
-    }
-    ans->resize(inum,success);
-  }
-
-  /// Check if there is enough storage for neighbors and realloc if not
-  /** \param nlocal number of particles whose nbors must be stored on device
-    * \param host_inum number of particles whose nbors need to copied to host
-    * \param current maximum number of neighbors
-    * \note olist_size=total number of local particles **/
-  inline void resize_local(const int inum, const int max_nbors, bool &success) {
-    nbor->resize(inum,max_nbors,success);
-  }
-
-  /// Check if there is enough storage for neighbors and realloc if not
-  /** \param nlocal number of particles whose nbors must be stored on device
-    * \param host_inum number of particles whose nbors need to copied to host
-    * \param current maximum number of neighbors
-    * \note host_inum is 0 if the host is performing neighboring
-    * \note nlocal+host_inum=total number local particles
-    * \note olist_size=0 **/
-  inline void resize_local(const int inum, const int host_inum, 
-                           const int max_nbors, bool &success) {
-    nbor->resize(inum,host_inum,max_nbors,success);
-  }
-
-  /// Clear all host and device data
-  /** \note This is called at the beginning of the init() routine **/
-  void clear_atomic();
-
-  /// Returns memory usage on device per atom
-  int bytes_per_atom_atomic(const int max_nbors) const;
-
-  /// Total host memory used by library for pair style
-  double host_memory_usage_atomic() const;
-
-  /// Accumulate timers
-  inline void acc_timers() {
-    if (nbor_time_avail) {
-      nbor->time_nbor.add_to_total();
-      nbor->time_kernel.add_to_total();
-      nbor_time_avail=false;
-    }
-    time_pair.add_to_total();
-    atom->acc_timers();
-    ans->acc_timers();
-  }
-
-  /// Zero timers
-  inline void zero_timers() {
-    nbor_time_avail=false;
-    time_pair.zero();
-    atom->zero_timers();
-    ans->zero_timers();
-  }
-
-  /// Copy neighbor list from host
-  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
-                    int **firstneigh, bool &success);
-
-  /// Build neighbor list on device
-  void build_nbor_list(const int inum, const int host_inum,
-                       const int nall, double **host_x, int *host_type,
-                       double *sublo, double *subhi, int *tag, int **nspecial,
-                       int **special, bool &success);
-
-  /// Pair loop with host neighboring
-  void compute(const int f_ago, const int inum_full, const int nall,
-               double **host_x, int *host_type, int *ilist, int *numj,
-               int **firstneigh, const bool eflag, const bool vflag,
-               const bool eatom, const bool vatom, int &host_start,
-               const double cpu_time, bool &success, double *charge,
-               const int nlocal, double *boxlo, double *prd);
-
-  /// Pair loop with device neighboring
-  int** compute(const int ago, const int inum_full, const int nall,
-                double **host_x, int *host_type, double *sublo,
-                double *subhi, int *tag, int **nspecial,
-                int **special, const bool eflag, const bool vflag, 
-                const bool eatom, const bool vatom, int &host_start, 
-                int **ilist, int **numj, const double cpu_time, bool &success,
-                double *charge, double *boxlo, double *prd);
-
-  // -------------------------- DEVICE DATA ------------------------- 
-
-  /// Device Properties and Atom and Neighbor storage
-  PairGPUDevice<numtyp,acctyp> *device;
-
-  /// Geryon device
-  UCL_Device *ucl_device;
-
-  /// Device Timers
-  UCL_Timer time_pair;
-
-  /// Host device load balancer
-  PairGPUBalance<numtyp,acctyp> hd_balancer;
-
-  /// LAMMPS pointer for screen output
-  FILE *screen;
-
-  // --------------------------- ATOM DATA --------------------------
-
-  /// Atom Data
-  PairGPUAtom<numtyp,acctyp> *atom;
-
-
-  // ------------------------ FORCE/ENERGY DATA -----------------------
-
-  PairGPUAns<numtyp,acctyp> *ans;
-
-  // --------------------------- NBOR DATA ----------------------------
-
-  /// Neighbor data
-  PairGPUNbor *nbor;
-
-  /// True if we need to accumulate time for neighboring
-  bool nbor_time_avail;
-
-  // ------------------------- DEVICE KERNELS -------------------------
-  UCL_Program *pair_program;
-  UCL_Kernel k_pair_fast, k_pair;
-  inline int block_size() { return _block_size; }
-
-  // --------------------------- TEXTURES -----------------------------
-  UCL_Texture pos_tex;
-  UCL_Texture q_tex;
-
- protected:
-  bool _compiled;
-  int _block_size, _block_bio_size, _threads_per_atom;
-  double  _max_bytes, _max_an_bytes;
-  double _gpu_overhead, _driver_overhead;
-  UCL_D_Vec<int> *_nbor_data;
-
-  void compile_kernels(UCL_Device &dev, const char *pair_string);
-
-  virtual void loop(const bool _eflag, const bool _vflag) = 0;
-};
-
-#endif
-
diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu
index b2b7796025..08cc31ed7f 100644
--- a/lib/gpu/cmm_cut_gpu_kernel.cu
+++ b/lib/gpu/cmm_cut_gpu_kernel.cu
@@ -184,7 +184,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -347,7 +347,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu
index c99a2a68c2..5153cb5016 100644
--- a/lib/gpu/cmmc_long_gpu_kernel.cu
+++ b/lib/gpu/cmmc_long_gpu_kernel.cu
@@ -233,7 +233,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -428,7 +428,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
 
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/cmmc_msm_gpu_kernel.cu b/lib/gpu/cmmc_msm_gpu_kernel.cu
index 5ad765f8c4..d740b57feb 100644
--- a/lib/gpu/cmmc_msm_gpu_kernel.cu
+++ b/lib/gpu/cmmc_msm_gpu_kernel.cu
@@ -252,7 +252,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -472,7 +472,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
 
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/crml_gpu2.cpp b/lib/gpu/crml_gpu2.cpp
deleted file mode 100644
index a3402fe26b..0000000000
--- a/lib/gpu/crml_gpu2.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
- 
-/* ----------------------------------------------------------------------
-   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
-------------------------------------------------------------------------- */
-
-#include <iostream>
-#include <cassert>
-#include <math.h>
-
-#include "crml_gpu_memory2.h"
-
-using namespace std;
-
-static CRML_GPU_Memory2<PRECISION,ACC_PRECISION> CRMLMF2;
-
-// ---------------------------------------------------------------------------
-// Allocate memory on host and device and copy constants to device
-// ---------------------------------------------------------------------------
-int crml_gpu_init2(const int ntypes, double cut_bothsq, double **host_lj1,
-                  double **host_lj2, double **host_lj3, double **host_lj4,
-                  double **offset, double *special_lj, const int inum,
-                  const int nall, const int max_nbors, const int maxspecial,
-                  const double cell_size, int &gpu_mode, FILE *screen,
-                  double host_cut_ljsq, double host_cut_coulsq,
-                  double *host_special_coul, const double qqrd2e,
-                  const double g_ewald, const double cut_lj_innersq,
-                  const double denom_lj, double **epsilon,
-                  double **sigma, const bool mix_arithmetic) {
-  CRMLMF2.clear();
-  gpu_mode=CRMLMF2.device->gpu_mode();
-  double gpu_split=CRMLMF2.device->particle_split();
-  int first_gpu=CRMLMF2.device->first_device();
-  int last_gpu=CRMLMF2.device->last_device();
-  int world_me=CRMLMF2.device->world_me();
-  int gpu_rank=CRMLMF2.device->gpu_rank();
-  int procs_per_gpu=CRMLMF2.device->procs_per_gpu();
-
-  CRMLMF2.device->init_message(screen,"lj/charmm/coul/long",first_gpu,last_gpu);
-
-  bool message=false;
-  if (CRMLMF2.device->replica_me()==0 && screen)
-    message=true;
-
-  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
-    fflush(screen);
-  }
-
-  int init_ok=0;
-  if (world_me==0)
-    CRMLMF2.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
-                offset, special_lj, inum, nall, 300, maxspecial, cell_size,
-                gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
-                host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
-                epsilon,sigma,mix_arithmetic);
-
-  CRMLMF2.device->world_barrier();
-  if (message)
-    fprintf(screen,"Done.\n");
-
-  for (int i=0; i<procs_per_gpu; i++) {
-    if (message) {
-      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
-      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
-                last_gpu,i);
-      fflush(screen);
-    }
-    if (gpu_rank==i && world_me!=0)
-      init_ok=CRMLMF2.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
-                          host_lj4, offset, special_lj, inum, nall, 300,
-                          maxspecial, cell_size, gpu_split, screen,
-                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                          qqrd2e, g_ewald,  cut_lj_innersq, denom_lj, epsilon,
-                          sigma, mix_arithmetic);
-
-    CRMLMF2.device->gpu_barrier();
-    if (message) 
-      fprintf(screen,"Done.\n");
-  }
-  if (message)
-    fprintf(screen,"\n");
-
-  if (init_ok==0)
-    CRMLMF2.estimate_gpu_overhead();
-  return init_ok;
-}
-
-void crml_gpu_clear2() {
-  CRMLMF2.clear();
-}
-
-int** crml_gpu_compute_n2(const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, int *tag, int **nspecial, 
-                         int **special, const bool eflag, const bool vflag,
-                         const bool eatom, const bool vatom, int &host_start,
-                         int **ilist, int **jnum, const double cpu_time,
-                         bool &success, double *host_q, double *boxlo,
-                         double *prd) {
-  return CRMLMF2.compute(ago, inum_full, nall, host_x, host_type, sublo,
-                        subhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, ilist, jnum, cpu_time, success,
-                        host_q, boxlo, prd);
-}  
-			
-void crml_gpu_compute2(const int ago, const int inum_full,
-	 	                  const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-		                  const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success, double *host_q, const int nlocal, 
-                      double *boxlo, double *prd) {
-  CRMLMF2.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
-                 eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
-                 nlocal,boxlo,prd);
-}
-
-double crml_gpu_bytes2() {
-  return CRMLMF2.host_memory_usage();
-}
-
-
diff --git a/lib/gpu/crml_gpu_kernel2.cu b/lib/gpu/crml_gpu_kernel2.cu
deleted file mode 100644
index 293b565d44..0000000000
--- a/lib/gpu/crml_gpu_kernel2.cu
+++ /dev/null
@@ -1,508 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
-------------------------------------------------------------------------- */
-
-#ifndef CRML_GPU_KERNEL
-#define CRML_GPU_KERNEL
-
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
-#define EWALD_F (numtyp)1.12837917
-#define EWALD_P (numtyp)0.3275911
-#define A1 (numtyp)0.254829592
-#define A2 (numtyp)-0.284496736
-#define A3 (numtyp)1.421413741
-#define A4 (numtyp)-1.453152027
-#define A5 (numtyp)1.061405429
-
-#ifdef NV_KERNEL
-
-#include "nv_kernel_def.h"
-texture<float4> pos_tex;
-texture<float> q_tex;
-
-#ifdef _DOUBLE_DOUBLE
-__inline double4 fetch_pos(const int& i, const double4 *pos)
-{
-  return pos[i];
-}
-__inline double fetch_q(const int& i, const double *q)
-{
-  return q[i];
-}
-#else
-__inline float4 fetch_pos(const int& i, const float4 *pos)
-{
-  return tex1Dfetch(pos_tex, i);
-}
-__inline float fetch_q(const int& i, const float *q)
-{
-  return tex1Dfetch(q_tex, i);
-}
-#endif
-
-#else
-
-#pragma OPENCL EXTENSION cl_khr_fp64: enable
-#define GLOBAL_ID_X get_global_id(0)
-#define THREAD_ID_X get_local_id(0)
-#define BLOCK_ID_X get_group_id(0)
-#define BLOCK_SIZE_X get_local_size(0)
-#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
-#define __inline inline
-
-#define fetch_pos(i,y) x_[i]
-#define fetch_q(i,y) q_[i]
-#define BLOCK_BIO_PAIR 64
-
-#endif
-
-#define MAX_BIO_SHARED_TYPES 128
-
-#define SBBITS 30
-#define NEIGHMASK 0x3FFFFFFF
-__inline int sbmask(int j) { return j >> SBBITS & 3; }
-
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          const int lj_types, __global numtyp *sp_lj_in,
-                          __global int *dev_nbor, __global int *dev_packed,
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, const int inum, 
-                          const int nall, const int nbor_pitch,
-                          __global numtyp *q_, const numtyp cut_coulsq,
-                          const numtyp qqrd2e, const numtyp g_ewald,
-                          const numtyp denom_lj, const numtyp cut_bothsq, 
-                          const numtyp cut_ljsq, const numtyp cut_lj_innersq,
-                          const int t_per_atom) {
-  __local numtyp sp_lj[8];
-  sp_lj[0]=sp_lj_in[0];
-  sp_lj[1]=sp_lj_in[1];
-  sp_lj[2]=sp_lj_in[2];
-  sp_lj[3]=sp_lj_in[3];
-  sp_lj[4]=sp_lj_in[4];
-  sp_lj[5]=sp_lj_in[5];
-  sp_lj[6]=sp_lj_in[6];
-  sp_lj[7]=sp_lj_in[7];
-
-  int tid=THREAD_ID_X;
-
-  acctyp energy;
-  acctyp e_coul;
-  acctyp4 f;
-  acctyp virial[6];
-
-  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
-  ii+=tid/t_per_atom;
-  int offset=tid%t_per_atom;
-  
-  energy=(acctyp)0;
-  e_coul=(acctyp)0;
-  f.x=(acctyp)0;
-  f.y=(acctyp)0;
-  f.z=(acctyp)0;
-  for (int o=0; o<6; o++)
-    virial[o]=(acctyp)0;
-
-  if (ii<inum) {
-    __global int *nbor=dev_nbor+ii;
-    int i=*nbor;
-    nbor+=nbor_pitch;
-    int numj=*nbor;
-    nbor+=nbor_pitch;
-
-    int n_stride;
-    __global int *list_end;
-    if (dev_nbor==dev_packed) {
-      list_end=nbor+mul24(numj,nbor_pitch);
-      nbor+=mul24(offset,nbor_pitch);
-      n_stride=mul24(t_per_atom,nbor_pitch);
-    } else {
-      nbor=dev_packed+*nbor;
-      list_end=nbor+numj;
-      n_stride=t_per_atom;
-      nbor+=offset;
-    }
-  
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
-    int itype=ix.w;
-
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-
-      numtyp factor_lj, factor_coul;
-      factor_lj = sp_lj[sbmask(j)];
-      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
-      j &= NEIGHMASK;
-
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
-      int jtype=jx.w;
-
-      // Compute r12
-      numtyp delx = ix.x-jx.x;
-      numtyp dely = ix.y-jx.y;
-      numtyp delz = ix.z-jx.z;
-      numtyp rsq = delx*delx+dely*dely+delz*delz;
-
-      int mtype=itype*lj_types+jtype;
-      if (rsq<cut_bothsq) {
-        numtyp r2inv=(numtyp)1.0/rsq;
-        numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc, switch1;
-
-        if (rsq < cut_ljsq) {
-          r6inv = r2inv*r2inv*r2inv;
-          force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
-          if (rsq > cut_lj_innersq) {
-            switch1 = (cut_ljsq-rsq);
-            numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ 
-                             denom_lj;
-            switch1 *= switch1;
-            switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
-                       denom_lj;
-            switch2 *= r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
-            force_lj = force_lj*switch1+switch2;
-          }
-        } else
-          force_lj = (numtyp)0.0;
-
-        if (rsq < cut_coulsq) {
-          numtyp r = sqrt(rsq);
-          numtyp grij = g_ewald * r;
-          numtyp expm2 = exp(-grij*grij);
-          numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
-          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
-          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
-          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
-
-        force = (force_lj + forcecoul) * r2inv;
-
-        f.x+=delx*force;
-        f.y+=dely*force;
-        f.z+=delz*force;
-
-        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
-          if (rsq < cut_ljsq) {
-            numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
-            if (rsq > cut_lj_innersq)
-              e *= switch1;
-            energy+=factor_lj*e;
-          } 
-        }
-        if (vflag>0) {
-          virial[0] += delx*delx*force;
-          virial[1] += dely*dely*force;
-          virial[2] += delz*delz*force;
-          virial[3] += delx*dely*force;
-          virial[4] += delx*delz*force;
-          virial[5] += dely*delz*force;
-        }
-      }
-
-    } // for nbor
-  } // if ii
-  
-  // Reduce answers
-  if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
-    
-    red_acc[0][tid]=f.x;
-    red_acc[1][tid]=f.y;
-    red_acc[2][tid]=f.z;
-    red_acc[3][tid]=energy;
-    red_acc[4][tid]=e_coul;
-
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-      if (offset < s) {
-        for (int r=0; r<5; r++)
-          red_acc[r][tid] += red_acc[r][tid+s];
-      }
-    }
-    
-    f.x=red_acc[0][tid];
-    f.y=red_acc[1][tid];
-    f.z=red_acc[2][tid];
-    energy=red_acc[3][tid];
-    e_coul=red_acc[4][tid];
-
-    if (vflag>0) {
-      for (int r=0; r<6; r++)
-        red_acc[r][tid]=virial[r];
-
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        if (offset < s) {
-          for (int r=0; r<6; r++)
-            red_acc[r][tid] += red_acc[r][tid+s];
-        }
-      }
-    
-      for (int r=0; r<6; r++)
-        virial[r]=red_acc[r][tid];
-    }
-  }
-
-  // Store answers
-  __global acctyp *ap1=engv+ii;
-  if (ii<inum && offset==0) {
-    if (eflag>0) {
-      *ap1=energy;
-      ap1+=inum;
-      *ap1=e_coul;
-      ap1+=inum;
-    }
-    if (vflag>0) {
-      for (int i=0; i<6; i++) {
-        *ap1=virial[i];
-        ap1+=inum;
-      }
-    }
-    ans[ii]=f;
-  }
-}
-
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor, 
-                               __global int *dev_packed, __global acctyp4 *ans,
-                               __global acctyp *engv, const int eflag,
-                               const int vflag, const int inum, const int nall,
-                               const int nbor_pitch, __global numtyp *q_,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const numtyp denom_lj,
-                               const numtyp cut_bothsq, const numtyp cut_ljsq,
-                               const numtyp cut_lj_innersq,
-                               const int t_per_atom) {
-  int tid=THREAD_ID_X;
-  __local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
-  __local numtyp sp_lj[8];
-
-  acctyp energy;
-  acctyp e_coul;
-  acctyp4 f;
-  acctyp virial[6];
-
-  if (tid<8)
-    sp_lj[tid]=sp_lj_in[tid];
-  ljd[tid]=ljd_in[tid];
-  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
-    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
-
-  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
-  ii+=tid/t_per_atom;
-  int offset=tid%t_per_atom;
-  
-  energy=(acctyp)0;
-  e_coul=(acctyp)0;
-  f.x=(acctyp)0;
-  f.y=(acctyp)0;
-  f.z=(acctyp)0;
-  for (int o=0; o<6; o++)
-    virial[o]=(acctyp)0;
-  __syncthreads();
-  
-  if (ii<inum) {
-    __global int *nbor=dev_nbor+ii;
-    int i=*nbor;
-    nbor+=nbor_pitch;
-    int numj=*nbor;
-    nbor+=nbor_pitch;
-
-    int n_stride;
-    __global int *list_end;
-    if (dev_nbor==dev_packed) {
-      list_end=nbor+mul24(numj,nbor_pitch);
-      nbor+=mul24(offset,nbor_pitch);
-      n_stride=mul24(t_per_atom,nbor_pitch);
-    } else {
-      nbor=dev_packed+*nbor;
-      list_end=nbor+numj;
-      n_stride=t_per_atom;
-      nbor+=offset;
-    }
-  
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
-    int itype=ix.w;
-
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-
-      numtyp factor_lj, factor_coul;
-      factor_lj = sp_lj[sbmask(j)];
-      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
-      j &= NEIGHMASK;
-
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
-      int jtype=jx.w;
-
-      // Compute r12
-      numtyp delx = ix.x-jx.x;
-      numtyp dely = ix.y-jx.y;
-      numtyp delz = ix.z-jx.z;
-      numtyp rsq = delx*delx+dely*dely+delz*delz;
-
-      if (rsq<cut_bothsq) {
-        numtyp r2inv=(numtyp)1.0/rsq;
-        numtyp forcecoul, force_lj, force, prefactor, _erfc, switch1;
-        numtyp lj3, lj4;
-
-        if (rsq < cut_ljsq) {
-          numtyp eps = sqrt(ljd[itype].x*ljd[jtype].x);
-          numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);
-
-          numtyp sig_r_6 = sig6*sig6*r2inv;
-          sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
-          lj4 = (numtyp)4.0*eps*sig_r_6;
-          lj3 = lj4*sig_r_6;
-          force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
-          if (rsq > cut_lj_innersq) {
-            switch1 = (cut_ljsq-rsq);
-            numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ 
-                             denom_lj;
-            switch1 *= switch1;
-            switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
-                       denom_lj;
-            switch2 *= lj3-lj4;
-            force_lj = force_lj*switch1+switch2;
-          }
-        } else
-          force_lj = (numtyp)0.0;
-
-        if (rsq < cut_coulsq) {
-          numtyp r = sqrt(rsq);
-          numtyp grij = g_ewald * r;
-          numtyp expm2 = exp(-grij*grij);
-          numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
-          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
-          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
-          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
-
-        force = (force_lj + forcecoul) * r2inv;
-
-        f.x+=delx*force;
-        f.y+=dely*force;
-        f.z+=delz*force;
-
-        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
-          if (rsq < cut_ljsq) {
-            numtyp e=lj3-lj4;
-            if (rsq > cut_lj_innersq)
-              e *= switch1;
-            energy+=factor_lj*e;
-          }
-        }
-        if (vflag>0) {
-          virial[0] += delx*delx*force;
-          virial[1] += dely*dely*force;
-          virial[2] += delz*delz*force;
-          virial[3] += delx*dely*force;
-          virial[4] += delx*delz*force;
-          virial[5] += dely*delz*force;
-        }
-      }
-
-    } // for nbor
-  } // if ii
-
-  // Reduce answers
-  if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
-    
-    red_acc[0][tid]=f.x;
-    red_acc[1][tid]=f.y;
-    red_acc[2][tid]=f.z;
-    red_acc[3][tid]=energy;
-    red_acc[4][tid]=e_coul;
-
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-      if (offset < s) {
-        for (int r=0; r<5; r++)
-          red_acc[r][tid] += red_acc[r][tid+s];
-      }
-    }
-    
-    f.x=red_acc[0][tid];
-    f.y=red_acc[1][tid];
-    f.z=red_acc[2][tid];
-    energy=red_acc[3][tid];
-    e_coul=red_acc[4][tid];
-
-    if (vflag>0) {
-      for (int r=0; r<6; r++)
-        red_acc[r][tid]=virial[r];
-
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        if (offset < s) {
-          for (int r=0; r<6; r++)
-            red_acc[r][tid] += red_acc[r][tid+s];
-        }
-      }
-    
-      for (int r=0; r<6; r++)
-        virial[r]=red_acc[r][tid];
-    }
-  }
-
-  // Store answers
-  __global acctyp *ap1=engv+ii;
-  if (ii<inum && offset==0) {
-    if (eflag>0) {
-      *ap1=energy;
-      ap1+=inum;
-      *ap1=e_coul;
-      ap1+=inum;
-    }
-    if (vflag>0) {
-      for (int v=0; v<6; v++) {
-        *ap1=virial[v];
-        ap1+=inum;
-      }
-    }
-    ans[ii]=f;
-  }
-}
-
-#endif
diff --git a/lib/gpu/crml_gpu_memory2.cpp b/lib/gpu/crml_gpu_memory2.cpp
deleted file mode 100644
index 894f5bfdc0..0000000000
--- a/lib/gpu/crml_gpu_memory2.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
- 
-/* ----------------------------------------------------------------------
-   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
-------------------------------------------------------------------------- */
-
-#ifdef USE_OPENCL
-#include "crml_gpu_cl2.h"
-#else
-#include "crml_gpu_ptx2.h"
-#endif
-
-#include "crml_gpu_memory2.h"
-#include <cassert>
-#define CRML_GPU_Memory2T CRML_GPU_Memory2<numtyp, acctyp>
-
-extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
-
-template <class numtyp, class acctyp>
-CRML_GPU_Memory2T::CRML_GPU_Memory2() : ChargeGPUMemory2<numtyp,acctyp>(),
-                                    _allocated(false) {
-}
-
-template <class numtyp, class acctyp>
-CRML_GPU_Memory2T::~CRML_GPU_Memory2() {
-  clear();
-}
- 
-template <class numtyp, class acctyp>
-int CRML_GPU_Memory2T::bytes_per_atom(const int max_nbors) const {
-  return this->bytes_per_atom_atomic(max_nbors);
-}
-
-template <class numtyp, class acctyp>
-int CRML_GPU_Memory2T::init(const int ntypes,
-                           double host_cut_bothsq, double **host_lj1, 
-                           double **host_lj2, double **host_lj3, 
-                           double **host_lj4, double **host_offset, 
-                           double *host_special_lj, const int nlocal,
-                           const int nall, const int max_nbors,
-                           const int maxspecial, const double cell_size,
-                           const double gpu_split, FILE *_screen,
-                           double host_cut_ljsq, const double host_cut_coulsq,
-                           double *host_special_coul, const double qqrd2e,
-                           const double g_ewald, const double cut_lj_innersq,
-                           const double denom_lj, double **epsilon,
-                           double **sigma, const bool mix_arithmetic) {
-  int success;
-  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,crml_gpu_kernel2);
-  if (success!=0)
-    return success;
-
-  // If atom type constants fit in shared memory use fast kernel
-  int lj_types=ntypes;
-  shared_types=false;
-  if (this->_block_bio_size>=64 && mix_arithmetic)
-    shared_types=true;
-  _lj_types=lj_types;
-
-  // Allocate a host write buffer for data initialization
-  int h_size=lj_types*lj_types;
-  int max_bio_shared_types=this->device->max_bio_shared_types();
-  if (h_size<max_bio_shared_types)
-    h_size=max_bio_shared_types;
-  UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
-  for (int i=0; i<h_size*32; i++)
-    host_write[i]=0.0;
-
-  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
-  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_lj3,host_lj4);
-
-  ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
-  this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
-
-  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
-  for (int i=0; i<4; i++) {
-    host_write[i]=host_special_lj[i];
-    host_write[i+4]=host_special_coul[i];
-  }
-  ucl_copy(sp_lj,host_write,8,false);
-
-  _cut_bothsq = host_cut_bothsq;
-  _cut_coulsq = host_cut_coulsq;
-  _cut_ljsq = host_cut_ljsq;
-  _cut_lj_innersq = cut_lj_innersq;
-  _qqrd2e=qqrd2e;
-  _g_ewald=g_ewald;
-  _denom_lj=denom_lj;
-
-  _allocated=true;
-  this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
-  return 0;
-}
-
-template <class numtyp, class acctyp>
-void CRML_GPU_Memory2T::clear() {
-  if (!_allocated)
-    return;
-  _allocated=false;
-
-  lj1.clear();
-  ljd.clear();
-  sp_lj.clear();
-  this->clear_atomic();
-}
-
-template <class numtyp, class acctyp>
-double CRML_GPU_Memory2T::host_memory_usage() const {
-  return this->host_memory_usage_atomic()+sizeof(CRML_GPU_Memory2<numtyp,acctyp>);
-}
-
-// ---------------------------------------------------------------------------
-// Calculate energies, forces, and torques
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) {
-  // Compute the block size and grid size to keep all cores busy
-  const int BX=this->_block_bio_size;
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;
-  
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
-
-  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
-  int nbor_pitch=this->nbor->nbor_pitch();
-  this->time_pair.start();
-  if (shared_types) {
-    this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
-                          &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
-                          &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
-                          &_cut_ljsq, &_cut_lj_innersq,
-                          &this->_threads_per_atom);
-  } else {
-    this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
-                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
-                     &this->_threads_per_atom);
-  }
-  this->time_pair.stop();
-}
-
-template class CRML_GPU_Memory2<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/crml_gpu_memory2.h b/lib/gpu/crml_gpu_memory2.h
deleted file mode 100644
index 9ae11ba6ac..0000000000
--- a/lib/gpu/crml_gpu_memory2.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
- 
-/* ----------------------------------------------------------------------
-   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
-------------------------------------------------------------------------- */
-
-#ifndef CRML_GPU_MEMORY2_H
-#define CRML_GPU_MEMORY2_H
-
-#include "charge_gpu_memory2.h"
-
-template <class numtyp, class acctyp>
-class CRML_GPU_Memory2 : public ChargeGPUMemory2<numtyp, acctyp> {
- public:
-  CRML_GPU_Memory2();
-  ~CRML_GPU_Memory2();
-
-  /// Clear any previous data and set up for a new LAMMPS run
-  /** \param max_nbors initial number of rows in the neighbor matrix
-    * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device
-    * 
-    * Returns:
-    * -  0 if successfull
-    * - -1 if fix gpu not found
-    * - -3 if there is an out of memory error
-    * - -4 if the GPU library was not compiled for GPU
-    * - -5 Double precision is not supported on card **/
-  int init(const int ntypes, double host_cut_bothsq,
-           double **host_lj1, double **host_lj2, double **host_lj3,
-           double **host_lj4, double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors, 
-           const int maxspecial, const double cell_size, 
-           const double gpu_split, FILE *screen, double host_cut_ljsq,
-           const double host_cut_coulsq, double *host_special_coul,
-           const double qqrd2e, const double g_ewald,
-           const double cut_lj_innersq, const double denom_lj, 
-           double **epsilon, double **sigma, const bool mix_arithmetic);
-
-  /// Clear all host and device data
-  /** \note This is called at the beginning of the init() routine **/
-  void clear();
-
-  /// Returns memory usage on device per atom
-  int bytes_per_atom(const int max_nbors) const;
-
-  /// Total host memory used by library for pair style
-  double host_memory_usage() const;
-
-  // --------------------------- TYPE DATA --------------------------
-
-  /// x = lj1, y = lj2, z = lj3, w = lj4
-  UCL_D_Vec<numtyp4> lj1;
-  /// x = epsilon, y = sigma
-  UCL_D_Vec<numtyp2> ljd;
-  /// Special LJ values [0-3] and Special Coul values [4-7]
-  UCL_D_Vec<numtyp> sp_lj;
-
-  /// If atom type constants fit in shared memory, use fast kernels
-  bool shared_types;
-
-  /// Number of atom types 
-  int _lj_types;
-
-  numtyp _qqrd2e, _g_ewald, _denom_lj;
-
-  numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
-
- private:
-  bool _allocated;
-  void loop(const bool _eflag, const bool _vflag);
-};
-
-#endif
-
diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu
index d1abd4338b..7bb320f5d0 100644
--- a/lib/gpu/gb_gpu_kernel.cu
+++ b/lib/gpu/gb_gpu_kernel.cu
@@ -368,7 +368,7 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[7][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[7][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/gb_gpu_kernel_lj.cu b/lib/gpu/gb_gpu_kernel_lj.cu
index 91c1355c64..657fc20cd5 100644
--- a/lib/gpu/gb_gpu_kernel_lj.cu
+++ b/lib/gpu/gb_gpu_kernel_lj.cu
@@ -248,7 +248,7 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -387,7 +387,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -535,7 +535,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu
index 39a17d89ed..3fc6a2f308 100644
--- a/lib/gpu/lj96_cut_gpu_kernel.cu
+++ b/lib/gpu/lj96_cut_gpu_kernel.cu
@@ -176,7 +176,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -332,7 +332,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu
index 1eadc7055d..75f36446f7 100644
--- a/lib/gpu/lj_cut_gpu_kernel.cu
+++ b/lib/gpu/lj_cut_gpu_kernel.cu
@@ -175,7 +175,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -330,7 +330,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/lj_expand_gpu_kernel.cu b/lib/gpu/lj_expand_gpu_kernel.cu
index df3f0ff9ae..2d09b4d941 100644
--- a/lib/gpu/lj_expand_gpu_kernel.cu
+++ b/lib/gpu/lj_expand_gpu_kernel.cu
@@ -178,7 +178,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -337,7 +337,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu
index 2b2cccb284..44a607588a 100644
--- a/lib/gpu/ljc_cut_gpu_kernel.cu
+++ b/lib/gpu/ljc_cut_gpu_kernel.cu
@@ -208,7 +208,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -388,7 +388,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
 
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu
index 000ecac616..7be7a86114 100644
--- a/lib/gpu/ljcl_cut_gpu_kernel.cu
+++ b/lib/gpu/ljcl_cut_gpu_kernel.cu
@@ -224,7 +224,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -411,7 +411,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
 
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/morse_gpu_kernel.cu b/lib/gpu/morse_gpu_kernel.cu
index 3c483efac2..0a89aae070 100644
--- a/lib/gpu/morse_gpu_kernel.cu
+++ b/lib/gpu/morse_gpu_kernel.cu
@@ -176,7 +176,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
@@ -333,7 +333,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
   
   // Reduce answers
   if (t_per_atom>1) {
-    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+    __local acctyp red_acc[6][BLOCK_PAIR];
     
     red_acc[0][tid]=f.x;
     red_acc[1][tid]=f.y;
diff --git a/lib/gpu/nv_kernel_def.h b/lib/gpu/nv_kernel_def.h
index f3f6ea4c36..8ac66a2db7 100644
--- a/lib/gpu/nv_kernel_def.h
+++ b/lib/gpu/nv_kernel_def.h
@@ -32,7 +32,8 @@
 
 #if (ARCH < 200)
 
-#define THREADS_PER_ATOM 8
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 8
 #define BLOCK_NBOR_BUILD 64
 #define BLOCK_PAIR 64
 #define BLOCK_BIO_PAIR 64
@@ -40,7 +41,8 @@
 
 #else
 
-#define THREADS_PER_ATOM 8
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 8
 #define BLOCK_NBOR_BUILD 128
 #define BLOCK_PAIR 128
 #define BLOCK_BIO_PAIR 128
diff --git a/lib/gpu/pair_gpu_dev_kernel.cu b/lib/gpu/pair_gpu_dev_kernel.cu
index 99f224b099..73411c99ff 100644
--- a/lib/gpu/pair_gpu_dev_kernel.cu
+++ b/lib/gpu/pair_gpu_dev_kernel.cu
@@ -30,6 +30,11 @@
      Definition:   Number of threads guaranteed to be on the same instruction
   THREADS_PER_ATOM
      Definition:   Default number of threads assigned per atom for pair styles
+     Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE
+  THREADS_PER_CHARGE
+     Definition:   Default number of threads assigned per atom for pair styles
+                   with charge
+     Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE
   PPPM_MAX_SPLINE
      Definition:   Maximum order for splines in PPPM
   PPPM_BLOCK_1D    
@@ -73,6 +78,7 @@
 #define MEM_THREADS 16
 #define WARP_SIZE 1
 #define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
 #define BLOCK_PAIR 64
 #define MAX_SHARED_TYPES 8
 #define BLOCK_NBOR_BUILD 64
@@ -107,6 +113,8 @@ __kernel void kernel_info(__global int *info) {
   info[10]=BLOCK_NBOR_BUILD;
   info[11]=BLOCK_BIO_PAIR;
   info[12]=MAX_BIO_SHARED_TYPES;
+  info[13]=THREADS_PER_CHARGE;
 }
 
 #endif
+
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 056db9fb39..cadc8c59b0 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -52,6 +52,7 @@ int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   omp_set_num_threads(nthreads);
   #endif
   _threads_per_atom=t_per_atom;
+  _threads_per_charge=t_per_atom;
 
   if (_device_init)
     return 0;
@@ -381,7 +382,8 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
                                   PairGPUNbor &nbor, const double avg_split, 
                                   const double max_bytes, 
                                   const double gpu_overhead,
-                                  const double driver_overhead, FILE *screen) {
+                                  const double driver_overhead, 
+                                  const int threads_per_atom, FILE *screen) {
   double single[8], times[8];
 
   single[0]=atom.transfer_time()+ans.transfer_time();
@@ -420,7 +422,7 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
       }
       fprintf(screen,"GPU Overhead:    %.4f s.\n",times[5]/_replica_size);
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
-      fprintf(screen,"Threads / atom:  %d.\n",_threads_per_atom);
+      fprintf(screen,"Threads / atom:  %d.\n",threads_per_atom);
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
       fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
       fprintf(screen,"CPU Idle_Time:   %.4f s.\n",times[7]/_replica_size);
@@ -534,8 +536,8 @@ int PairGPUDeviceT::compile_kernels() {
   k_info.set_function(*dev_program,"kernel_info");
   _compiled=true;
 
-  UCL_H_Vec<int> h_gpu_lib_data(13,*gpu,UCL_NOT_PINNED);
-  UCL_D_Vec<int> d_gpu_lib_data(13,*gpu);
+  UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
+  UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
   k_info.set_size(1,1);
   k_info.run(&d_gpu_lib_data.begin());
   ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
@@ -549,6 +551,8 @@ int PairGPUDeviceT::compile_kernels() {
   _warp_size=h_gpu_lib_data[2];
   if (_threads_per_atom<1)
     _threads_per_atom=h_gpu_lib_data[3];
+  if (_threads_per_charge<1)
+    _threads_per_charge=h_gpu_lib_data[13];
   _pppm_max_spline=h_gpu_lib_data[4];
   _pppm_block=h_gpu_lib_data[5];
   _block_pair=h_gpu_lib_data[6];
@@ -567,6 +571,10 @@ int PairGPUDeviceT::compile_kernels() {
     _threads_per_atom=_warp_size;
   if (_warp_size%_threads_per_atom!=0)
     _threads_per_atom=1;
+  if (_threads_per_charge>_warp_size)
+    _threads_per_charge=_warp_size;
+  if (_warp_size%_threads_per_charge!=0)
+    _threads_per_charge=1;
 
   return flag;    
 }
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index dc2e2e5ea8..8d1cefd823 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -112,7 +112,8 @@ class PairGPUDevice {
   void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans, 
                     PairGPUNbor &nbor, const double avg_split, 
                     const double max_bytes, const double gpu_overhead,
-                    const double driver_overhead, FILE *screen);
+                    const double driver_overhead, 
+                    const int threads_per_atom, FILE *screen);
 
   /// Output a message with timing information
   void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
@@ -203,6 +204,8 @@ class PairGPUDevice {
   inline int num_mem_threads() const { return _num_mem_threads; }
   /// Return the number of threads per atom for pair styles
   inline int threads_per_atom() const { return _threads_per_atom; }
+  /// Return the number of threads per atom for pair styles using charge
+  inline int threads_per_charge() const { return _threads_per_charge; }
   /// Return the min of the pair block size or the device max block size
   inline int pair_block_size() const { return _block_pair; }
   /// Return the maximum number of atom types that can be used with shared mem
@@ -277,7 +280,7 @@ class PairGPUDevice {
   double _particle_split;
   double _cpu_full;
 
-  int _num_mem_threads, _warp_size, _threads_per_atom;
+  int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
   int _pppm_max_spline, _pppm_block;
   int _block_pair, _max_shared_types;
   int _block_cell_2d, _block_cell_id, _block_nbor_build;
diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh
index 0f2bbfc095..83c1f61aa1 100644
--- a/src/GPU/Install.sh
+++ b/src/GPU/Install.sh
@@ -36,8 +36,6 @@ if (test $1 = 1) then
   if (test -e ../pair_lj_charmm_coul_long.cpp) then
     cp pair_lj_charmm_coul_long_gpu.cpp ..
     cp pair_lj_charmm_coul_long_gpu.h ..
-    cp pair_lj_charmm_coul_long_gpu2.cpp ..
-    cp pair_lj_charmm_coul_long_gpu2.h ..
   fi
 
   if (test -e ../pair_cg_cmm.cpp) then
@@ -92,7 +90,6 @@ elif (test $1 = 0) then
   rm ../pair_lj_cut_coul_cut_gpu.cpp
   rm ../pair_lj_cut_coul_long_gpu.cpp
   rm ../pair_lj_charmm_coul_long_gpu.cpp
-  rm ../pair_lj_charmm_coul_long_gpu2.cpp
   rm ../pair_lj_cut_tgpu.cpp
   rm ../pair_cg_cmm_gpu.cpp
   rm ../pair_cg_cmm_coul_long_gpu.cpp
@@ -112,7 +109,6 @@ elif (test $1 = 0) then
   rm ../pair_lj_cut_coul_cut_gpu.h
   rm ../pair_lj_cut_coul_long_gpu.h
   rm ../pair_lj_charmm_coul_long_gpu.h
-  rm ../pair_lj_charmm_coul_long_gpu2.h
   rm ../pair_lj_cut_tgpu.cpp
   rm ../pair_cg_cmm_gpu.h
   rm ../pair_cg_cmm_coul_long_gpu.h
diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp
index 6220346977..cbfb299db0 100644
--- a/src/GPU/pair_gayberne_gpu.cpp
+++ b/src/GPU/pair_gayberne_gpu.cpp
@@ -170,7 +170,7 @@ void PairGayBerneGPU::init_style()
   double cell_size = sqrt(maxcut) + neighbor->skin;
 
   int success = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, 
-			    shape1, well, cutsq, sigma, epsilon, lshape, form,
+			    shape2, well, cutsq, sigma, epsilon, lshape, form,
 			    lj1, lj2, lj3, lj4, offset, force->special_lj, 
 			    atom->nlocal, atom->nlocal+atom->nghost, 300, 
 			    cell_size, gpu_mode, screen);
diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu2.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu2.cpp
deleted file mode 100644
index fd5c55e7dd..0000000000
--- a/src/GPU/pair_lj_charmm_coul_long_gpu2.cpp
+++ /dev/null
@@ -1,344 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-   
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the GNU General Public License.
-   
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Mike Brown (SNL)
-------------------------------------------------------------------------- */
-
-#include "lmptype.h"
-#include "math.h"
-#include "stdio.h"
-#include "stdlib.h"
-#include "pair_lj_charmm_coul_long_gpu2.h"
-#include "atom.h"
-#include "atom_vec.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "integrate.h"
-#include "memory.h"
-#include "error.h"
-#include "neigh_request.h"
-#include "universe.h"
-#include "update.h"
-#include "domain.h"
-#include "string.h"
-#include "kspace.h"
-#include "gpu_extra.h"
-
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-
-#define EWALD_F   1.12837917
-#define EWALD_P   0.3275911
-#define A1        0.254829592
-#define A2       -0.284496736
-#define A3        1.421413741
-#define A4       -1.453152027
-#define A5        1.061405429
-
-// External functions from cuda library for atom decomposition
-
-int crml_gpu_init2(const int ntypes, double cut_bothsq, double **host_lj1,
-		  double **host_lj2, double **host_lj3, double **host_lj4, 
-		  double **offset, double *special_lj, const int nlocal, 
-		  const int nall, const int max_nbors, const int maxspecial,
-		  const double cell_size, int &gpu_mode, FILE *screen,
-		  double host_cut_ljsq, double host_cut_coulsq,
-		  double *host_special_coul, const double qqrd2e,
-		  const double g_ewald, const double cut_lj_innersq,
-		  const double denom_lj, double **epsilon, double **sigma,
-		  const bool mix_arithmetic);
-void crml_gpu_clear2();
-int ** crml_gpu_compute_n2(const int ago, const int inum,
-			  const int nall, double **host_x, int *host_type, 
-			  double *sublo, double *subhi, int *tag,
-			  int **nspecial, int **special, const bool eflag,
-			  const bool vflag, const bool eatom, const bool vatom,
-			  int &host_start, int **ilist, int **jnum,
-			  const double cpu_time, bool &success, double *host_q,
-			  double *boxlo, double *prd);
-void crml_gpu_compute2(const int ago, const int inum, const int nall,
-		      double **host_x, int *host_type, int *ilist, int *numj,
-		      int **firstneigh, const bool eflag, const bool vflag,
-		      const bool eatom, const bool vatom, int &host_start,
-		      const double cpu_time, bool &success, double *host_q,
-		      const int nlocal, double *boxlo, double *prd);
-double crml_gpu_bytes2();
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJCharmmCoulLongGPU2::PairLJCharmmCoulLongGPU2(LAMMPS *lmp) : 
-  PairLJCharmmCoulLong(lmp), gpu_mode(GPU_PAIR)
-{
-  respa_enable = 0;
-  cpu_time = 0.0;
-}
-
-/* ----------------------------------------------------------------------
-   free all arrays
-------------------------------------------------------------------------- */
-
-PairLJCharmmCoulLongGPU2::~PairLJCharmmCoulLongGPU2()
-{
-  crml_gpu_clear2();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulLongGPU2::compute(int eflag, int vflag)
-{
-  if (eflag || vflag) ev_setup(eflag,vflag);
-  else evflag = vflag_fdotr = 0;
-  
-  int nall = atom->nlocal + atom->nghost;
-  int inum, host_start;
-  
-  bool success = true;
-  int *ilist, *numneigh, **firstneigh;    
-  if (gpu_mode == GPU_NEIGH) {
-    inum = atom->nlocal;
-    firstneigh = crml_gpu_compute_n2(neighbor->ago, inum, nall, atom->x,
-				    atom->type, domain->sublo, domain->subhi,
-				    atom->tag, atom->nspecial, atom->special,
-				    eflag, vflag, eflag_atom, vflag_atom,
-				    host_start, &ilist, &numneigh, cpu_time,
-				    success, atom->q, domain->boxlo,
-				    domain->prd);
-  } else {
-    inum = list->inum;
-    ilist = list->ilist;
-    numneigh = list->numneigh;
-    firstneigh = list->firstneigh;
-    crml_gpu_compute2(neighbor->ago, inum, nall, atom->x, atom->type,
-		     ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
-		     vflag_atom, host_start, cpu_time, success, atom->q,
-		     atom->nlocal, domain->boxlo, domain->prd);
-  }
-  if (!success)
-    error->one("Out of memory on GPGPU");
-
-  if (host_start<inum) {
-    cpu_time = MPI_Wtime();
-    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
-    cpu_time = MPI_Wtime() - cpu_time;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   init specific to this pair style
-------------------------------------------------------------------------- */
-
-void PairLJCharmmCoulLongGPU2::init_style()
-{
-  cut_respa = NULL;
-
-  if (!atom->q_flag)
-    error->all("Pair style lj/charmm/coul/long requires atom attribute q");
-  if (force->newton_pair) 
-    error->all("Cannot use newton pair with GPU CHARMM pair style");
-
-  // Repeat cutsq calculation because done after call to init_style
-  double cut;
-  for (int i = 1; i <= atom->ntypes; i++) {
-    for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
-        cut = init_one(i,j);
-    }
-  }
-
-  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
-  cut_ljsq = cut_lj * cut_lj;
-  cut_coulsq = cut_coul * cut_coul;
-  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
-
-  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) * 
-    (cut_ljsq-cut_lj_innersq);
-
-  double cell_size = sqrt(cut_bothsq) + neighbor->skin;
-
-  // insure use of KSpace long-range solver, set g_ewald
-
-  if (force->kspace == NULL)
-    error->all("Pair style is incompatible with KSpace style");
-  g_ewald = force->kspace->g_ewald;
-
-  // setup force tables
-
-  if (ncoultablebits) init_tables();
-
-  int maxspecial=0;
-  if (atom->molecular)
-    maxspecial=atom->maxspecial;
-
-  bool arithmetic = true;
-  for (int i = 1; i < atom->ntypes + 1; i++)
-    for (int j = i + 1; j < atom->ntypes + 1; j++) {
-      if (epsilon[i][j] != sqrt(epsilon[i][i] * epsilon[j][j]))
-	arithmetic = false;
-      if (sigma[i][j] != 0.5 * (sigma[i][i] + sigma[j][j]))
-	arithmetic = false;
-    }
-
-  int success = crml_gpu_init2(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4,
-			      offset, force->special_lj, atom->nlocal,
-			      atom->nlocal+atom->nghost, 300, maxspecial,
-			      cell_size, gpu_mode, screen, cut_ljsq, 
-			      cut_coulsq, force->special_coul, force->qqrd2e,
-			      g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma,
-			      arithmetic);
-  GPU_EXTRA::check_flag(success,error,world);
-
-  if (gpu_mode != GPU_NEIGH) {
-    int irequest = neighbor->request(this);
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->full = 1;
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-double PairLJCharmmCoulLongGPU2::memory_usage()
-{
-  double bytes = Pair::memory_usage();
-  return bytes + crml_gpu_bytes2();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulLongGPU2::cpu_compute(int start, int inum, int eflag,
-					  int vflag, int *ilist,
-					  int *numneigh, int **firstneigh)
-{
-  int i,j,ii,jj,jnum,itype,jtype,itable;
-  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
-  double fraction,table;
-  double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
-  double grij,expm2,prefactor,t,erfc;
-  double philj,switch1,switch2;
-  int *jlist;
-  double rsq;
-
-  evdwl = ecoul = 0.0;
-
-  double **x = atom->x;
-  double **f = atom->f;
-  double *q = atom->q;
-  int *type = atom->type;
-  int nlocal = atom->nlocal;
-  double *special_coul = force->special_coul;
-  double *special_lj = force->special_lj;
-  double qqrd2e = force->qqrd2e;
-
-  // loop over neighbors of my atoms
-
-  for (ii = start; ii < inum; ii++) {
-    i = ilist[ii];
-    qtmp = q[i];
-    xtmp = x[i][0];
-    ytmp = x[i][1];
-    ztmp = x[i][2];
-    itype = type[i];
-    jlist = firstneigh[i];
-    jnum = numneigh[i];
-
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      factor_lj = special_lj[sbmask(j)];
-      factor_coul = special_coul[sbmask(j)];
-      j &= NEIGHMASK;
-
-      delx = xtmp - x[j][0];
-      dely = ytmp - x[j][1];
-      delz = ztmp - x[j][2];
-      rsq = delx*delx + dely*dely + delz*delz;
-
-      if (rsq < cut_bothsq) {
-	r2inv = 1.0/rsq;
-
-	if (rsq < cut_coulsq) {
-	  if (!ncoultablebits || rsq <= tabinnersq) {
-	    r = sqrt(rsq);
-	    grij = g_ewald * r;
-	    expm2 = exp(-grij*grij);
-	    t = 1.0 / (1.0 + EWALD_P*grij);
-	    erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-	    prefactor = qqrd2e * qtmp*q[j]/r;
-	    forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
-	    if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
-	  } else {
-	    union_int_float_t rsq_lookup;
-	    rsq_lookup.f = rsq;
-	    itable = rsq_lookup.i & ncoulmask;
-	    itable >>= ncoulshiftbits;
-	    fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
-	    table = ftable[itable] + fraction*dftable[itable];
-	    forcecoul = qtmp*q[j] * table;
-	    if (factor_coul < 1.0) {
-	      table = ctable[itable] + fraction*dctable[itable];
-	      prefactor = qtmp*q[j] * table;
-	      forcecoul -= (1.0-factor_coul)*prefactor;
-	    }
-	  }
-	} else forcecoul = 0.0;
-
-	if (rsq < cut_ljsq) {
-	  r6inv = r2inv*r2inv*r2inv;
-	  jtype = type[j];
-	  forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
-	  if (rsq > cut_lj_innersq) {
-	    switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
-	      (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj;
-	    switch2 = 12.0*rsq * (cut_ljsq-rsq) * 
-	      (rsq-cut_lj_innersq) / denom_lj;
-	    philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]);
-	    forcelj = forcelj*switch1 + philj*switch2;
-	  }
-	} else forcelj = 0.0;
-
-	fpair = (forcecoul + factor_lj*forcelj) * r2inv;
-
-	f[i][0] += delx*fpair;
-	f[i][1] += dely*fpair;
-	f[i][2] += delz*fpair;
-
-	if (eflag) {
-	  if (rsq < cut_coulsq) {
-	    if (!ncoultablebits || rsq <= tabinnersq)
-	      ecoul = prefactor*erfc;
-	    else {
-	      table = etable[itable] + fraction*detable[itable];
-	      ecoul = qtmp*q[j] * table;
-	    }
-	    if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
-	  } else ecoul = 0.0;
-
-	  if (rsq < cut_ljsq) {
-	    evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]);
-	    if (rsq > cut_lj_innersq) {
-	      switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
-		(cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj;
-	      evdwl *= switch1;
-	    }
-	    evdwl *= factor_lj;
-	  } else evdwl = 0.0;
-	}
-
-	if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
-      }
-    }
-  }
-}
diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu2.h b/src/GPU/pair_lj_charmm_coul_long_gpu2.h
deleted file mode 100644
index b835e08da7..0000000000
--- a/src/GPU/pair_lj_charmm_coul_long_gpu2.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/charmm/coul/long/gpu2,PairLJCharmmCoulLongGPU2)
-
-#else
-
-#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_GPU2_H
-#define LMP_PAIR_LJ_CHARMM_COUL_LONG_GPU2_H
-
-#include "pair_lj_charmm_coul_long.h"
-
-namespace LAMMPS_NS {
-
-class PairLJCharmmCoulLongGPU2 : public PairLJCharmmCoulLong {
- public:
-  PairLJCharmmCoulLongGPU2(LAMMPS *lmp);
-  ~PairLJCharmmCoulLongGPU2();
-  void cpu_compute(int, int, int, int, int *, int *, int **);
-  void compute(int, int);
-  void init_style();
-  double memory_usage();
-
- enum { GPU_PAIR, GPU_NEIGH };
-
- private:
-  int gpu_mode;
-  double cpu_time;
-  int *gpulist;
-};
-
-}
-#endif
-#endif
-