diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index 6427b4cf15..e1505522b7 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -57,6 +57,8 @@ OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
 	$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
 	$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
 	$(OBJ_DIR)/cmmc_msm_gpu_memory.o $(OBJ_DIR)/cmmc_msm_gpu.o \
+	$(OBJ_DIR)/charge_gpu_memory2.o \
+	$(OBJ_DIR)/crml_gpu_memory2.o $(OBJ_DIR)/crml_gpu2.o \
 	$(CUDPP)
 PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
 	$(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
@@ -75,7 +77,8 @@ PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
 	$(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
 	$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
 	$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h \
-	$(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h
+	$(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h \
+	$(OBJ_DIR)/crml_gpu_kernel2.ptx $(OBJ_DIR)/crml_gpu_ptx2.h

 all: $(GPU_LIB) $(EXECS)

@@ -298,6 +301,22 @@ $(OBJ_DIR)/cmmc_msm_gpu_memory.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu_me
 $(OBJ_DIR)/cmmc_msm_gpu.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c cmmc_msm_gpu.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/charge_gpu_memory2.o: $(ALL_H) charge_gpu_memory2.h charge_gpu_memory2.cpp
+	$(CUDR) -o $@ -c charge_gpu_memory2.cpp
+
+$(OBJ_DIR)/crml_gpu_kernel2.ptx: crml_gpu_kernel2.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel2.cu
+
+$(OBJ_DIR)/crml_gpu_ptx2.h: $(OBJ_DIR)/crml_gpu_kernel2.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel2.ptx $(OBJ_DIR)/crml_gpu_ptx2.h
+
+$(OBJ_DIR)/crml_gpu_memory2.o: $(ALL_H) crml_gpu_memory2.h crml_gpu_memory2.cpp $(OBJ_DIR)/crml_gpu_ptx2.h $(OBJ_DIR)/charge_gpu_memory2.o
+	$(CUDR) -o $@ -c crml_gpu_memory2.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/crml_gpu2.o: $(ALL_H) crml_gpu_memory2.h crml_gpu2.cpp charge_gpu_memory2.h
+	$(CUDR) -o $@ -c crml_gpu2.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK)
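[Editor's note] The five rules added above follow the library's existing kernel-embedding chain. As a generic sketch (the pattern rules are assumed equivalent to the explicit rules in the patch; they are not themselves part of it):

    # .cu --(nvcc --ptx)--> .ptx --(file_to_cstr.sh)--> *_ptx2.h, which wraps
    # the PTX in a C string (crml_gpu_kernel2) that crml_gpu_memory2.cpp
    # includes and hands to init_atomic() for run-time JIT loading.
    $(OBJ_DIR)/%_kernel2.ptx: %_kernel2.cu pair_gpu_precision.h
    	$(CUDA) --ptx -DNV_KERNEL -o $@ $<
    $(OBJ_DIR)/%_ptx2.h: $(OBJ_DIR)/%_kernel2.ptx
    	$(BSH) ./geryon/file_to_cstr.sh $< $@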
diff --git a/lib/gpu/charge_gpu_memory2.cpp b/lib/gpu/charge_gpu_memory2.cpp
new file mode 100644
index 0000000000..077213cec5
--- /dev/null
+++ b/lib/gpu/charge_gpu_memory2.cpp
@@ -0,0 +1,296 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#include "charge_gpu_memory2.h"
+#define ChargeGPUMemory2T ChargeGPUMemory2<numtyp, acctyp>
+
+extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
+
+template <class numtyp, class acctyp>
+ChargeGPUMemory2T::ChargeGPUMemory2() : _compiled(false), _max_bytes(0) {
+  device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor();
+}
+
+template <class numtyp, class acctyp>
+ChargeGPUMemory2T::~ChargeGPUMemory2() {
+  delete ans;
+  delete nbor;
+}
+
+template <class numtyp, class acctyp>
+int ChargeGPUMemory2T::bytes_per_atom_atomic(const int max_nbors) const {
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int ChargeGPUMemory2T::init_atomic(const int nlocal, const int nall,
+                                   const int max_nbors, const int maxspecial,
+                                   const double cell_size,
+                                   const double gpu_split, FILE *_screen,
+                                   const char *pair_program) {
+  nbor_time_avail=false;
+  screen=_screen;
+
+  bool gpu_nbor=false;
+  if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=true;
+
+  int _gpu_host=0;
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
+  if (host_nlocal>0)
+    _gpu_host=1;
+
+  int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;
+
+  ucl_device=device->gpu;
+  atom=&device->atom;
+
+  _block_size=device->pair_block_size();
+  compile_kernels(*ucl_device,pair_program);
+
+  // Initialize host-device load balancer
+  hd_balancer.init(device,gpu_nbor,gpu_split);
+
+  // Initialize timers for the selected GPU
+  time_pair.init(*ucl_device);
+  time_pair.zero();
+
+  pos_tex.bind_float(atom->dev_x,4);
+  q_tex.bind_float(atom->dev_q,1);
+
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+
+  return success;
+}
+
+template <class numtyp, class acctyp>
+void ChargeGPUMemory2T::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
+}
+
+template <class numtyp, class acctyp>
+void ChargeGPUMemory2T::clear_atomic() {
+  // Output any timing information
+  acc_timers();
+  double avg_split=hd_balancer.all_avg_split();
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,screen);
+
+  if (_compiled) {
+    k_pair_fast.clear();
+    k_pair.clear();
+    delete pair_program;
+    _compiled=false;
+  }
+
+  time_pair.clear();
+  hd_balancer.clear();
+
+  device->clear();
+}
+
+// ---------------------------------------------------------------------------
+// Copy neighbor list from host
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int * ChargeGPUMemory2T::reset_nbors(const int nall, const int inum, int *ilist,
+                                     int *numj, int **firstneigh, bool &success) {
+  success=true;
+
+  nbor_time_avail=true;
+
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
+  resize_atom(inum,nall,success);
+  resize_local(inum,mn,success);
+  if (!success)
+    return NULL;
+
+  nbor->get_host(inum,ilist,numj,firstneigh,block_size());
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+
+  return ilist;
+}
+
+// ---------------------------------------------------------------------------
+// Build neighbor list on device
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+inline void ChargeGPUMemory2T::build_nbor_list(const int inum,
+                                               const int host_inum,
+                                               const int nall, double **host_x,
+                                               int *host_type, double *sublo,
+                                               double *subhi, int *tag,
+                                               int **nspecial, int **special,
+                                               bool &success) {
+  nbor_time_avail=true;
+
+  success=true;
+  resize_atom(inum,nall,success);
+  resize_local(inum,host_inum,nbor->max_nbors(),success);
+  if (!success)
+    return;
+  atom->cast_copy_x(host_x,host_type);
+
+  int mn;
+  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
+                        nspecial, special, success, mn);
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+}
+
+// ---------------------------------------------------------------------------
+// Copy nbor list from host if necessary and then calculate forces, virials,..
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void ChargeGPUMemory2T::compute(const int f_ago, const int inum_full,
+                                const int nall, double **host_x, int *host_type,
+                                int *ilist, int *numj, int **firstneigh,
+                                const bool eflag, const bool vflag,
+                                const bool eatom, const bool vatom,
+                                int &host_start, const double cpu_time,
+                                bool &success, double *host_q,
+                                const int nlocal, double *boxlo, double *prd) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return;
+  }
+
+  int ago=hd_balancer.ago_first(f_ago);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
+  host_start=inum;
+
+  if (ago==0) {
+    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
+    if (!success)
+      return;
+  }
+
+  atom->cast_x_data(host_x,host_type);
+  atom->cast_q_data(host_q);
+  hd_balancer.start_timer();
+  atom->add_x_data(host_x,host_type);
+  atom->add_q_data();
+
+  device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+}
+
+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary and then compute forces, virials, energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** ChargeGPUMemory2T::compute(const int ago, const int inum_full,
+                                 const int nall, double **host_x, int *host_type,
+                                 double *sublo, double *subhi, int *tag,
+                                 int **nspecial, int **special, const bool eflag,
+                                 const bool vflag, const bool eatom,
+                                 const bool vatom, int &host_start,
+                                 int **ilist, int **jnum,
+                                 const double cpu_time, bool &success,
+                                 double *host_q, double *boxlo, double *prd) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return NULL;
+  }
+
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
+  host_start=inum;
+
+  // Build neighbor list on GPU if necessary
+  if (ago==0) {
+    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                    sublo, subhi, tag, nspecial, special, success);
+    if (!success)
+      return NULL;
+    atom->cast_q_data(host_q);
+    hd_balancer.start_timer();
+  } else {
+    atom->cast_x_data(host_x,host_type);
+    atom->cast_q_data(host_q);
+    hd_balancer.start_timer();
+    atom->add_x_data(host_x,host_type);
+  }
+  atom->add_q_data();
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+
+  return nbor->host_jlist.begin()-host_start;
+}
+
+template <class numtyp, class acctyp>
+double ChargeGPUMemory2T::host_memory_usage_atomic() const {
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(ChargeGPUMemory2<numtyp,acctyp>);
+}
+
+template <class numtyp, class acctyp>
+void ChargeGPUMemory2T::compile_kernels(UCL_Device &dev, const char *pair_str) {
+  if (_compiled)
+    return;
+
+  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
+                    std::string(OCL_PRECISION_COMPILE);
+
+  pair_program=new UCL_Program(dev);
+  pair_program->load_string(pair_str,flags.c_str());
+  k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
+  k_pair.set_function(*pair_program,"kernel_pair");
+  pos_tex.get_texture(*pair_program,"pos_tex");
+  q_tex.get_texture(*pair_program,"q_tex");
+
+  _compiled=true;
+}
+
+template class ChargeGPUMemory2<PRECISION,ACC_PRECISION>;
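[Editor's note] The explicit instantiation that closes charge_gpu_memory2.cpp builds exactly one <numtyp,acctyp> combination per library build; PRECISION and ACC_PRECISION are assumed here to come from pair_gpu_precision.h, as for the rest of lib/gpu. A minimal, self-contained illustration of the pattern (hypothetical names, not part of the patch):

    #include <cstdio>

    template <class numtyp, class acctyp>
    struct Accum {
      acctyp sum;                               // accumulate in the wider type
      void add(numtyp x) { sum += (acctyp)x; }
    };

    // one concrete instantiation per build, e.g. a -D_SINGLE_DOUBLE library
    template struct Accum<float,double>;

    int main() {
      Accum<float,double> a = {0.0};
      for (int i = 0; i < 10; i++) a.add(0.1f);  // float data, double sum
      printf("%.17f\n", a.sum);                  // ~1.0000000149011612
      return 0;
    }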
diff --git a/lib/gpu/charge_gpu_memory2.h b/lib/gpu/charge_gpu_memory2.h
new file mode 100644
index 0000000000..e2accf4a2d
--- /dev/null
+++ b/lib/gpu/charge_gpu_memory2.h
@@ -0,0 +1,200 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef CHARGE_GPU_MEMORY2_H
+#define CHARGE_GPU_MEMORY2_H
+
+#include "pair_gpu_device.h"
+#include "pair_gpu_balance.h"
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+#include "geryon/ocl_texture.h"
+#else
+#include "geryon/nvd_texture.h"
+#endif
+
+template <class numtyp, class acctyp>
+class ChargeGPUMemory2 {
+ public:
+  ChargeGPUMemory2();
+  virtual ~ChargeGPUMemory2();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on the card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const char *pair_program);
+
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
+  /// Check if there is enough storage for atom arrays and realloc if not
+  /** \param success set to false if insufficient memory **/
+  inline void resize_atom(const int inum, const int nall, bool &success) {
+    if (atom->resize(nall, success)) {
+      pos_tex.bind_float(atom->dev_x,4);
+      q_tex.bind_float(atom->dev_q,1);
+    }
+    ans->resize(inum,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param max_nbors current maximum number of neighbors
+    * \note olist_size=total number of local particles **/
+  inline void resize_local(const int inum, const int max_nbors, bool &success) {
+    nbor->resize(inum,max_nbors,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param host_inum number of particles whose nbors need to be copied to host
+    * \param max_nbors current maximum number of neighbors
+    * \note host_inum is 0 if the host is performing neighboring
+    * \note inum+host_inum=total number of local particles
+    * \note olist_size=0 **/
+  inline void resize_local(const int inum, const int host_inum,
+                           const int max_nbors, bool &success) {
+    nbor->resize(inum,host_inum,max_nbors,success);
+  }
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear_atomic();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom_atomic(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage_atomic() const;
+
+  /// Accumulate timers
+  inline void acc_timers() {
+    if (nbor_time_avail) {
+      nbor->time_nbor.add_to_total();
+      nbor->time_kernel.add_to_total();
+      nbor_time_avail=false;
+    }
+    time_pair.add_to_total();
+    atom->acc_timers();
+    ans->acc_timers();
+  }
+
+  /// Zero timers
+  inline void zero_timers() {
+    nbor_time_avail=false;
+    time_pair.zero();
+    atom->zero_timers();
+    ans->zero_timers();
+  }
+
+  /// Copy neighbor list from host
+  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
+                    int **firstneigh, bool &success);
+
+  /// Build neighbor list on device
+  void build_nbor_list(const int inum, const int host_inum,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, int *tag, int **nspecial,
+                       int **special, bool &success);
+
+  /// Pair loop with host neighboring
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               const int nlocal, double *boxlo, double *prd);
+
+  /// Pair loop with device neighboring
+  int** compute(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
+                int **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double *boxlo, double *prd);
+
+  // -------------------------- DEVICE DATA -------------------------
+
+  /// Device Properties and Atom and Neighbor storage
+  PairGPUDevice<numtyp,acctyp> *device;
+
+  /// Geryon device
+  UCL_Device *ucl_device;
+
+  /// Device Timers
+  UCL_Timer time_pair;
+
+  /// Host device load balancer
+  PairGPUBalance<numtyp,acctyp> hd_balancer;
+
+  /// LAMMPS pointer for screen output
+  FILE *screen;
+
+  // --------------------------- ATOM DATA --------------------------
+
+  /// Atom Data
+  PairGPUAtom<numtyp,acctyp> *atom;
+
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
+
+  // --------------------------- NBOR DATA ----------------------------
+
+  /// Neighbor data
+  PairGPUNbor *nbor;
+
+  /// True if we need to accumulate time for neighboring
+  bool nbor_time_avail;
+
+  // ------------------------- DEVICE KERNELS -------------------------
+  UCL_Program *pair_program;
+  UCL_Kernel k_pair_fast, k_pair;
+  inline int block_size() { return _block_size; }
+
+  // --------------------------- TEXTURES -----------------------------
+  UCL_Texture pos_tex;
+  UCL_Texture q_tex;
+
+ protected:
+  bool _compiled;
+  int _block_size;
+  double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+
+  void compile_kernels(UCL_Device &dev, const char *pair_string);
+
+  virtual void loop(const bool _eflag, const bool _vflag) = 0;
+};
+
+#endif
diff --git a/lib/gpu/crml_gpu2.cpp b/lib/gpu/crml_gpu2.cpp
new file mode 100644
index 0000000000..a3402fe26b
--- /dev/null
+++ b/lib/gpu/crml_gpu2.cpp
@@ -0,0 +1,136 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#include <iostream>
+#include <cassert>
+#include <math.h>
+
+#include "crml_gpu_memory2.h"
+
+using namespace std;
+
+static CRML_GPU_Memory2<PRECISION,ACC_PRECISION> CRMLMF2;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+int crml_gpu_init2(const int ntypes, double cut_bothsq, double **host_lj1,
+                   double **host_lj2, double **host_lj3, double **host_lj4,
+                   double **offset, double *special_lj, const int inum,
+                   const int nall, const int max_nbors, const int maxspecial,
+                   const double cell_size, int &gpu_mode, FILE *screen,
+                   double host_cut_ljsq, double host_cut_coulsq,
+                   double *host_special_coul, const double qqrd2e,
+                   const double g_ewald, const double cut_lj_innersq,
+                   const double denom_lj, double **epsilon,
+                   double **sigma, const bool mix_arithmetic) {
+  CRMLMF2.clear();
+  gpu_mode=CRMLMF2.device->gpu_mode();
+  double gpu_split=CRMLMF2.device->particle_split();
+  int first_gpu=CRMLMF2.device->first_device();
+  int last_gpu=CRMLMF2.device->last_device();
+  int world_me=CRMLMF2.device->world_me();
+  int gpu_rank=CRMLMF2.device->gpu_rank();
+  int procs_per_gpu=CRMLMF2.device->procs_per_gpu();
+
+  CRMLMF2.device->init_message(screen,"lj/charmm/coul/long",first_gpu,last_gpu);
+
+  bool message=false;
+  if (CRMLMF2.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CRMLMF2.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                         host_lj4, offset, special_lj, inum, nall, 300,
+                         maxspecial, cell_size, gpu_split, screen,
+                         host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                         qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
+                         epsilon, sigma, mix_arithmetic);
+
+  CRMLMF2.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CRMLMF2.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                           host_lj4, offset, special_lj, inum, nall, 300,
+                           maxspecial, cell_size, gpu_split, screen,
+                           host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                           qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
+                           epsilon, sigma, mix_arithmetic);
+
+    CRMLMF2.device->gpu_barrier();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    CRMLMF2.estimate_gpu_overhead();
+  return init_ok;
+}
+
+void crml_gpu_clear2() {
+  CRMLMF2.clear();
+}
+
+int** crml_gpu_compute_n2(const int ago, const int inum_full,
+                          const int nall, double **host_x, int *host_type,
+                          double *sublo, double *subhi, int *tag, int **nspecial,
+                          int **special, const bool eflag, const bool vflag,
+                          const bool eatom, const bool vatom, int &host_start,
+                          int **ilist, int **jnum, const double cpu_time,
+                          bool &success, double *host_q, double *boxlo,
+                          double *prd) {
+  return CRMLMF2.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                         subhi, tag, nspecial, special, eflag, vflag, eatom,
+                         vatom, host_start, ilist, jnum, cpu_time, success,
+                         host_q, boxlo, prd);
+}
+
+void crml_gpu_compute2(const int ago, const int inum_full,
+                       const int nall, double **host_x, int *host_type,
+                       int *ilist, int *numj, int **firstneigh,
+                       const bool eflag, const bool vflag, const bool eatom,
+                       const bool vatom, int &host_start, const double cpu_time,
+                       bool &success, double *host_q, const int nlocal,
+                       double *boxlo, double *prd) {
+  CRMLMF2.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
+                  eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
+                  nlocal,boxlo,prd);
+}
+
+double crml_gpu_bytes2() {
+  return CRMLMF2.host_memory_usage();
+}
diff --git a/lib/gpu/crml_gpu_kernel2.cu b/lib/gpu/crml_gpu_kernel2.cu
new file mode 100644
index 0000000000..6ba6eaedca
--- /dev/null
+++ b/lib/gpu/crml_gpu_kernel2.cu
@@ -0,0 +1,388 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef CRML_GPU_KERNEL2
+#define CRML_GPU_KERNEL2
+
+#define MAX_BIO_SHARED_TYPES 128
+
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#define EWALD_F (numtyp)1.12837917
+#define EWALD_P (numtyp)0.3275911
+#define A1 (numtyp)0.254829592
+#define A2 (numtyp)-0.284496736
+#define A3 (numtyp)1.421413741
+#define A4 (numtyp)-1.453152027
+#define A5 (numtyp)1.061405429
+
+#ifdef NV_KERNEL
+
+#include "geryon/ucl_nv_kernel.h"
+texture<float4> pos_tex;
+texture<float> q_tex;
+
+#ifdef _DOUBLE_DOUBLE
+__inline double4 fetch_pos(const int& i, const double4 *pos)
+{
+  return pos[i];
+}
+__inline double fetch_q(const int& i, const double *q)
+{
+  return q[i];
+}
+#else
+__inline float4 fetch_pos(const int& i, const float4 *pos)
+{
+  return tex1Dfetch(pos_tex, i);
+}
+__inline float fetch_q(const int& i, const float *q)
+{
+  return tex1Dfetch(q_tex, i);
+}
+#endif
+
+#else
+
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+#define __inline inline
+
+#define fetch_pos(i,y) x_[i]
+#define fetch_q(i,y) q_[i]
+
+#endif
+
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
+__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+                          const int lj_types,
+                          __global numtyp *sp_lj_in, __global int *dev_nbor,
+                          __global acctyp4 *ans, __global acctyp *engv,
+                          const int eflag, const int vflag, const int inum,
+                          const int nall, const int nbor_pitch,
+                          __global numtyp *q_, const numtyp cut_coulsq,
+                          const numtyp qqrd2e, const numtyp g_ewald,
+                          const numtyp denom_lj, const numtyp cut_bothsq,
+                          const numtyp cut_ljsq, const numtyp cut_lj_innersq) {
+
+  // ii indexes the two interacting particles in gi
+  int ii=GLOBAL_ID_X;
+  __local numtyp sp_lj[8];
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  sp_lj[4]=sp_lj_in[4];
+  sp_lj[5]=sp_lj_in[5];
+  sp_lj[6]=sp_lj_in[6];
+  sp_lj[7]=sp_lj_in[7];
+
+  if (ii<inum) {
+    acctyp energy=(acctyp)0;
+    acctyp e_coul=(acctyp)0;
+    acctyp4 f;
+    f.x=(acctyp)0;
+    f.y=(acctyp)0;
+    f.z=(acctyp)0;
+    acctyp virial[6];
+    for (int i=0; i<6; i++)
+      virial[i]=(acctyp)0;
+
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp qtmp=fetch_q(i,q_);
+    int itype=ix.w;
+
+    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+      int j=*nbor;
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cut_bothsq) {
+        numtyp r2inv=(numtyp)1.0/rsq;
+        numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
+        numtyp switch1;
+        int mtype=itype*lj_types+jtype;
+
+        if (rsq < cut_ljsq) {
+          r6inv = r2inv*r2inv*r2inv;
+          force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
+          if (rsq > cut_lj_innersq) {
+            switch1 = (cut_ljsq-rsq);
+            numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
+                             denom_lj;
+            switch1 *= switch1;
+            switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
+                       denom_lj;
+            switch2 *= r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
+            force_lj = force_lj*switch1+switch2;
+          }
+        } else
+          force_lj = (numtyp)0.0;
+
+        if (rsq < cut_coulsq) {
+          numtyp r = sqrt(rsq);
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = exp(-grij*grij);
+          numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
+        } else {
+          forcecoul = (numtyp)0.0;
+          prefactor = (numtyp)0.0;
+        }
+
+        force = (force_lj + forcecoul) * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_ljsq) {
+            numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
+            if (rsq > cut_lj_innersq)
+              e *= switch1;
+            energy+=factor_lj*e;
+          }
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+
+    // Store answers
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+      *ap1=e_coul;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
+                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global acctyp4 *ans, __global acctyp *engv,
+                               const int eflag, const int vflag, const int inum,
+                               const int nall, const int nbor_pitch,
+                               __global numtyp *q_, const numtyp cut_coulsq,
+                               const numtyp qqrd2e, const numtyp g_ewald,
+                               const numtyp denom_lj, const numtyp cut_bothsq,
+                               const numtyp cut_ljsq,
+                               const numtyp cut_lj_innersq) {
+  // ii indexes the two interacting particles in gi
+  int ii=THREAD_ID_X;
+  __local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
+  __local numtyp sp_lj[8];
+  if (ii<8)
+    sp_lj[ii]=sp_lj_in[ii];
+  ljd[ii]=ljd_in[ii];
+  ljd[ii+64]=ljd_in[ii+64];
+
+  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
+  __syncthreads();
+
+  if (ii<inum) {
+    acctyp energy=(acctyp)0;
+    acctyp e_coul=(acctyp)0;
+    acctyp4 f;
+    f.x=(acctyp)0;
+    f.y=(acctyp)0;
+    f.z=(acctyp)0;
+    acctyp virial[6];
+    for (int i=0; i<6; i++)
+      virial[i]=(acctyp)0;
+
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp qtmp=fetch_q(i,q_);
+    int itype=ix.w;
+
+    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+      int j=*nbor;
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cut_bothsq) {
+        numtyp r2inv=(numtyp)1.0/rsq;
+        numtyp forcecoul, force_lj, force, prefactor, _erfc;
+        numtyp switch1, lj3, lj4;
+
+        if (rsq < cut_ljsq) {
+          // mix on the fly: geometric for epsilon, arithmetic for sigma
+          numtyp eps = sqrt(ljd[itype].x*ljd[jtype].x);
+          numtyp sig_r_2 = ljd[itype].y+ljd[jtype].y;
+          sig_r_2 = (numtyp)0.25*sig_r_2*sig_r_2*r2inv;
+          numtyp sig_r_6 = sig_r_2*sig_r_2*sig_r_2;
+          lj4 = (numtyp)4.0*eps*sig_r_6;
+          lj3 = lj4*sig_r_6;
+          force_lj = factor_lj*((numtyp)12.0*lj3-(numtyp)6.0*lj4);
+          if (rsq > cut_lj_innersq) {
+            switch1 = (cut_ljsq-rsq);
+            numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
+                             denom_lj;
+            switch1 *= switch1;
+            switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
+                       denom_lj;
+            switch2 *= lj3-lj4;
+            force_lj = force_lj*switch1+switch2;
+          }
+        } else
+          force_lj = (numtyp)0.0;
+
+        if (rsq < cut_coulsq) {
+          numtyp r = sqrt(rsq);
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = exp(-grij*grij);
+          numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
+        } else {
+          forcecoul = (numtyp)0.0;
+          prefactor = (numtyp)0.0;
+        }
+
+        force = (force_lj + forcecoul) * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_ljsq) {
+            numtyp e=lj3-lj4;
+            if (rsq > cut_lj_innersq)
+              e *= switch1;
+            energy+=factor_lj*e;
+          }
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+
+    // Store answers
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+      *ap1=e_coul;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+#endif
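[Editor's note] A standalone sanity check (editor's sketch, not part of the patch) that the polynomial in both kernels above is the Abramowitz & Stegun 7.1.26 erfc approximation; EWALD_P and A1..A5 are copied from the #defines in crml_gpu_kernel2.cu, and agreement with erfc() is about 1e-7, adequate for single precision:

    #include <cmath>
    #include <cstdio>

    int main() {
      const double P = 0.3275911;
      const double a1 = 0.254829592, a2 = -0.284496736, a3 = 1.421413741,
                   a4 = -1.453152027, a5 = 1.061405429;
      for (double x = 0.1; x < 3.0; x += 0.5) {
        double t = 1.0/(1.0 + P*x);
        double approx = t*(a1 + t*(a2 + t*(a3 + t*(a4 + t*a5))))*exp(-x*x);
        printf("x=%.1f  poly=%.8f  erfc=%.8f\n", x, approx, erfc(x));
      }
      return 0;
    }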
diff --git a/lib/gpu/crml_gpu_memory2.cpp b/lib/gpu/crml_gpu_memory2.cpp
new file mode 100644
index 0000000000..f2f9281ebc
--- /dev/null
+++ b/lib/gpu/crml_gpu_memory2.cpp
@@ -0,0 +1,172 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifdef USE_OPENCL
+#include "crml_gpu_cl2.h"
+#else
+#include "crml_gpu_ptx2.h"
+#endif
+
+#include "crml_gpu_memory2.h"
+#include <cassert>
+#define CRML_GPU_Memory2T CRML_GPU_Memory2<numtyp, acctyp>
+
+extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
+
+template <class numtyp, class acctyp>
+CRML_GPU_Memory2T::CRML_GPU_Memory2() : ChargeGPUMemory2<numtyp,acctyp>(),
+                                        _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+CRML_GPU_Memory2T::~CRML_GPU_Memory2() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int CRML_GPU_Memory2T::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int CRML_GPU_Memory2T::init(const int ntypes,
+                            double host_cut_bothsq, double **host_lj1,
+                            double **host_lj2, double **host_lj3,
+                            double **host_lj4, double **host_offset,
+                            double *host_special_lj, const int nlocal,
+                            const int nall, const int max_nbors,
+                            const int maxspecial, const double cell_size,
+                            const double gpu_split, FILE *_screen,
+                            double host_cut_ljsq, const double host_cut_coulsq,
+                            double *host_special_coul, const double qqrd2e,
+                            const double g_ewald, const double cut_lj_innersq,
+                            const double denom_lj, double **epsilon,
+                            double **sigma, const bool mix_arithmetic) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,crml_gpu_kernel2);
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  if (this->_block_size>=64 && mix_arithmetic)
+    shared_types=true;
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  int h_size=lj_types*lj_types;
+  if (h_size<MAX_BIO_SHARED_TYPES)
+    h_size=MAX_BIO_SHARED_TYPES;
+  UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
+                               UCL_WRITE_OPTIMIZED);
+  for (int i=0; i<h_size*32; i++)
+    host_write[i]=0.0;
+
+  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_lj3,host_lj4);
+
+  ljd.alloc(MAX_BIO_SHARED_TYPES,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
+
+  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i];
+    host_write[i+4]=host_special_coul[i];
+  }
+  ucl_copy(sp_lj,host_write,8,false);
+
+  _cut_bothsq = host_cut_bothsq;
+  _cut_coulsq = host_cut_coulsq;
+  _cut_ljsq = host_cut_ljsq;
+  _cut_lj_innersq = cut_lj_innersq;
+  _qqrd2e=qqrd2e;
+  _g_ewald=g_ewald;
+  _denom_lj=denom_lj;
+
+  _allocated=true;
+  this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void CRML_GPU_Memory2T::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  lj1.clear();
+  ljd.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double CRML_GPU_Memory2T::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+
+         sizeof(CRML_GPU_Memory2<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int eflag, vflag;
+  if (_eflag)
+    eflag=1;
+  else
+    eflag=0;
+
+  if (_vflag)
+    vflag=1;
+  else
+    vflag=0;
+
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
+
+  int ainum=this->ans->inum();
+  int anall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  if (shared_types) {
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
+                          &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
+                          &ainum, &anall, &nbor_pitch,
+                          &this->atom->dev_q.begin(), &_cut_coulsq,
+                          &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
+                          &_cut_ljsq, &_cut_lj_innersq);
+  } else {
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
+                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
+                     &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
+                     &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
+                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
+  }
+  this->time_pair.stop();
+}
+
+template class CRML_GPU_Memory2<PRECISION,ACC_PRECISION>;
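[Editor's note] loop() sizes the 1-D launch grid so that GX*BX >= inum; threads past inum are masked off by the ii<inum guard in the kernels. A standalone sketch of the arithmetic (not part of the patch):

    #include <cmath>
    #include <cstdio>

    static int grid_size(int inum, int block) {
      return (int)ceil((double)inum/block);  // same expression as loop()
    }

    int main() {
      printf("%d\n", grid_size(10000, 64));  // 157 blocks; last one part-idle
      printf("%d\n", grid_size(64, 64));     // exactly 1 block
      return 0;
    }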
diff --git a/lib/gpu/crml_gpu_memory2.h b/lib/gpu/crml_gpu_memory2.h
new file mode 100644
index 0000000000..9ae11ba6ac
--- /dev/null
+++ b/lib/gpu/crml_gpu_memory2.h
@@ -0,0 +1,86 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef CRML_GPU_MEMORY2_H
+#define CRML_GPU_MEMORY2_H
+
+#include "charge_gpu_memory2.h"
+
+template <class numtyp, class acctyp>
+class CRML_GPU_Memory2 : public ChargeGPUMemory2<numtyp, acctyp> {
+ public:
+  CRML_GPU_Memory2();
+  ~CRML_GPU_Memory2();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on the card **/
+  int init(const int ntypes, double host_cut_bothsq,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald,
+           const double cut_lj_innersq, const double denom_lj,
+           double **epsilon, double **sigma, const bool mix_arithmetic);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// x = lj1, y = lj2, z = lj3, w = lj4
+  UCL_D_Vec<numtyp4> lj1;
+  /// x = epsilon, y = sigma
+  UCL_D_Vec<numtyp2> ljd;
+  /// Special LJ values [0-3] and Special Coul values [4-7]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _qqrd2e, _g_ewald, _denom_lj;
+
+  numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
+
+ private:
+  bool _allocated;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+#endif
diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh
index 83c1f61aa1..0f2bbfc095 100644
--- a/src/GPU/Install.sh
+++ b/src/GPU/Install.sh
@@ -36,6 +36,8 @@ if (test $1 = 1) then
   if (test -e ../pair_lj_charmm_coul_long.cpp) then
     cp pair_lj_charmm_coul_long_gpu.cpp ..
    cp pair_lj_charmm_coul_long_gpu.h ..
+    cp pair_lj_charmm_coul_long_gpu2.cpp ..
+    cp pair_lj_charmm_coul_long_gpu2.h ..
fi if (test -e ../pair_cg_cmm.cpp) then @@ -90,6 +92,7 @@ elif (test $1 = 0) then rm ../pair_lj_cut_coul_cut_gpu.cpp rm ../pair_lj_cut_coul_long_gpu.cpp rm ../pair_lj_charmm_coul_long_gpu.cpp + rm ../pair_lj_charmm_coul_long_gpu2.cpp rm ../pair_lj_cut_tgpu.cpp rm ../pair_cg_cmm_gpu.cpp rm ../pair_cg_cmm_coul_long_gpu.cpp @@ -109,6 +112,7 @@ elif (test $1 = 0) then rm ../pair_lj_cut_coul_cut_gpu.h rm ../pair_lj_cut_coul_long_gpu.h rm ../pair_lj_charmm_coul_long_gpu.h + rm ../pair_lj_charmm_coul_long_gpu2.h rm ../pair_lj_cut_tgpu.cpp rm ../pair_cg_cmm_gpu.h rm ../pair_cg_cmm_coul_long_gpu.h diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu2.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu2.cpp new file mode 100644 index 0000000000..fd5c55e7dd --- /dev/null +++ b/src/GPU/pair_lj_charmm_coul_long_gpu2.cpp @@ -0,0 +1,344 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Mike Brown (SNL) +------------------------------------------------------------------------- */ + +#include "lmptype.h" +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "pair_lj_charmm_coul_long_gpu2.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "integrate.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "universe.h" +#include "update.h" +#include "domain.h" +#include "string.h" +#include "kspace.h" +#include "gpu_extra.h" + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b))

#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429

// External functions from cuda library for atom decomposition

int crml_gpu_init2(const int ntypes, double cut_bothsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int nlocal,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   double host_cut_ljsq, double host_cut_coulsq,
                   double *host_special_coul, const double qqrd2e,
                   const double g_ewald, const double cut_lj_innersq,
                   const double denom_lj, double **epsilon, double **sigma,
                   const bool mix_arithmetic);
void crml_gpu_clear2();
int ** crml_gpu_compute_n2(const int ago, const int inum,
                           const int nall, double **host_x, int *host_type,
                           double *sublo, double *subhi, int *tag,
                           int **nspecial, int **special, const bool eflag,
                           const bool vflag, const bool eatom, const bool vatom,
                           int &host_start, int **ilist, int **jnum,
                           const double cpu_time, bool &success, double *host_q,
                           double *boxlo, double *prd);
void crml_gpu_compute2(const int ago, const int inum, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
                       const double cpu_time, bool &success, double *host_q,
                       const int nlocal, double *boxlo, double *prd);
double crml_gpu_bytes2();

using namespace LAMMPS_NS;

/* ---------------------------------------------------------------------- */

PairLJCharmmCoulLongGPU2::PairLJCharmmCoulLongGPU2(LAMMPS *lmp) :
  PairLJCharmmCoulLong(lmp), gpu_mode(GPU_PAIR)
{
  respa_enable = 0;
  cpu_time = 0.0;
}

/* ----------------------------------------------------------------------
   free all arrays
------------------------------------------------------------------------- */

PairLJCharmmCoulLongGPU2::~PairLJCharmmCoulLongGPU2()
{
  crml_gpu_clear2();
}

/* ---------------------------------------------------------------------- */

void PairLJCharmmCoulLongGPU2::compute(int eflag, int vflag)
{
  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = vflag_fdotr = 0;

  int nall = atom->nlocal + atom->nghost;
  int inum, host_start;

  bool success = true;
  int *ilist, *numneigh, **firstneigh;
  if (gpu_mode == GPU_NEIGH) {
    inum = atom->nlocal;
    firstneigh = crml_gpu_compute_n2(neighbor->ago, inum, nall, atom->x,
                                     atom->type, domain->sublo, domain->subhi,
                                     atom->tag, atom->nspecial, atom->special,
                                     eflag, vflag, eflag_atom, vflag_atom,
                                     host_start, &ilist, &numneigh, cpu_time,
                                     success, atom->q, domain->boxlo,
                                     domain->prd);
  } else {
    inum = list->inum;
    ilist = list->ilist;
    numneigh = list->numneigh;
    firstneigh = list->firstneigh;
    crml_gpu_compute2(neighbor->ago, inum, nall, atom->x, atom->type,
                      ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
                      vflag_atom, host_start, cpu_time, success, atom->q,
                      atom->nlocal, domain->boxlo, domain->prd);
  }
  if (!success)
    error->one("Out of memory on GPGPU");

  if (host_start<inum) {
    cpu_time = MPI_Wtime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
    cpu_time = MPI_Wtime() - cpu_time;
  }
}

/* ---------------------------------------------------------------------- */

void PairLJCharmmCoulLongGPU2::init_style()
{
  cut_respa = NULL;

  if (!atom->q_flag)
    error->all("Pair style lj/charmm/coul/long/gpu2 requires atom attribute q");
  if (force->newton_pair)
    error->all("Cannot use newton pair with GPU CHARMM pair style");

  // Repeat cutsq calculation because done after call to init_style
  double cut;
  for (int i = 1; i <= atom->ntypes; i++) {
    for 
(int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) + cut = init_one(i,j); + } + } + + cut_lj_innersq = cut_lj_inner * cut_lj_inner; + cut_ljsq = cut_lj * cut_lj; + cut_coulsq = cut_coul * cut_coul; + cut_bothsq = MAX(cut_ljsq,cut_coulsq); + + denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) * + (cut_ljsq-cut_lj_innersq); + + double cell_size = sqrt(cut_bothsq) + neighbor->skin; + + // insure use of KSpace long-range solver, set g_ewald + + if (force->kspace == NULL) + error->all("Pair style is incompatible with KSpace style"); + g_ewald = force->kspace->g_ewald; + + // setup force tables + + if (ncoultablebits) init_tables(); + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + + bool arithmetic = true; + for (int i = 1; i < atom->ntypes + 1; i++) + for (int j = i + 1; j < atom->ntypes + 1; j++) { + if (epsilon[i][j] != sqrt(epsilon[i][i] * epsilon[j][j])) + arithmetic = false; + if (sigma[i][j] != 0.5 * (sigma[i][i] + sigma[j][j])) + arithmetic = false; + } + + int success = crml_gpu_init2(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq, force->special_coul, force->qqrd2e, + g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, + arithmetic); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairLJCharmmCoulLongGPU2::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + crml_gpu_bytes2(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCharmmCoulLongGPU2::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, + int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype,itable; + double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; + double fraction,table; + double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; + double grij,expm2,prefactor,t,erfc; + double philj,switch1,switch2; + int *jlist; + double rsq; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + int nlocal = atom->nlocal; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + + if (rsq < cut_bothsq) { + r2inv = 1.0/rsq; + + if (rsq < cut_coulsq) { + if (!ncoultablebits || rsq <= tabinnersq) { + r = sqrt(rsq); + grij = g_ewald * r; + expm2 = exp(-grij*grij); + t = 1.0 / (1.0 + EWALD_P*grij); + erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*q[j]/r; + forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + } else { + 
union_int_float_t rsq_lookup; + rsq_lookup.f = rsq; + itable = rsq_lookup.i & ncoulmask; + itable >>= ncoulshiftbits; + fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; + table = ftable[itable] + fraction*dftable[itable]; + forcecoul = qtmp*q[j] * table; + if (factor_coul < 1.0) { + table = ctable[itable] + fraction*dctable[itable]; + prefactor = qtmp*q[j] * table; + forcecoul -= (1.0-factor_coul)*prefactor; + } + } + } else forcecoul = 0.0; + + if (rsq < cut_ljsq) { + r6inv = r2inv*r2inv*r2inv; + jtype = type[j]; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * + (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; + switch2 = 12.0*rsq * (cut_ljsq-rsq) * + (rsq-cut_lj_innersq) / denom_lj; + philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]); + forcelj = forcelj*switch1 + philj*switch2; + } + } else forcelj = 0.0; + + fpair = (forcecoul + factor_lj*forcelj) * r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + if (!ncoultablebits || rsq <= tabinnersq) + ecoul = prefactor*erfc; + else { + table = etable[itable] + fraction*detable[itable]; + ecoul = qtmp*q[j] * table; + } + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } else ecoul = 0.0; + + if (rsq < cut_ljsq) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * + (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; + evdwl *= switch1; + } + evdwl *= factor_lj; + } else evdwl = 0.0; + } + + if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu2.h b/src/GPU/pair_lj_charmm_coul_long_gpu2.h new file mode 100644 index 0000000000..b835e08da7 --- /dev/null +++ b/src/GPU/pair_lj_charmm_coul_long_gpu2.h @@ -0,0 +1,47 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/charmm/coul/long/gpu2,PairLJCharmmCoulLongGPU2) + +#else + +#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_GPU2_H +#define LMP_PAIR_LJ_CHARMM_COUL_LONG_GPU2_H + +#include "pair_lj_charmm_coul_long.h" + +namespace LAMMPS_NS { + +class PairLJCharmmCoulLongGPU2 : public PairLJCharmmCoulLong { + public: + PairLJCharmmCoulLongGPU2(LAMMPS *lmp); + ~PairLJCharmmCoulLongGPU2(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_PAIR, GPU_NEIGH }; + + private: + int gpu_mode; + double cpu_time; + int *gpulist; +}; + +} +#endif +#endif +
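[Editor's note] Both the kernels (sbmask()/NEIGHMASK in crml_gpu_kernel2.cu) and cpu_compute() above decode special-bond information packed into the top two bits of each neighbor index. A standalone demo of the decoding (hypothetical values, not part of the patch):

    #include <cstdio>

    static const int SBBITS = 30;
    static const int NEIGHMASK = 0x3FFFFFFF;
    static int sbmask(int j) { return j >> SBBITS & 3; }

    int main() {
      // hypothetical packed entry: special-bond class 1, neighbor index 12345
      int j = (1 << SBBITS) | 12345;
      printf("special class %d, neighbor index %d\n",
             sbmask(j), j & NEIGHMASK);
      // in the pair loops: factor_lj = special_lj[sbmask(j)]; j &= NEIGHMASK;
      return 0;
    }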