diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index cc74d2ebd6..3c859ffdd4 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -30,8 +30,9 @@ UCL_H  = $(wildcard ./geryon/ucl*.h)
 NVC_H  = $(wildcard ./geryon/nvc*.h) $(UCL_H)
 NVD_H  = $(wildcard ./geryon/nvd*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor_shared.h pair_gpu_nbor.h \
-          pair_gpu_precision.h pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h
 
 ALL_H = $(NVD_H) $(PAIR_H)
 
@@ -39,8 +40,9 @@ EXECS = $(BIN_DIR)/nvc_get_devices
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
         $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
         $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_device.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
@@ -95,6 +97,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
 
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 829add7350..e488a56bc0 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -23,14 +23,16 @@ OCL_LIB = $(LIB_DIR)/libgpu.a
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 OCL_H  = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor_shared.h pair_gpu_nbor.h \
-          pair_gpu_precision.h pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h
 
 ALL_H = $(OCL_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/ocl_get_devices
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
-       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_device.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
@@ -46,7 +48,7 @@ KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
        $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
        $(OBJ_DIR)/crml_gpu_cl.h \
        $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
- 
+
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
 
 all: $(OCL_LIB) $(EXECS)
 
diff --git a/lib/gpu/pair_gpu_ans.cpp b/lib/gpu/pair_gpu_ans.cpp
new file mode 100644
index 0000000000..e6982e6eba
--- /dev/null
+++ b/lib/gpu/pair_gpu_ans.cpp
@@ -0,0 +1,409 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#include "pair_gpu_ans.h"
+
+#define PairGPUAnsT PairGPUAns<numtyp,acctyp>
+
+template <class numtyp, class acctyp>
+PairGPUAnsT::PairGPUAns() : _allocated(false),_eflag(false),_vflag(false),
+                            _inum(0),_ilist(NULL),_newton(false) {
+}
+
+template <class numtyp, class acctyp>
+int PairGPUAnsT::bytes_per_atom() const {
+  int bytes=11*sizeof(acctyp);
+  if (_rot)
+    bytes+=4*sizeof(acctyp);
+  if (_charge)
+    bytes+=sizeof(acctyp);
+  return bytes;
+}
+
+template <class numtyp, class acctyp>
+bool PairGPUAnsT::alloc(const int inum) {
+  _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
+
+  bool success=true;
+
+  int ans_elements=4;
+  if (_rot)
+    ans_elements+=4;
+
+  // Ignore host/device transfers?
+  bool cpuview=false;
+  if (dev->device_type()==UCL_CPU)
+    cpuview=true;
+
+  // -------------------------- Host allocations
+  success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
+  success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
+
+  // --------------------------- Device allocations
+  if (cpuview) {
+    dev_engv.view(host_engv);
+    dev_ans.view(host_ans);
+  } else {
+    success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
+                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
+    success=success && (dev_ans.alloc(ans_elements*_max_local,
+                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
+  }
+  _gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
+
+  _allocated=true;
+  return success;
+}
+
+template <class numtyp, class acctyp>
+bool PairGPUAnsT::init(const int inum, const bool charge, const bool rot,
+                       UCL_Device &devi) {
+  clear();
+
+  bool success=true;
+  _charge=charge;
+  _rot=rot;
+  _other=_charge || _rot;
+  dev=&devi;
+
+  _e_fields=1;
+  if (_charge)
+    _e_fields++;
+  _ev_fields=6+_e_fields;
+
+  // Initialize atom and nbor data
+  int ef_inum=inum;
+  if (ef_inum==0)
+    ef_inum=1000;
+
+  // Initialize timers for the selected device
+  time_answer.init(*dev);
+  time_answer.zero();
+  _time_cast=0.0;
+
+  return success && alloc(ef_inum);
+}
+
+template <class numtyp, class acctyp>
+bool PairGPUAnsT::add_fields(const bool charge, const bool rot) {
+  bool realloc=false;
+  if (charge && _charge==false) {
+    _charge=true;
+    _e_fields++;
+    _ev_fields++;
+    realloc=true;
+  }
+  if (rot && _rot==false) {
+    _rot=true;
+    realloc=true;
+  }
+  if (realloc) {
+    _other=_charge || _rot;
+    int inum=_max_local;
+    clear_resize();
+    return alloc(inum);
+  }
+  return true;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::clear_resize() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  dev_ans.clear();
+  dev_engv.clear();
+  host_ans.clear();
+  host_engv.clear();
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::clear() {
+  _gpu_bytes=0;
+  if (!_allocated)
+    return;
+
+  time_answer.clear();
+  clear_resize();
+  _inum=0;
+  _eflag=false;
+  _vflag=false;
+}
+
+template <class numtyp, class acctyp>
+double PairGPUAnsT::host_memory_usage() const {
+  int atom_bytes=4;
+  if (_charge)
+    atom_bytes+=1;
+  if (_rot)
+    atom_bytes+=4;
+  int ans_bytes=atom_bytes+_ev_fields;
+  return ans_bytes*(_max_local)*sizeof(acctyp)+
+         sizeof(PairGPUAns<numtyp,acctyp>);
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom) {
+  time_answer.start();
+  _eflag=eflag;
+  _vflag=vflag;
+  _ef_atom=ef_atom;
+  _vf_atom=vf_atom;
+
+  int csize=_ev_fields;
+  if (!eflag)
+    csize-=_e_fields;
+  if (!vflag)
+    csize-=6;
+
+  if (csize>0)
+    ucl_copy(host_engv,dev_engv,_inum*csize,true);
+  if (_rot)
+    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
+  else
+    ucl_copy(host_ans,dev_ans,_inum*4,true);
+  time_answer.stop();
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom,
+                               int *ilist) {
+  _ilist=ilist;
+  copy_answers(eflag,vflag,ef_atom,vf_atom);
+}
+
+template <class numtyp, class acctyp>
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
+                                  double *virial) {
+  if (_eflag==false && _vflag==false)
+    return 0.0;
+
+  double evdwl=0.0;
+  if (_gpu_nbor) {
+    for (int i=0; i<_inum; i++) {
+      acctyp *ap=host_engv.begin()+i;
+      if (_eflag) {
+        if (_ef_atom) {
+          evdwl+=*ap;
+          eatom[i]+=*ap*0.5;
+          ap+=_inum;
+        } else {
+          evdwl+=*ap;
+          ap+=_inum;
+        }
+      }
+      if (_vflag) {
+        if (_vf_atom) {
+          for (int j=0; j<6; j++) {
+            vatom[i][j]+=*ap*0.5;
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        } else {
+          for (int j=0; j<6; j++) {
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        }
+      }
+    }
+    for (int j=0; j<6; j++)
+      virial[j]*=0.5;
+  } else {
+    for (int i=0; i<_inum; i++) {
+      acctyp *ap=host_engv.begin()+i;
+      int ii=_ilist[i];
+      if (_eflag) {
+        if (_ef_atom) {
+          evdwl+=*ap;
+          eatom[ii]+=*ap*0.5;
+          ap+=_inum;
+        } else {
+          evdwl+=*ap;
+          ap+=_inum;
+        }
+      }
+      if (_vflag) {
+        if (_vf_atom) {
+          for (int j=0; j<6; j++) {
+            vatom[ii][j]+=*ap*0.5;
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        } else {
+          for (int j=0; j<6; j++) {
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        }
+      }
+    }
+    for (int j=0; j<6; j++)
+      virial[j]*=0.5;
+  }
+
+  evdwl*=0.5;
+  return evdwl;
+}
+
+template <class numtyp, class acctyp>
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
+                                  double *virial, double &ecoul) {
+  if (_eflag==false && _vflag==false) {
+    ecoul=0.0;
+    return 0.0;
+  }
+
+  if (_charge==false)
+    return energy_virial(eatom,vatom,virial);
+
+  double evdwl=0.0;
+  double _ecoul=0.0;
+  if (_gpu_nbor) {
+    for (int i=0; i<_inum; i++) {
+      acctyp *ap=host_engv.begin()+i;
+      if (_eflag) {
+        if (_ef_atom) {
+          evdwl+=*ap;
+          eatom[i]+=*ap*0.5;
+          ap+=_inum;
+          _ecoul+=*ap;
+          eatom[i]+=*ap*0.5;
+          ap+=_inum;
+        } else {
+          evdwl+=*ap;
+          ap+=_inum;
+          _ecoul+=*ap;
+          ap+=_inum;
+        }
+      }
+      if (_vflag) {
+        if (_vf_atom) {
+          for (int j=0; j<6; j++) {
+            vatom[i][j]+=*ap*0.5;
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        } else {
+          for (int j=0; j<6; j++) {
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        }
+      }
+    }
+    for (int j=0; j<6; j++)
+      virial[j]*=0.5;
+  } else {
+    for (int i=0; i<_inum; i++) {
+      acctyp *ap=host_engv.begin()+i;
+      int ii=_ilist[i];
+      if (_eflag) {
+        if (_ef_atom) {
+          evdwl+=*ap;
+          eatom[ii]+=*ap*0.5;
+          ap+=_inum;
+          _ecoul+=*ap;
+          eatom[ii]+=*ap*0.5;
+          ap+=_inum;
+        } else {
+          evdwl+=*ap;
+          ap+=_inum;
+          _ecoul+=*ap;
+          ap+=_inum;
+        }
+      }
+      if (_vflag) {
+        if (_vf_atom) {
+          for (int j=0; j<6; j++) {
+            vatom[ii][j]+=*ap*0.5;
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        } else {
+          for (int j=0; j<6; j++) {
+            virial[j]+=*ap;
+            ap+=_inum;
+          }
+        }
+      }
+    }
+    for (int j=0; j<6; j++)
+      virial[j]*=0.5;
+  }
+
+  evdwl*=0.5;
+  ecoul+=_ecoul*0.5;
+  return evdwl;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUAnsT::get_answers(double **f, double **tor) {
+  acctyp *ap=host_ans.begin();
+  if (_gpu_nbor) {
+    for (int i=0; i<_inum; i++) {
+      f[i][0]+=*ap;
+      ap++;
+      f[i][1]+=*ap;
+      ap++;
+      f[i][2]+=*ap;
+      ap+=2;
+    }
+    if (_rot) {
+      for (int i=0; i<_inum; i++) {
+        tor[i][0]+=*ap;
+        ap++;
+        tor[i][1]+=*ap;
+        ap++;
+        tor[i][2]+=*ap;
+        ap+=2;
+      }
+    }
+  } else {
+    for (int i=0; i<_inum; i++) {
+      int ii=_ilist[i];
+      f[ii][0]+=*ap;
+      ap++;
+      f[ii][1]+=*ap;
+      ap++;
+      f[ii][2]+=*ap;
+      ap+=2;
+    }
+    if (_rot) {
+      for (int i=0; i<_inum; i++) {
+        int ii=_ilist[i];
+        tor[ii][0]+=*ap;
+        ap++;
+        tor[ii][1]+=*ap;
+        ap++;
+        tor[ii][2]+=*ap;
+        ap+=2;
+      }
+    }
+  }
+}
+
+template class PairGPUAns<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/pair_gpu_ans.h b/lib/gpu/pair_gpu_ans.h
new file mode 100644
index 0000000000..a93ed6fcd5
--- /dev/null
+++ b/lib/gpu/pair_gpu_ans.h
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef PAIR_GPU_ANS_H
+#define PAIR_GPU_ANS_H
+
+#include <math.h>
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+
+#include "geryon/ocl_timer.h"
+#include "geryon/ocl_mat.h"
+using namespace ucl_opencl;
+
+#else
+
+#include "cudpp.h"
+#include "geryon/nvd_timer.h"
+#include "geryon/nvd_mat.h"
+using namespace ucl_cudadr;
+
+#endif
+
+#include "pair_gpu_precision.h"
+
+template <class numtyp, class acctyp>
+class PairGPUAns {
+ public:
+  PairGPUAns();
+  ~PairGPUAns() { clear(); }
+
+  /// Current number of local atoms stored
+  inline int inum() const { return _inum; }
+
+  /// Set number of local atoms for future copy operations
+  inline void inum(const int n) { _inum=n; }
+
+  /// Memory usage per atom in this class
+  int bytes_per_atom() const;
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param charge True if atom storage needs charges
+    * \param rot True if atom storage needs quaternions **/
+  bool init(const int inum, const bool charge, const bool rot,
+            UCL_Device &dev);
+
+  /// Check if we have enough device storage and realloc if not
+  inline bool resize(const int inum, bool &success) {
+    _inum=inum;
+    if (inum>_max_local) {
+      clear_resize();
+      success = success && alloc(inum);
+      return true;
+    }
+    return false;
+  }
+
+  /// If already initialized by another LAMMPS style, add fields as necessary
+  /** \param charge True if atom storage needs charges
+    * \param rot True if atom storage needs quaternions **/
+  bool add_fields(const bool charge, const bool rot);
+
+  /// Free all host and device memory that must be reallocated for more atoms
+  void clear_resize();
+
+  /// Free all memory on host and device
+  void clear();
+
+  /// Return the total amount of host memory used by class in bytes
+  double host_memory_usage() const;
+
+  /// Add copy times to timers
+  inline void acc_timers() {
+    time_answer.add_to_total();
+  }
+
+  /// Zero timers
+  inline void zero_timers() {
+    time_answer.zero();
+  }
+
+  /// Return the total time for host/device data transfer
+  inline double transfer_time() {
+    return time_answer.total_seconds();
+  }
+
+  /// Return the total time for data cast/pack
+  inline double cast_time() { return _time_cast; }
+
+  /// Return number of bytes used on device
+  inline double gpu_bytes() { return _gpu_bytes; }
+
+  // -------------------------COPY FROM GPU  -------------------------------
+
+  /// Copy answers from device into read buffer asynchronously
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom);
+
+  /// Copy answers from device into read buffer asynchronously
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom, int *ilist);
+
+  /// Copy energy and virial data into LAMMPS memory
+  double energy_virial(double *eatom, double **vatom, double *virial);
+
+  /// Copy energy and virial data into LAMMPS memory
+  double energy_virial(double *eatom, double **vatom, double *virial,
+                       double &ecoul);
+
+  /// Add forces and torques from the GPU into a LAMMPS pointer
+  void get_answers(double **f, double **tor);
+
+  // ------------------------------ DATA  ----------------------------------
+
+  /// Force and possibly torque
+  UCL_D_Vec<acctyp> dev_ans;
+  /// Energy and virial per-atom storage
+  UCL_D_Vec<acctyp> dev_engv;
+
+  /// Force and possibly torque data on host
+  UCL_H_Vec<acctyp> host_ans;
+  /// Energy/virial data on host
+  UCL_H_Vec<acctyp> host_engv;
+
+  /// Device timers
+  UCL_Timer time_answer;
+
+  /// Geryon device
+  UCL_Device *dev;
+
+ private:
+  bool alloc(const int inum);
+
+  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
+  bool _gpu_nbor;
+  int _max_local, _inum, _e_fields, _ev_fields;
+  int *_ilist;
+  double _time_cast;
+
+  double _gpu_bytes;
+
+  bool _newton;
+};
+
+#endif
+
diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp
index 46c9066e56..812a7c82d6 100644
--- a/lib/gpu/pair_gpu_atom.cpp
+++ b/lib/gpu/pair_gpu_atom.cpp
@@ -29,9 +29,7 @@ __win_sort _win_sort;
 #endif
 
 template <class numtyp, class acctyp>
-PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
-                              _vflag(false),_inum(0),_ilist(NULL),
-                              _newton(false) {
+PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false) {
 #ifndef USE_OPENCL
   sort_config.op = CUDPP_ADD;
   sort_config.datatype = CUDPP_UINT;
@@ -56,28 +54,20 @@ int PairGPUAtomT::bytes_per_atom() const {
   int id_space=0;
   if (_gpu_nbor)
     id_space=2;
-  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
+  int bytes=4*sizeof(numtyp)+id_space;
   if (_rot)
-    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
+    bytes+=4*sizeof(numtyp);
   if (_charge)
     bytes+=sizeof(numtyp);
   return bytes;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUAtomT::alloc(const int inum, const int nall) {
+bool PairGPUAtomT::alloc(const int nall) {
   _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
-  if (_newton)
-    _max_local=_max_atoms;
-  else
-    _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
 
   bool success=true;
 
-  int ans_elements=4;
-  if (_rot)
-    ans_elements+=4;
-
   // Ignore host/device transfers?
   bool cpuview=false;
   if (dev->device_type()==UCL_CPU)
@@ -107,8 +97,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
     success=success && (host_x.alloc(_max_atoms*4,*dev,
                                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   #endif
-  success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
-  success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_charge)
     success=success && (host_q.alloc(_max_atoms,*dev,
                                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
@@ -127,8 +115,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
     #else
     dev_x.view(host_x);
     #endif
-    dev_engv.view(host_engv);
-    dev_ans.view(host_ans);
     if (_rot)
      dev_quat.view(host_quat);
     if (_charge)
@@ -145,10 +131,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
     success=success && (UCL_SUCCESS==
                         dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
     #endif
-    success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
-    success=success && (dev_ans.alloc(ans_elements*_max_local,
-                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
     if (_charge) {
       success=success && (dev_q.alloc(_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
@@ -170,15 +152,15 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
     }
   }
 
-  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
+  _gpu_bytes+=dev_x.row_bytes();
 
   _allocated=true;
   return success;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
-                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
+bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
+                        UCL_Device &devi, const bool gpu_nbor,
                         const bool bonds) {
   clear();
 
@@ -193,33 +175,23 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
   _other=_charge || _rot;
   dev=&devi;
 
-  _e_fields=1;
-  if (_charge)
-    _e_fields++;
-  _ev_fields=6+_e_fields;
-
   // Initialize atom and nbor data
-  int ef_inum=inum;
-  if (ef_inum==0)
-    ef_inum=1000;
   int ef_nall=nall;
-  if (ef_nall<=ef_inum)
-    ef_nall=ef_inum*2;
+  if (ef_nall==0)
+    ef_nall=2000;
 
   // Initialize timers for the selected device
   time_pos.init(*dev);
   time_other.init(*dev);
-  time_answer.init(*dev);
   time_pos.zero();
   time_other.zero();
-  time_answer.zero();
   _time_cast=0.0;
 
 #ifdef GPU_CAST
   compile_kernels(*dev);
 #endif
 
-  return success && alloc(ef_inum,ef_nall);
+  return success && alloc(ef_nall);
 }
 
 template <class numtyp, class acctyp>
@@ -227,7 +199,6 @@ bool PairGPUAtomT::add_fields(const bool charge, const bool rot) {
   bool realloc=false;
   if (charge && _charge==false) {
     _charge=true;
-    _e_fields++;
     realloc=true;
   }
   if (rot && _rot==false) {
@@ -236,10 +207,9 @@ bool PairGPUAtomT::add_fields(const bool charge, const bool rot) {
   }
   if (realloc) {
     _other=_charge || _rot;
-    int inum=_max_local;
-    int nall=_max_atoms;
+    int max_atoms=_max_atoms;
     clear_resize();
-    return alloc(inum,nall);
+    return alloc(max_atoms);
   }
   return true;
 }
@@ -259,16 +229,12 @@ void PairGPUAtomT::clear_resize() {
     dev_quat.clear();
     host_quat.clear();
   }
-  dev_ans.clear();
-  dev_engv.clear();
 #ifndef GPU_CAST
   host_x.clear();
 #else
   host_x_cast.clear();
   host_type_cast.clear();
 #endif
-  host_ans.clear();
-  host_engv.clear();
 
   dev_cell_id.clear();
   dev_particle_id.clear();
   dev_tag.clear();
@@ -292,11 +258,7 @@ void PairGPUAtomT::clear() {
 
   time_pos.clear();
   time_other.clear();
-  time_answer.clear();
   clear_resize();
-  _inum=0;
-  _eflag=false;
-  _vflag=false;
 
 #ifdef GPU_CAST
   if (_compiled) {
@@ -314,258 +276,10 @@ double PairGPUAtomT::host_memory_usage() const {
     atom_bytes+=1;
   if (_rot)
     atom_bytes+=4;
-  int ans_bytes=atom_bytes+_ev_fields;
   return _max_atoms*atom_bytes*sizeof(numtyp)+
-         ans_bytes*(_max_local)*sizeof(acctyp)+
          sizeof(PairGPUAtom<numtyp,acctyp>);
 }
 
-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom) {
-  time_answer.start();
-  _eflag=eflag;
-  _vflag=vflag;
-  _ef_atom=ef_atom;
-  _vf_atom=vf_atom;
-
-  int csize=_ev_fields;
-  if (!eflag)
-    csize-=_e_fields;
-  if (!vflag)
-    csize-=6;
-
-  if (csize>0)
-    ucl_copy(host_engv,dev_engv,_inum*csize,true);
-  if (_rot)
-    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
-  else
-    ucl_copy(host_ans,dev_ans,_inum*4,true);
-  time_answer.stop();
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom,
-                                int *ilist) {
-  _ilist=ilist;
-  copy_answers(eflag,vflag,ef_atom,vf_atom);
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial) {
-  if (_eflag==false && _vflag==false)
-    return 0.0;
-
-  double evdwl=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial, double &ecoul) {
-  if (_eflag==false && _vflag==false) {
-    ecoul=0.0;
-    return 0.0;
-  }
-
-  if (_charge==false)
-    return energy_virial(eatom,vatom,virial);
-
-  double evdwl=0.0;
-  double _ecoul=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  ecoul+=_ecoul*0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::get_answers(double **f, double **tor) {
-  _x_avail=false;
-  _q_avail=false;
-  _quat_avail=false;
-  acctyp *ap=host_ans.begin();
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      f[i][0]+=*ap;
-      ap++;
-      f[i][1]+=*ap;
-      ap++;
-      f[i][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        tor[i][0]+=*ap;
-        ap++;
-        tor[i][1]+=*ap;
-        ap++;
-        tor[i][2]+=*ap;
-        ap+=2;
-      }
-    }
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int ii=_ilist[i];
-      f[ii][0]+=*ap;
-      ap++;
-      f[ii][1]+=*ap;
-      ap++;
-      f[ii][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        int ii=_ilist[i];
-        tor[ii][0]+=*ap;
-        ap++;
-        tor[ii][1]+=*ap;
-        ap++;
-        tor[ii][2]+=*ap;
-        ap+=2;
-      }
-    }
-  }
-}
-
 // Sort arrays for neighbor list calculation
 template <class numtyp, class acctyp>
 void PairGPUAtomT::sort_neighbor(const int num_atoms) {
diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h
index 562ca0846d..c4c6b1586f 100644
--- a/lib/gpu/pair_gpu_atom.h
+++ b/lib/gpu/pair_gpu_atom.h
@@ -23,7 +23,6 @@
 
 #ifdef USE_OPENCL
 
-#include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
 #include "geryon/ocl_kernel.h"
@@ -32,7 +31,6 @@ using namespace ucl_opencl;
 #else
 
 #include "cudpp.h"
-#include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 #include "geryon/nvd_kernel.h"
@@ -40,10 +38,6 @@ using namespace ucl_cudadr;
 
 #endif
 
-#ifndef int2
-struct int2 { int x; int y; };
-#endif
-
 #include "pair_gpu_precision.h"
 
 template <class numtyp, class acctyp>
@@ -56,13 +50,9 @@ class PairGPUAtom {
   inline int max_atoms() const { return _max_atoms; }
   /// Current number of local+ghost atoms stored
   inline int nall() const { return _nall; }
-  /// Current number of local atoms stored
-  inline int inum() const { return _inum; }
 
   /// Set number of local+ghost atoms for future copy operations
   inline void nall(const int n) { _nall=n; }
-  /// Set number of local atoms for future copy operations
-  inline void inum(const int n) { _inum=n; }
 
   /// Memory usage per atom in this class
   int bytes_per_atom() const;
@@ -70,16 +60,15 @@ class PairGPUAtom {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param rot True if atom storage needs quaternions
     * \param gpu_nbor True if neighboring will be performed on device **/
-  bool init(const int inum, const int nall, const bool charge, const bool rot,
+  bool init(const int nall, const bool charge, const bool rot,
             UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
 
   /// Check if we have enough device storage and realloc if not
-  inline bool resize(const int inum, const int nall, bool &success) {
-    _inum=inum;
+  inline bool resize(const int nall, bool &success) {
     _nall=nall;
-    if (inum>_max_local || nall>_max_atoms) {
+    if (nall>_max_atoms) {
       clear_resize();
-      success = success && alloc(inum,nall);
+      success = success && alloc(nall);
       return true;
     }
     return false;
@@ -90,9 +79,6 @@ class PairGPUAtom {
     * \param gpu_nbor True if neighboring will be performed on device **/
   bool add_fields(const bool charge, const bool rot);
 
-  /// True if charge data is available for kernels
-  bool charge_avail() const { return _charge; }
-
   /// Only free matrices of length inum or nall for resizing
   void clear_resize();
 
@@ -108,7 +94,6 @@ class PairGPUAtom {
   /// Add copy times to timers
   inline void acc_timers() {
     time_pos.add_to_total();
-    time_answer.add_to_total();
     if (_other)
       time_other.add_to_total();
   }
@@ -116,14 +101,13 @@ class PairGPUAtom {
   /// Add copy times to timers
   inline void zero_timers() {
     time_pos.zero();
-    time_answer.zero();
     if (_other)
       time_other.zero();
   }
 
   /// Return the total time for host/device data transfer
   inline double transfer_time() {
-    double total=time_pos.total_seconds()+time_answer.total_seconds();
+    double total=time_pos.total_seconds();
     if (_other) total+=time_other.total_seconds();
     return total;
   }
@@ -224,6 +208,10 @@
 
   // -------------------------COPY TO GPU  ----------------------------------
 
+  /// Signal that we need to transfer atom data for next timestep
+  inline void data_unavail()
+    { _x_avail=false; _q_avail=false; _quat_avail=false; }
+
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
     if (_x_avail==false) {
@@ -349,26 +337,6 @@ class PairGPUAtom {
   /// Return number of bytes used on device
   inline double gpu_bytes() { return _gpu_bytes; }
 
-  // -------------------------COPY FROM GPU  -------------------------------
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom);
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom, int *ilist);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial,
-                       double &ecoul);
-
-  /// Add forces and torques from the GPU into a LAMMPS pointer
-  void get_answers(double **f, double **tor);
-
   // ------------------------------ DATA  ----------------------------------
 
   /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type
@@ -377,10 +345,6 @@
   UCL_D_Vec<numtyp> dev_q;
   /// Quaternions
   UCL_D_Vec<numtyp> dev_quat;
-  /// Force and possibly torque
-  UCL_D_Vec<acctyp> dev_ans;
-  /// Energy and virial per-atom storage
-  UCL_D_Vec<acctyp> dev_engv;
 
 #ifdef GPU_CAST
   UCL_D_Vec dev_x_cast;
@@ -395,10 +359,6 @@
   UCL_H_Vec<numtyp> host_q;
   /// Buffer for moving quat data to GPU
   UCL_H_Vec<numtyp> host_quat;
-  /// Force and possibly torque data on host
-  UCL_H_Vec<acctyp> host_ans;
-  /// Energy/virial data on host
-  UCL_H_Vec<acctyp> host_engv;
 
   /// Cell list identifiers for device nbor builds
   UCL_D_Vec dev_cell_id;
@@ -408,7 +368,7 @@
   UCL_D_Vec dev_tag;
 
   /// Device timers
-  UCL_Timer time_pos, time_other, time_answer;
+  UCL_Timer time_pos, time_other;
 
   /// Geryon device
   UCL_Device *dev;
@@ -423,20 +383,17 @@
   bool _compiled;
 
   // True if data has been copied to device already
-  int _x_avail, _q_avail, _quat_avail;
+  bool _x_avail, _q_avail, _quat_avail;
 
-  bool alloc(const int inum, const int nall);
+  bool alloc(const int nall);
 
-  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
-  int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
+  bool _allocated, _rot, _charge, _other;
+  int _max_atoms, _nall;
   bool _gpu_nbor, _bonds;
-  int *_ilist;
   double _time_cast;
 
   double _gpu_bytes;
 
-  bool _newton;
-
 #ifndef USE_OPENCL
   CUDPPConfiguration sort_config;
   CUDPPHandle sort_plan;
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 30206d72e6..718e9d9ddb 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -281,6 +281,7 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom,
     pair_gpu_device.gpu->sync();
     double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
     pair_gpu_device.atom.get_answers(f,tor);
+    pair_gpu_device.atom.data_unavail();
     return evdw;
   }
 
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 8f24e0231b..1ef85c78ad 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -19,11 +19,13 @@
 #define PAIR_GPU_DEVICE_H
 
 #include "pair_gpu_atom.h"
+#include "pair_gpu_ans.h"
 #include "pair_gpu_nbor.h"
 #include "mpi.h"
 #include <sstream>
 #include "stdio.h"
 #include <string>
+#include <queue>
 
 template <class numtyp, class acctyp>
 class PairGPUDevice {
@@ -71,6 +73,9 @@ class PairGPUDevice {
   /// Clear all memory on host and device
   void clear_device();
 
+  /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
+  inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
+    { ans_queue.push(ans); }
+
   /// Start timer on host
   inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
 
@@ -134,6 +139,7 @@ class PairGPUDevice {
   PairGPUNborShared _nbor_shared;
 
  private:
+  std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
   int _init_count;
   bool _device_init;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
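Note on usage: the split moves the per-atom force/torque and energy/virial
buffers out of PairGPUAtom (whose storage scales with nall, the local+ghost
count) into the new PairGPUAns class (whose storage scales with inum, the
local count), so the two can be resized independently and one atom object can
feed several answer objects. Below is a minimal sketch of the per-timestep
flow a pair style would follow with the split classes; the compute_sketch()
wrapper and its argument list are illustrative only and not part of this
patch -- only the atom.*/ans.* calls it makes are the ones added or kept
above.

// Sketch only: drives the split classes through one force computation,
// assuming atom and ans were set up earlier with atom.init()/ans.init().
template <class numtyp, class acctyp>
double compute_sketch(PairGPUAtom<numtyp,acctyp> &atom,
                      PairGPUAns<numtyp,acctyp> &ans,
                      const int inum, const int nall, int *ilist,
                      double **host_x, int *host_type, double **f,
                      double *eatom, double **vatom, double *virial,
                      const bool eflag, const bool vflag) {
  bool success=true;
  atom.resize(nall,success);          // nall-sized buffers: positions, types
  ans.resize(inum,success);           // inum-sized buffers: forces, engv
  if (!success) return 0.0;

  atom.cast_x_data(host_x,host_type); // pack positions into the write buffer
  // ... transfer positions to the device and launch the pair kernel here ...

  ans.copy_answers(eflag,vflag,false,false,ilist); // async device->host copy
  // ... sync the device queue here, as lmp_gpu_forces() does above ...
  double evdwl=ans.energy_virial(eatom,vatom,virial);
  ans.get_answers(f,NULL);            // NULL torques: no quaternion storage
  atom.data_unavail();                // force a fresh position copy next step
  return evdwl;
}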
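The energy_virial() implementations depend on host_engv being laid out
field-major: each of the _ev_fields per-atom fields occupies a contiguous
block of _inum values, which is why the pointer jumps by _inum between
successive fields of the same atom (the ap+=_inum steps). A toy reader for
that layout, with a plain double array standing in for the acctyp host
vector and the van der Waals and Coulomb energies lumped together for
brevity:

// Field-major answer buffer: engv[field*inum + i] holds field "field" of
// atom i.  Mirrors the stride-inum walk in PairGPUAns::energy_virial().
double accumulate_sketch(const double *engv, const int inum,
                         const bool has_charge, double *virial /* [6] */) {
  const int e_fields = has_charge ? 2 : 1;
  double evdwl=0.0;
  for (int i=0; i<inum; i++) {
    const double *ap=engv+i;          // start of atom i's column
    for (int e=0; e<e_fields; e++) {  // energy field(s)
      evdwl+=*ap;
      ap+=inum;
    }
    for (int j=0; j<6; j++) {         // six virial fields
      virial[j]+=*ap;
      ap+=inum;
    }
  }
  for (int j=0; j<6; j++)
    virial[j]*=0.5;  // full neighbor lists on the GPU count each pair twice
  return evdwl*0.5;  // same double counting for the energy
}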
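The ans_queue added to PairGPUDevice is a FIFO of answer objects whose
asynchronous copies are in flight; this patch only adds the enqueue side
(add_ans_object), so the following drain loop is a guess at the intended
consumption pattern, not code from this patch:

// Hypothetical drain loop (assumes pair_gpu_ans.h and <queue> are included):
// pop each queued answer object and accumulate its copy timer once its
// asynchronous device->host transfer has completed.
template <class numtyp, class acctyp>
void drain_sketch(std::queue<PairGPUAns<numtyp,acctyp> *> &ans_queue) {
  while (!ans_queue.empty()) {
    PairGPUAns<numtyp,acctyp> *ans=ans_queue.front();
    ans_queue.pop();
    ans->acc_timers();   // add_to_total() on the answer-copy timer
  }
}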