From c1c9fd47dee5a226fc22ea0edc35feddcb38835e Mon Sep 17 00:00:00 2001
From: "W. Michael Brown" <brownw@ornl.gov>
Date: Thu, 27 Jan 2011 17:52:08 -0500
Subject: [PATCH] Hybrid working, but not fully tested.

---
 lib/gpu/Nvidia.makefile          |  2 +-
 lib/gpu/Opencl.makefile          |  2 +-
 lib/gpu/atomic_gpu_memory.cpp    | 22 ++++++++-----
 lib/gpu/atomic_gpu_memory.h      |  8 ++++-
 lib/gpu/charge_gpu_memory.cpp    | 28 +++++++++-------
 lib/gpu/charge_gpu_memory.h      |  9 ++++-
 lib/gpu/cmm_cut_gpu_memory.cpp   | 12 +++----
 lib/gpu/cmmc_long_gpu_memory.cpp | 12 +++----
 lib/gpu/cmmc_msm_gpu_memory.cpp  | 12 +++----
 lib/gpu/crml_gpu_memory.cpp      | 12 +++----
 lib/gpu/gb_gpu.cpp               | 50 ++++++++++++++--------------
 lib/gpu/gb_gpu_memory.cpp        | 14 ++++----
 lib/gpu/gb_gpu_memory.h          | 15 ++++++---
 lib/gpu/geryon/nvd_timer.h       | 12 +++++++
 lib/gpu/geryon/ocl_timer.h       | 12 +++++++
 lib/gpu/lj96_cut_gpu_memory.cpp  | 12 +++----
 lib/gpu/lj_cut_gpu_memory.cpp    | 12 +++----
 lib/gpu/ljc_cut_gpu_memory.cpp   | 12 +++----
 lib/gpu/ljcl_cut_gpu_memory.cpp  | 12 +++----
 lib/gpu/pair_gpu_ans.cpp         | 13 +++-----
 lib/gpu/pair_gpu_ans.h           | 14 ++++++--
 lib/gpu/pair_gpu_atom.cpp        | 31 +++++++++++-------
 lib/gpu/pair_gpu_atom.h          | 56 ++++++++++++++++++--------------
 lib/gpu/pair_gpu_device.cpp      | 36 +++++++++-----------
 lib/gpu/pair_gpu_device.h        | 34 ++++++++++++++-----
 25 files changed, 269 insertions(+), 185 deletions(-)
diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index 3c859ffdd4..967fbba76f 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -118,7 +118,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared
 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H)
 	$(CUDR) -o $@ -c pair_gpu_device.cpp
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index e488a56bc0..b0b4857770 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -68,7 +68,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared
 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
 	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H)
 	$(OCL) -o $@ -c pair_gpu_device.cpp
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp
index bea44c4ced..af7ed795a5 100644
--- a/lib/gpu/atomic_gpu_memory.cpp
+++ b/lib/gpu/atomic_gpu_memory.cpp
@@ -23,11 +23,13 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0)  {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
   nbor=new PairGPUNbor();
 }
 
 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::~AtomicGPUMemory() {
+  delete ans;
   delete nbor;
 }
 
@@ -54,7 +56,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
   if (host_nlocal>0)
     _gpu_host=1;
 
-  if (!device->init(false,false,nlocal,host_nlocal,nall,nbor,maxspecial,
+  if (!device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,maxspecial,
                     gpu_nbor,_gpu_host,max_nbors,cell_size,false))
     return false;
   ucl_device=device->gpu;
@@ -74,7 +76,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
 
   pos_tex.bind_float(atom->dev_x,4);
 
-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
   return true;
 }
@@ -84,7 +86,7 @@ void AtomicGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes,
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
                        screen);
 
   if (_compiled) {
@@ -118,7 +120,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
 
   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
   
@@ -149,7 +151,7 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
   nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
                         nspecial, special, success, mn);
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
@@ -173,7 +175,7 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
   
   int ago=hd_balancer.ago_first(f_ago);
   int inum=hd_balancer.balance(ago,inum_full,cpu_time,nbor->gpu_nbor());
-  atom->inum(inum);
+  ans->inum(inum);
   host_start=inum;
 
   if (ago==0) {
@@ -187,7 +189,8 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
   atom->add_x_data(host_x,host_type);
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }
 
@@ -210,7 +213,7 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
   
   hd_balancer.balance(cpu_time,nbor->gpu_nbor());
   int inum=hd_balancer.get_gpu_count(ago,inum_full);
-  atom->inum(inum);
+  ans->inum(inum);
   host_start=inum;
  
   // Build neighbor list on GPU if necessary
@@ -227,7 +230,8 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
   }
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
   
   return nbor->host_nbor.begin();
diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h
index 642ff56afd..11cda990af 100644
--- a/lib/gpu/atomic_gpu_memory.h
+++ b/lib/gpu/atomic_gpu_memory.h
@@ -48,8 +48,9 @@ class AtomicGPUMemory {
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(inum, nall, success))
+    if (atom->resize(nall, success))
       pos_tex.bind_float(atom->dev_x,4);
+    ans->resize(inum,success);
   }
 
   /// Check if there is enough storage for neighbors and realloc if not
@@ -92,6 +93,7 @@ class AtomicGPUMemory {
     }
     time_pair.add_to_total();
     atom->acc_timers();
+    ans->acc_timers();
   }
 
   /// Zero timers
@@ -99,6 +101,7 @@ class AtomicGPUMemory {
     nbor_time_avail=false;
     time_pair.zero();
     atom->zero_timers();
+    ans->zero_timers();
   }
 
   /// Copy neighbor list from host
@@ -148,6 +151,9 @@ class AtomicGPUMemory {
   /// Atom Data
   PairGPUAtom<numtyp,acctyp> *atom;
 
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
 
   // --------------------------- NBOR DATA ----------------------------
 
diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp
index 71a2f1622b..4460077ff4 100644
--- a/lib/gpu/charge_gpu_memory.cpp
+++ b/lib/gpu/charge_gpu_memory.cpp
@@ -23,11 +23,13 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
   nbor=new PairGPUNbor();
 }
 
 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::~ChargeGPUMemory() {
+  delete ans;
   delete nbor;
 }
 
@@ -54,8 +56,8 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
   if (host_nlocal>0)
     _gpu_host=1;
 
-  if (!device->init(true,false,nlocal,host_nlocal,nall,nbor,maxspecial,gpu_nbor,
-                    _gpu_host,max_nbors,cell_size,false))
+  if (!device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,maxspecial,
+                    gpu_nbor,_gpu_host,max_nbors,cell_size,false))
     return false;
   ucl_device=device->gpu;
   atom=&device->atom;
@@ -75,7 +77,7 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
   pos_tex.bind_float(atom->dev_x,4);
   q_tex.bind_float(atom->dev_q,1);
 
-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
   return true;
 }
@@ -85,7 +87,7 @@ void ChargeGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes,
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
                        screen);
 
   if (_compiled) {
@@ -119,7 +121,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
 
   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 
@@ -150,7 +152,7 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
   nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
                         nspecial, special, success, mn);
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
@@ -175,7 +177,7 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
   int ago=hd_balancer.ago_first(f_ago);
   int inum=hd_balancer.balance(ago,inum_full,cpu_time,
 		               nbor->gpu_nbor());
-  atom->inum(inum);
+  ans->inum(inum);
   host_start=inum;
 
   if (ago==0) {
@@ -188,10 +190,11 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
   atom->cast_q_data(host_q);
   hd_balancer.start_timer();
   atom->add_x_data(host_x,host_type);
-  atom->add_other_data();
+  atom->add_q_data();
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }
 
@@ -215,7 +218,7 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full,
   
   hd_balancer.balance(cpu_time,nbor->gpu_nbor());
   int inum=hd_balancer.get_gpu_count(ago,inum_full);
-  atom->inum(inum);
+  ans->inum(inum);
   host_start=inum;
  
   // Build neighbor list on GPU if necessary
@@ -232,10 +235,11 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full,
     hd_balancer.start_timer();
     atom->add_x_data(host_x,host_type);
   }
-  atom->add_other_data();
+  atom->add_q_data();
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
   
   return nbor->host_nbor.begin();
diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h
index 42e7307f47..f35b14a8c2 100644
--- a/lib/gpu/charge_gpu_memory.h
+++ b/lib/gpu/charge_gpu_memory.h
@@ -48,10 +48,11 @@ class ChargeGPUMemory {
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(inum, nall, success)) {
+    if (atom->resize(nall, success)) {
       pos_tex.bind_float(atom->dev_x,4);
       q_tex.bind_float(atom->dev_q,1);
     }
+    ans->resize(inum,success);
   }
 
   /// Check if there is enough storage for neighbors and realloc if not
@@ -94,6 +95,7 @@ class ChargeGPUMemory {
     }
     time_pair.add_to_total();
     atom->acc_timers();
+    ans->acc_timers();
   }
 
   /// Zero timers
@@ -101,6 +103,7 @@ class ChargeGPUMemory {
     nbor_time_avail=false;
     time_pair.zero();
     atom->zero_timers();
+    ans->zero_timers();
   }
 
   /// Copy neighbor list from host
@@ -151,6 +154,10 @@ class ChargeGPUMemory {
   PairGPUAtom<numtyp,acctyp> *atom;
 
 
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
+
   // --------------------------- NBOR DATA ----------------------------
 
   /// Neighbor data
diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp b/lib/gpu/cmm_cut_gpu_memory.cpp
index e5a83e5872..53f219274c 100644
--- a/lib/gpu/cmm_cut_gpu_memory.cpp
+++ b/lib/gpu/cmm_cut_gpu_memory.cpp
@@ -122,9 +122,9 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -133,15 +133,15 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp
index 9a63bc5628..2091122832 100644
--- a/lib/gpu/cmmc_long_gpu_memory.cpp
+++ b/lib/gpu/cmmc_long_gpu_memory.cpp
@@ -133,9 +133,9 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -144,8 +144,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
                           &_qqrd2e, &_g_ewald);
@@ -153,8 +153,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald);
   }
diff --git a/lib/gpu/cmmc_msm_gpu_memory.cpp b/lib/gpu/cmmc_msm_gpu_memory.cpp
index 6fced81d86..7675005348 100644
--- a/lib/gpu/cmmc_msm_gpu_memory.cpp
+++ b/lib/gpu/cmmc_msm_gpu_memory.cpp
@@ -133,9 +133,9 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -144,8 +144,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
                           &_qqrd2e, &_smooth);
@@ -153,8 +153,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_smooth);
   }
diff --git a/lib/gpu/crml_gpu_memory.cpp b/lib/gpu/crml_gpu_memory.cpp
index e877503e87..f051a349f9 100644
--- a/lib/gpu/crml_gpu_memory.cpp
+++ b/lib/gpu/crml_gpu_memory.cpp
@@ -137,9 +137,9 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -147,8 +147,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
                           &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
                           &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
@@ -157,8 +157,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
                      &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
diff --git a/lib/gpu/gb_gpu.cpp b/lib/gpu/gb_gpu.cpp
index c1b403fe24..9212e43cb6 100644
--- a/lib/gpu/gb_gpu.cpp
+++ b/lib/gpu/gb_gpu.cpp
@@ -216,9 +216,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/BX));
   int stride=gbm.nbor->nbor_pitch();
-  int ainum=gbm.atom->inum();
+  int ainum=gbm.ans->inum();
   int anall=gbm.atom->nall();
 
   if (gbm.multiple_forms) {
@@ -237,11 +237,11 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
            &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
            &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), 
            &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
-           &stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
+           &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
            &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
       gbm.time_gayberne.stop();
 
-      if (gbm.last_ellipse==gbm.atom->inum()) {
+      if (gbm.last_ellipse==gbm.ans->inum()) {
         gbm.time_kernel2.start();
         gbm.time_kernel2.stop();
         gbm.time_gayberne2.start();
@@ -254,9 +254,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
       // ------------ SPHERE_ELLIPSE ---------------
 
       gbm.time_kernel2.start();
-      GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
+      GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
                                gbm.last_ellipse)/BX));
-      gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
+      gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
 			SPHERE_ELLIPSE,SPHERE_ELLIPSE);
       gbm.time_kernel2.stop();
 
@@ -266,13 +266,13 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
               &gbm.shape.begin(), &gbm.well.begin(), 
               &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), 
               &gbm._lj_types, &gbm.lshape.begin(), 
-              &gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
-              &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
+              &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
+              &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
               &vflag, &gbm.last_ellipse, &ainum, &anall);
       gbm.time_gayberne2.stop();
    } else {
-      gbm.atom->dev_ans.zero();
-      gbm.atom->dev_engv.zero();
+      gbm.ans->dev_ans.zero();
+      gbm.ans->dev_engv.zero();
       gbm.time_kernel.stop();
       gbm.time_gayberne.start();                                 
       gbm.time_gayberne.stop();
@@ -284,29 +284,29 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
     
     // ------------         LJ      ---------------
     gbm.time_pair.start();
-    if (gbm.last_ellipse<gbm.atom->inum()) {
+    if (gbm.last_ellipse<gbm.ans->inum()) {
       if (gbm.shared_types) {
         GBMF.k_lj_fast.set_size(GX,BX);
         GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
                            &gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
                            &stride, &gbm.nbor->dev_packed.begin(),
-                           &gbm.atom->dev_ans.begin(),
-                           &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
+                           &gbm.ans->dev_ans.begin(),
+                           &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
                            &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
       } else {
         GBMF.k_lj.set_size(GX,BX);
         GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
                       &gbm.lj3.begin(), &gbm._lj_types, 
                       &gbm.gamma_upsilon_mu.begin(), &stride, 
-                      &gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
-                      &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
+                      &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
+                      &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
                       &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
       }
     }
     gbm.time_pair.stop();
   } else {
     gbm.time_kernel.start();
-    gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
+    gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
 		      ELLIPSE_ELLIPSE);
     gbm.time_kernel.stop();
     gbm.time_gayberne.start(); 
@@ -315,8 +315,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
             &gbm.shape.begin(), &gbm.well.begin(), 
             &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), 
             &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
-            &stride, &gbm.atom->dev_ans.begin(), &ainum,
-            &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
+            &stride, &gbm.ans->dev_ans.begin(), &ainum,
+            &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
             &eflag, &vflag, &ainum, &anall);
     gbm.time_gayberne.stop();
   }
@@ -342,7 +342,7 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
 
   gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
   int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
-  gbm.atom->inum(inum);
+  gbm.ans->inum(inum);
   gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
   host_start=inum;
   
@@ -361,10 +361,11 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
     gbm.atom->add_x_data(host_x,host_type);
   }
 
-  gbm.atom->add_other_data();
+  gbm.atom->add_quat_data();
 
   _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
-  gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
+  gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
+  gbm.device->add_ans_object(gbm.ans);
   gbm.hd_balancer.stop_timer();
   return gbm.nbor->host_nbor.begin();
 }
@@ -399,7 +400,7 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
   
   int ago=gbm.hd_balancer.ago_first(f_ago);
   int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time,gbm.nbor->gpu_nbor());
-  gbm.atom->inum(inum);
+  gbm.ans->inum(inum);
   gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
   host_start=inum;
 
@@ -419,10 +420,11 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
   gbm.atom->cast_quat_data(host_quat[0]);
   gbm.hd_balancer.start_timer();
   gbm.atom->add_x_data(host_x,host_type);
-  gbm.atom->add_other_data();
+  gbm.atom->add_quat_data();
 
   _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
-  gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
+  gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
+  gbm.device->add_ans_object(gbm.ans);
   gbm.hd_balancer.stop_timer();
   return list;
 }
diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp
index 622bdd73fc..ac2923aa9c 100644
--- a/lib/gpu/gb_gpu_memory.cpp
+++ b/lib/gpu/gb_gpu_memory.cpp
@@ -32,12 +32,14 @@ template <class numtyp, class acctyp>
 GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
                                   _max_bytes(0.0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
   nbor=new PairGPUNbor;
 }
 
 template <class numtyp, class acctyp>
 GB_GPU_MemoryT::~GB_GPU_Memory() { 
   clear();
+  delete ans;
   delete nbor;
 }
  
@@ -70,7 +72,7 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
   if (host_nlocal>0)
     _gpu_host=1;
   
-  if (!device->init(false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor,
+  if (!device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor,
                     _gpu_host,max_nbors,cell_size,true))
     return false;
   ucl_device=device->gpu;
@@ -187,9 +189,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
   }
   
   if (multiple_forms)
-    atom->dev_ans.zero();
+    ans->dev_ans.zero();
 
-  _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
   // Memory for ilist ordered by particle type
   return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
@@ -212,7 +214,7 @@ void GB_GPU_MemoryT::clear() {
   acc_timers();
   double single[6], times[6];
 
-  single[0]=atom->transfer_time();
+  single[0]=atom->transfer_time()+ans->transfer_time();
   single[1]=nbor->time_nbor.total_seconds();
   single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
             nbor->time_kernel.total_seconds();
@@ -221,7 +223,7 @@ void GB_GPU_MemoryT::clear() {
     single[4]=time_pair.total_seconds();
   else
     single[4]=0;
-  single[5]=atom->cast_time();
+  single[5]=atom->cast_time()+ans->cast_time();
 
   MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
   double avg_split=hd_balancer.all_avg_split();
@@ -229,7 +231,7 @@ void GB_GPU_MemoryT::clear() {
   _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
               sigma_epsilon.row_bytes()+cut_form.row_bytes()+
               shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
-              gamma_upsilon_mu.row_bytes();
+              gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
              device->replica());
diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h
index 2cfc805cd8..b9c0ce8c53 100644
--- a/lib/gpu/gb_gpu_memory.h
+++ b/lib/gpu/gb_gpu_memory.h
@@ -49,9 +49,10 @@ class GB_GPU_Memory {
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    atom->resize(inum, nall, success);
-    if (multiple_forms) atom->dev_ans.zero();
-    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+    atom->resize(nall, success);
+    ans->resize(inum, success);
+    if (multiple_forms) ans->dev_ans.zero();
+    double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
     if (bytes>_max_bytes)
       _max_bytes=bytes;
   }
@@ -74,7 +75,7 @@ class GB_GPU_Memory {
       success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
     }
     nbor->resize(nlocal,host_inum,max_nbors,success);
-    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+    double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
     if (bytes>_max_bytes)
       _max_bytes=bytes;
   }
@@ -104,6 +105,7 @@ class GB_GPU_Memory {
       time_pair.add_to_total();
     }
     atom->acc_timers();
+    ans->acc_timers();
   }
   
   /// Accumulate timers
@@ -117,6 +119,7 @@ class GB_GPU_Memory {
       time_pair.zero();
     }
     atom->zero_timers();
+    ans->zero_timers();
   }
 
   // -------------------------- DEVICE DATA ------------------------- 
@@ -168,6 +171,10 @@ class GB_GPU_Memory {
 
   int last_ellipse, max_last_ellipse;
 
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
+
   // --------------------------- NBOR DATA ----------------------------
 
   /// Neighbor data
diff --git a/lib/gpu/geryon/nvd_timer.h b/lib/gpu/geryon/nvd_timer.h
index 59001c03fd..1f39abb971 100644
--- a/lib/gpu/geryon/nvd_timer.h
+++ b/lib/gpu/geryon/nvd_timer.h
@@ -25,6 +25,7 @@
 #define NVD_TIMER_H
 
 #include "nvd_macros.h"
+#include "nvd_device.h"
 
 namespace ucl_cudadr {
 
@@ -66,12 +67,23 @@ class UCL_Timer {
   /// Stop timing on command queue
   inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
   
+  /// Block until the start event has been reached on device
+  inline void sync_start() 
+    { CU_SAFE_CALL(cuEventSynchronize(start_event)); }
+
+  /// Block until the stop event has been reached on device
+  inline void sync_stop() 
+    { CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
+
   /// Set the time elapsed to zero (not the total_time)
   inline void zero() {
     CU_SAFE_CALL(cuEventRecord(start_event,_cq));
     CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
   }
   
+  /// Set the total time to zero
+  inline void zero_total() { _total_time=0.0; }
+  
   /// Add time from previous start and stop to total
   /** Forces synchronization **/
   inline double add_to_total() 
diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h
index aafb0aac4b..a268dca256 100644
--- a/lib/gpu/geryon/ocl_timer.h
+++ b/lib/gpu/geryon/ocl_timer.h
@@ -25,6 +25,7 @@
 #define OCL_TIMER_H
 
 #include "ocl_macros.h"
+#include "ocl_device.h"
 
 namespace ucl_opencl {
 
@@ -67,10 +68,21 @@ class UCL_Timer {
   /// Stop timing on default command queue
   inline void stop() { clEnqueueMarker(_cq,&stop_event); }
   
+  /// Block until the start event has been reached on device
+  inline void sync_start() 
+    { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
+
+  /// Block until the stop event has been reached on device
+  inline void sync_stop() 
+    { CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
+
   /// Set the time elapsed to zero (not the total_time)
   inline void zero() 
     { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); } 
   
+  /// Set the total time to zero
+  inline void zero_total() { _total_time=0.0; }
+  
   /// Add time from previous start and stop to total
   /** Forces synchronization **/
   inline double add_to_total() 
diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp
index d365d71044..c30c5d8caf 100644
--- a/lib/gpu/lj96_cut_gpu_memory.cpp
+++ b/lib/gpu/lj96_cut_gpu_memory.cpp
@@ -122,9 +122,9 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -133,15 +133,15 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp
index 23b2fcf6d0..27a759fec1 100644
--- a/lib/gpu/lj_cut_gpu_memory.cpp
+++ b/lib/gpu/lj_cut_gpu_memory.cpp
@@ -122,9 +122,9 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -133,15 +133,15 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp
index d63ed6e5d9..b662b653fe 100644
--- a/lib/gpu/ljc_cut_gpu_memory.cpp
+++ b/lib/gpu/ljc_cut_gpu_memory.cpp
@@ -134,9 +134,9 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -145,8 +145,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &cutsq.begin(),
                           &_qqrd2e);
@@ -154,8 +154,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &cutsq.begin(), &_qqrd2e);
   }
diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp
index a126309a92..1f638c030c 100644
--- a/lib/gpu/ljcl_cut_gpu_memory.cpp
+++ b/lib/gpu/ljcl_cut_gpu_memory.cpp
@@ -132,9 +132,9 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -143,8 +143,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
                           &_qqrd2e, &_g_ewald);
@@ -152,8 +152,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald);
   }
diff --git a/lib/gpu/pair_gpu_ans.cpp b/lib/gpu/pair_gpu_ans.cpp
index e6982e6eba..a06ba9e394 100644
--- a/lib/gpu/pair_gpu_ans.cpp
+++ b/lib/gpu/pair_gpu_ans.cpp
@@ -138,11 +138,10 @@ void PairGPUAnsT::clear() {
   if (!_allocated)
     return;
 
-  time_pos.clear();
-  time_other.clear();
   time_answer.clear();
   clear_resize();
   _inum=0;
+  _ilist=NULL;
   _eflag=false;
   _vflag=false;
 }
@@ -181,7 +180,6 @@ void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
   else
     ucl_copy(host_ans,dev_ans,_inum*4,true);
   time_answer.stop();
-  dev->add_answer_object(this);
 }
 
 template <class numtyp, class acctyp>
@@ -199,7 +197,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
     return 0.0;
 
   double evdwl=0.0;
-  if (_gpu_nbor) {
+  if (_ilist==NULL) {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       if (_eflag) {
@@ -279,7 +277,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
 
   double evdwl=0.0;
   double _ecoul=0.0;
-  if (_gpu_nbor) {
+  if (_ilist==NULL) {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       if (_eflag) {
@@ -359,11 +357,8 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
 
 template <class numtyp, class acctyp>
 void PairGPUAnsT::get_answers(double **f, double **tor) {
-  _x_avail=false;
-  _q_avail=false;
-  _quat_avail=false;
   acctyp *ap=host_ans.begin();
-  if (_gpu_nbor) {
+  if (_ilist==NULL) {
     for (int i=0; i<_inum; i++) {
       f[i][0]+=*ap;
       ap++;
diff --git a/lib/gpu/pair_gpu_ans.h b/lib/gpu/pair_gpu_ans.h
index a93ed6fcd5..7c876ee246 100644
--- a/lib/gpu/pair_gpu_ans.h
+++ b/lib/gpu/pair_gpu_ans.h
@@ -58,14 +58,12 @@ class PairGPUAns {
   bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
   
   /// Check if we have enough device storage and realloc if not
-  inline bool resize(const int inum, bool &success) {
+  inline void resize(const int inum, bool &success) {
     _inum=inum;
     if (inum>_max_local) {
       clear_resize();
       success = success && alloc(inum);
-      return true;
     }
-    return false;
   }
   
   /// If already initialized by another LAMMPS style, add fields as necessary
@@ -123,6 +121,16 @@ class PairGPUAns {
   /// Add forces and torques from the GPU into a LAMMPS pointer
   void get_answers(double **f, double **tor);
 
+  inline double get_answers(double **f, double **tor, double *eatom, 
+                            double **vatom, double *virial, double &ecoul) {
+    time_answer.sync_stop();
+    double ts=MPI_Wtime();
+    double evdw=energy_virial(eatom,vatom,virial,ecoul);
+    get_answers(f,tor);
+    _time_cast+=MPI_Wtime()-ts;
+    return evdw;
+  }
+
   // ------------------------------ DATA ----------------------------------
 
   /// Force and possibly torque
diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp
index 812a7c82d6..6342d2c3fc 100644
--- a/lib/gpu/pair_gpu_atom.cpp
+++ b/lib/gpu/pair_gpu_atom.cpp
@@ -29,7 +29,8 @@ __win_sort _win_sort;
 #endif
 
 template <class numtyp, class acctyp>
-PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false) {
+PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
+                              _max_gpu_bytes(0) {
   #ifndef USE_OPENCL
   sort_config.op = CUDPP_ADD;
   sort_config.datatype = CUDPP_UINT;
@@ -108,7 +109,7 @@ bool PairGPUAtomT::alloc(const int nall) {
 
     
   // ---------------------------  Device allocations
-  _gpu_bytes=0;
+  int gpu_bytes=0;
   if (cpuview) {
     #ifdef GPU_CAST
     assert(0==1);
@@ -126,7 +127,7 @@ bool PairGPUAtomT::alloc(const int nall) {
                         dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
     success=success && (UCL_SUCCESS==
                         dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
-    _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
+    gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
     #else
     success=success && (UCL_SUCCESS==
                         dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
@@ -134,25 +135,27 @@ bool PairGPUAtomT::alloc(const int nall) {
     if (_charge) {
       success=success && (dev_q.alloc(_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_q.row_bytes();
+      gpu_bytes+=dev_q.row_bytes();
     }
     if (_rot) {
       success=success && (dev_quat.alloc(_max_atoms*4,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_quat.row_bytes();
+      gpu_bytes+=dev_quat.row_bytes();
     }
   }
   if (_gpu_nbor) {
     success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-    _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
+    gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
     if (_bonds) {
       success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-      _gpu_bytes+=dev_tag.row_bytes();
+      gpu_bytes+=dev_tag.row_bytes();
     }
   }
 
-  _gpu_bytes+=dev_x.row_bytes();
+  gpu_bytes+=dev_x.row_bytes();
+  if (gpu_bytes>_max_gpu_bytes)
+    _max_gpu_bytes=gpu_bytes;
   
   _allocated=true;  
   return success;
@@ -168,6 +171,7 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
   _x_avail=false;
   _q_avail=false;
   _quat_avail=false;
+  _resized=false;
   _gpu_nbor=gpu_nbor;
   _bonds=bonds;
   _charge=charge;
@@ -182,9 +186,11 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
   
   // Initialize timers for the selected device
   time_pos.init(*dev);
-  time_other.init(*dev);
+  time_q.init(*dev);
+  time_quat.init(*dev);
   time_pos.zero();
-  time_other.zero();
+  time_q.zero();
+  time_quat.zero();
   _time_cast=0.0;
   
   #ifdef GPU_CAST
@@ -252,12 +258,13 @@ void PairGPUAtomT::clear_resize() {
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear() {
-  _gpu_bytes=0;
+  _max_gpu_bytes=0;
   if (!_allocated)
     return;
 
   time_pos.clear();
-  time_other.clear();
+  time_q.clear();
+  time_quat.clear();
   clear_resize();
 
   #ifdef GPU_CAST
diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h
index c4c6b1586f..45e29753b8 100644
--- a/lib/gpu/pair_gpu_atom.h
+++ b/lib/gpu/pair_gpu_atom.h
@@ -64,14 +64,15 @@ class PairGPUAtom {
             UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
   
   /// Check if we have enough device storage and realloc if not
+  /** Returns true if resized with any call during this timestep **/
   inline bool resize(const int nall, bool &success) {
     _nall=nall;
     if (nall>_max_atoms) {
       clear_resize();
       success = success && alloc(nall);
-      return true;
+      _resized=true;
     }
-    return false;
+    return _resized;
   }
   
   /// If already initialized by another LAMMPS style, add fields as necessary
@@ -94,26 +95,42 @@ class PairGPUAtom {
   /// Add copy times to timers
   inline void acc_timers() {
     time_pos.add_to_total();
-    if (_other)
-      time_other.add_to_total();
+    if (_charge)
+      time_q.add_to_total();
+    if (_rot)
+      time_quat.add_to_total();
   }
 
   /// Add copy times to timers
   inline void zero_timers() {
     time_pos.zero();
-    if (_other)
-      time_other.zero();
+    if (_charge)
+      time_q.zero();
+    if (_rot)
+      time_quat.zero();
   }
 
   /// Return the total time for host/device data transfer
+  /** Zeros the total so that the atom times are only included once **/
   inline double transfer_time() {
     double total=time_pos.total_seconds();
-    if (_other) total+=time_other.total_seconds();
+    time_pos.zero_total();
+    if (_charge) {
+      total+=time_q.total_seconds();
+      time_q.zero_total();
+    }
+    if (_rot) {
+      total+=time_q.total_seconds();
+      time_quat.zero_total();
+    }
+    
     return total;
   }
   
   /// Return the total time for data cast/pack
-  inline double cast_time() { return _time_cast; }
+  /** Zeros the time so that atom times are only included once **/
+  inline double cast_time() 
+    { double t=_time_cast; _time_cast=0.0; return t; }
 
   /// Pack LAMMPS atom type constants into matrix and copy to device
   template <class dev_typ, class t1>
@@ -210,7 +227,7 @@ class PairGPUAtom {
 
   /// Signal that we need to transfer atom data for next timestep
   inline void data_unavail()
-    { _x_avail=false; _q_avail=false; _quat_avail=false; }
+    { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
 
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
@@ -322,20 +339,9 @@ class PairGPUAtom {
     }
   }
 
-  /// Copy data other than pos and type to device
-  inline void add_other_data() {
-    if (_other) {
-      time_other.start();
-      if (_charge)
-        add_q_data();
-      if (_rot)
-        add_quat_data();
-      time_other.stop();
-    }
-  }
-  
   /// Return number of bytes used on device
-  inline double gpu_bytes() { return _gpu_bytes; } 
+  inline double max_gpu_bytes() 
+    { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } 
 
   // ------------------------------ DATA ----------------------------------
 
@@ -368,7 +374,7 @@ class PairGPUAtom {
   UCL_D_Vec<int> dev_tag;
 
   /// Device timers
-  UCL_Timer time_pos, time_other;
+  UCL_Timer time_pos, time_q, time_quat;
   
   /// Geryon device
   UCL_Device *dev;
@@ -383,7 +389,7 @@ class PairGPUAtom {
   bool _compiled;
   
   // True if data has been copied to device already
-  bool _x_avail, _q_avail, _quat_avail;
+  bool _x_avail, _q_avail, _quat_avail, _resized;
 
   bool alloc(const int nall);
   
@@ -392,7 +398,7 @@ class PairGPUAtom {
   bool _gpu_nbor, _bonds;
   double _time_cast;
   
-  double _gpu_bytes;
+  double _max_gpu_bytes;
   
   #ifndef USE_OPENCL
   CUDPPConfiguration sort_config;
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 718e9d9ddb..c98c72e04f 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -117,10 +117,11 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, 
+bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
+                          const bool rot, const int nlocal, 
                           const int host_nlocal, const int nall,
-                          PairGPUNbor *nbor, const int maxspecial, 
-                          const bool gpu_nbor, const int gpu_host, 
+                          PairGPUNbor *nbor, const int maxspecial,
+                          const bool gpu_nbor, const int gpu_host,
                           const int max_nbors, const double cell_size,
                           const bool pre_cut) {
   if (!_device_init)
@@ -133,12 +134,14 @@ bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
 
   if (_init_count==0) {
     // Initialize atom and nbor data
-    if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
-                   gpu_nbor && maxspecial>0))
+    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
       return false;
   } else
     atom.add_fields(charge,rot);
 
+  if (!ans.init(ef_nlocal,charge,rot,*gpu))
+    return false;
+
   if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut))
     return false;
@@ -189,20 +192,21 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
 }
 
 template <class numtyp, class acctyp>
-void PairGPUDeviceT::output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
-                                  const double avg_split, 
+void PairGPUDeviceT::output_times(UCL_Timer &time_pair, 
+                                  PairGPUAns<numtyp,acctyp> &ans, 
+                                  PairGPUNbor &nbor, const double avg_split, 
                                   const double max_bytes, FILE *screen) {
   double single[5], times[5];
 
-  single[0]=atom.transfer_time();
+  single[0]=atom.transfer_time()+ans.transfer_time();
   single[1]=nbor.time_nbor.total_seconds();
   single[2]=nbor.time_kernel.total_seconds();
   single[3]=time_pair.total_seconds();
-  single[4]=atom.cast_time();
+  single[4]=atom.cast_time()+ans.cast_time();
 
   MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
-  double my_max_bytes=max_bytes;
+  double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);
@@ -276,15 +280,5 @@ void lmp_clear_device() {
 
 double lmp_gpu_forces(double **f, double **tor, double *eatom,
                       double **vatom, double *virial, double &ecoul) {
-  if (pair_gpu_device.init_count()) {
-    pair_gpu_device.stop_host_timer();
-    pair_gpu_device.gpu->sync();
-    double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
-    pair_gpu_device.atom.get_answers(f,tor);
-    pair_gpu_device.atom.data_unavail();
-
-    return evdw;
-  }
-  return 0.0;
+  return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
 }
-
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 1ef85c78ad..fde23ce2c3 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -53,18 +53,19 @@ class PairGPUDevice {
     * \param cell_size cutoff+skin 
     * \param pre_cut True if cutoff test will be performed in separate kernel
     *                than the force kernel **/
-  bool init(const bool charge, const bool rot, const int nlocal,
-            const int host_nlocal, const int nall, PairGPUNbor *nbor, 
-            const int maxspecial, const bool gpu_nbor, const int gpu_host, 
-            const int max_nbors, const double cell_size, const bool pre_cut);
+  bool init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
+            const int nlocal, const int host_nlocal, const int nall,
+            PairGPUNbor *nbor, const int maxspecial, const bool gpu_nbor,
+            const int gpu_host, const int max_nbors, const double cell_size,
+            const bool pre_cut);
 
   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name,
                     const int first_gpu, const int last_gpu);
 
   /// Output a message with timing information
-  void output_times(UCL_Timer &time_pair, PairGPUNbor &nbor, 
-                    const double avg_split, 
+  void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans, 
+                    PairGPUNbor &nbor, const double avg_split, 
                     const double max_bytes, FILE *screen);
 
   /// Clear all memory on host and device associated with atom and nbor data
@@ -74,7 +75,24 @@ class PairGPUDevice {
   void clear_device();
 
   /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
-  inline void add_ans_object(PairGPUAns *ans) { ans_queue.push(ans); }
+  inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
+    { ans_queue.push(ans); }
+
+  /// Add "answers" (force,energies,etc.) into LAMMPS structures
+  inline double fix_gpu(double **f, double **tor, double *eatom,
+                      double **vatom, double *virial, double &ecoul) {
+    atom.data_unavail();
+    if (ans_queue.empty()==false) {
+      stop_host_timer();
+      double evdw=0.0;
+      while (ans_queue.empty()==false) {
+        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
+        ans_queue.pop();
+      }                                                 
+      return evdw;
+    }
+    return 0.0;
+  }
 
   /// Start timer on host
   inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
@@ -139,7 +157,7 @@ class PairGPUDevice {
   PairGPUNborShared _nbor_shared;
 
  private:
-  std::queue<PairGPUAns *> ans_queue;
+  std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
   int _init_count;
   bool _device_init;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;