Hybrid working, but not fully tested.

This commit is contained in:
W. Michael Brown
2011-01-27 17:52:08 -05:00
parent b9c7e2bb96
commit c1c9fd47de
25 changed files with 269 additions and 185 deletions

View File

@ -118,7 +118,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H)
$(CUDR) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp

View File

@ -68,7 +68,7 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H)
$(OCL) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp

View File

@ -23,11 +23,13 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
AtomicGPUMemoryT::~AtomicGPUMemory() {
delete ans;
delete nbor;
}
@ -54,7 +56,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,false,nlocal,host_nlocal,nall,nbor,maxspecial,
if (!device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,maxspecial,
gpu_nbor,_gpu_host,max_nbors,cell_size,false))
return false;
ucl_device=device->gpu;
@ -74,7 +76,7 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
pos_tex.bind_float(atom->dev_x,4);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return true;
}
@ -84,7 +86,7 @@ void AtomicGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes,
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
screen);
if (_compiled) {
@ -118,7 +120,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
@ -149,7 +151,7 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
@ -173,7 +175,7 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time,nbor->gpu_nbor());
atom->inum(inum);
ans->inum(inum);
host_start=inum;
if (ago==0) {
@ -187,7 +189,8 @@ void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
atom->add_x_data(host_x,host_type);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -210,7 +213,7 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(ago,inum_full);
atom->inum(inum);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
@ -227,7 +230,8 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
}
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_nbor.begin();

View File

@ -48,8 +48,9 @@ class AtomicGPUMemory {
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success))
if (atom->resize(nall, success))
pos_tex.bind_float(atom->dev_x,4);
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
@ -92,6 +93,7 @@ class AtomicGPUMemory {
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
/// Zero timers
@ -99,6 +101,7 @@ class AtomicGPUMemory {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
@ -148,6 +151,9 @@ class AtomicGPUMemory {
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------

View File

@ -23,11 +23,13 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
ChargeGPUMemoryT::~ChargeGPUMemory() {
delete ans;
delete nbor;
}
@ -54,8 +56,8 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(true,false,nlocal,host_nlocal,nall,nbor,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
if (!device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,maxspecial,
gpu_nbor,_gpu_host,max_nbors,cell_size,false))
return false;
ucl_device=device->gpu;
atom=&device->atom;
@ -75,7 +77,7 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return true;
}
@ -85,7 +87,7 @@ void ChargeGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,*nbor,avg_split,_max_bytes+_max_an_bytes,
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
screen);
if (_compiled) {
@ -119,7 +121,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
@ -150,7 +152,7 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
@ -175,7 +177,7 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
ans->inum(inum);
host_start=inum;
if (ago==0) {
@ -188,10 +190,11 @@ void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_other_data();
atom->add_q_data();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -215,7 +218,7 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full,
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(ago,inum_full);
atom->inum(inum);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
@ -232,10 +235,11 @@ int * ChargeGPUMemoryT::compute(const int ago, const int inum_full,
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_other_data();
atom->add_q_data();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_nbor.begin();

View File

@ -48,10 +48,11 @@ class ChargeGPUMemory {
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success)) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
@ -94,6 +95,7 @@ class ChargeGPUMemory {
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
/// Zero timers
@ -101,6 +103,7 @@ class ChargeGPUMemory {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
@ -151,6 +154,10 @@ class ChargeGPUMemory {
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data

View File

@ -122,9 +122,9 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,15 +133,15 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();

View File

@ -133,9 +133,9 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -144,8 +144,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
@ -153,8 +153,8 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
}

View File

@ -133,9 +133,9 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -144,8 +144,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_smooth);
@ -153,8 +153,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_smooth);
}

View File

@ -137,9 +137,9 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -147,8 +147,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
@ -157,8 +157,8 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);

View File

@ -216,9 +216,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/BX));
int stride=gbm.nbor->nbor_pitch();
int ainum=gbm.atom->inum();
int ainum=gbm.ans->inum();
int anall=gbm.atom->nall();
if (gbm.multiple_forms) {
@ -237,11 +237,11 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
&stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.atom->inum()) {
if (gbm.last_ellipse==gbm.ans->inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
@ -254,9 +254,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
gbm.last_ellipse)/BX));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
gbm.time_kernel2.stop();
@ -266,13 +266,13 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(),
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall);
gbm.time_gayberne2.stop();
} else {
gbm.atom->dev_ans.zero();
gbm.atom->dev_engv.zero();
gbm.ans->dev_ans.zero();
gbm.ans->dev_engv.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
@ -284,29 +284,29 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.atom->inum()) {
if (gbm.last_ellipse<gbm.ans->inum()) {
if (gbm.shared_types) {
GBMF.k_lj_fast.set_size(GX,BX);
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
&stride, &gbm.nbor->dev_packed.begin(),
&gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
} else {
GBMF.k_lj.set_size(GX,BX);
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm._lj_types,
&gbm.gamma_upsilon_mu.begin(), &stride,
&gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
}
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
@ -315,8 +315,8 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(), &ainum,
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&stride, &gbm.ans->dev_ans.begin(), &ainum,
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall);
gbm.time_gayberne.stop();
}
@ -342,7 +342,7 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
gbm.atom->inum(inum);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
@ -361,10 +361,11 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
gbm.atom->add_x_data(host_x,host_type);
}
gbm.atom->add_other_data();
gbm.atom->add_quat_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return gbm.nbor->host_nbor.begin();
}
@ -399,7 +400,7 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
int ago=gbm.hd_balancer.ago_first(f_ago);
int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time,gbm.nbor->gpu_nbor());
gbm.atom->inum(inum);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
@ -419,10 +420,11 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
gbm.atom->add_other_data();
gbm.atom->add_quat_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return list;
}

View File

@ -32,12 +32,14 @@ template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
_max_bytes(0.0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor;
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
delete ans;
delete nbor;
}
@ -70,7 +72,7 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor,
if (!device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,gpu_nbor,
_gpu_host,max_nbors,cell_size,true))
return false;
ucl_device=device->gpu;
@ -187,9 +189,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
}
if (multiple_forms)
atom->dev_ans.zero();
ans->dev_ans.zero();
_max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
// Memory for ilist ordered by particle type
return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
@ -212,7 +214,7 @@ void GB_GPU_MemoryT::clear() {
acc_timers();
double single[6], times[6];
single[0]=atom->transfer_time();
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
nbor->time_kernel.total_seconds();
@ -221,7 +223,7 @@ void GB_GPU_MemoryT::clear() {
single[4]=time_pair.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time();
single[5]=atom->cast_time()+ans->cast_time();
MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
@ -229,7 +231,7 @@ void GB_GPU_MemoryT::clear() {
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
gamma_upsilon_mu.row_bytes();
gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());

View File

@ -49,9 +49,10 @@ class GB_GPU_Memory {
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
atom->resize(inum, nall, success);
if (multiple_forms) atom->dev_ans.zero();
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
atom->resize(nall, success);
ans->resize(inum, success);
if (multiple_forms) ans->dev_ans.zero();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
@ -74,7 +75,7 @@ class GB_GPU_Memory {
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
@ -104,6 +105,7 @@ class GB_GPU_Memory {
time_pair.add_to_total();
}
atom->acc_timers();
ans->acc_timers();
}
/// Accumulate timers
@ -117,6 +119,7 @@ class GB_GPU_Memory {
time_pair.zero();
}
atom->zero_timers();
ans->zero_timers();
}
// -------------------------- DEVICE DATA -------------------------
@ -168,6 +171,10 @@ class GB_GPU_Memory {
int last_ellipse, max_last_ellipse;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data

View File

@ -25,6 +25,7 @@
#define NVD_TIMER_H
#include "nvd_macros.h"
#include "nvd_device.h"
namespace ucl_cudadr {
@ -66,12 +67,23 @@ class UCL_Timer {
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CU_SAFE_CALL(cuEventSynchronize(start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero() {
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
}
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()

View File

@ -25,6 +25,7 @@
#define OCL_TIMER_H
#include "ocl_macros.h"
#include "ocl_device.h"
namespace ucl_opencl {
@ -67,10 +68,21 @@ class UCL_Timer {
/// Stop timing on default command queue
inline void stop() { clEnqueueMarker(_cq,&stop_event); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()

View File

@ -122,9 +122,9 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,15 +133,15 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();

View File

@ -122,9 +122,9 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,15 +133,15 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();

View File

@ -134,9 +134,9 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -145,8 +145,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e);
@ -154,8 +154,8 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e);
}

View File

@ -132,9 +132,9 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -143,8 +143,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
@ -152,8 +152,8 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
}

View File

@ -138,11 +138,10 @@ void PairGPUAnsT::clear() {
if (!_allocated)
return;
time_pos.clear();
time_other.clear();
time_answer.clear();
clear_resize();
_inum=0;
_ilist=NULL;
_eflag=false;
_vflag=false;
}
@ -181,7 +180,6 @@ void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
time_answer.stop();
dev->add_answer_object(this);
}
template <class numtyp, class acctyp>
@ -199,7 +197,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
return 0.0;
double evdwl=0.0;
if (_gpu_nbor) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
@ -279,7 +277,7 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
double evdwl=0.0;
double _ecoul=0.0;
if (_gpu_nbor) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
@ -359,11 +357,8 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
template <class numtyp, class acctyp>
void PairGPUAnsT::get_answers(double **f, double **tor) {
_x_avail=false;
_q_avail=false;
_quat_avail=false;
acctyp *ap=host_ans.begin();
if (_gpu_nbor) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;

View File

@ -58,14 +58,12 @@ class PairGPUAns {
bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
/// Check if we have enough device storage and realloc if not
inline bool resize(const int inum, bool &success) {
inline void resize(const int inum, bool &success) {
_inum=inum;
if (inum>_max_local) {
clear_resize();
success = success && alloc(inum);
return true;
}
return false;
}
/// If already initialized by another LAMMPS style, add fields as necessary
@ -123,6 +121,16 @@ class PairGPUAns {
/// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor);
/// Copy device results into LAMMPS structures; returns accumulated evdwl
/** Stops (syncs) the asynchronous answer-copy timer, folds the
  * energy/virial terms into eatom/vatom/virial/ecoul via energy_virial(),
  * and adds forces (and torques, when present) into f/tor via
  * get_answers(f,tor).  Host-side unpack time is measured with
  * MPI_Wtime() and accumulated into _time_cast.
  * NOTE(review): assumes copy_answers() was issued earlier this timestep
  * so host_ans/host_engv hold valid device results -- confirm caller. **/
inline double get_answers(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
time_answer.sync_stop();
double ts=MPI_Wtime();
double evdw=energy_virial(eatom,vatom,virial,ecoul);
get_answers(f,tor);
_time_cast+=MPI_Wtime()-ts;
return evdw;
}
// ------------------------------ DATA ----------------------------------
/// Force and possibly torque

View File

@ -29,7 +29,8 @@ __win_sort _win_sort;
#endif
template <class numtyp, class acctyp>
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false) {
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
_max_gpu_bytes(0) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;
@ -108,7 +109,7 @@ bool PairGPUAtomT::alloc(const int nall) {
// --------------------------- Device allocations
_gpu_bytes=0;
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1);
@ -126,7 +127,7 @@ bool PairGPUAtomT::alloc(const int nall) {
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
_gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
@ -134,25 +135,27 @@ bool PairGPUAtomT::alloc(const int nall) {
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_q.row_bytes();
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_quat.row_bytes();
gpu_bytes+=dev_quat.row_bytes();
}
}
if (_gpu_nbor) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_tag.row_bytes();
gpu_bytes+=dev_tag.row_bytes();
}
}
_gpu_bytes+=dev_x.row_bytes();
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
@ -168,6 +171,7 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
_x_avail=false;
_q_avail=false;
_quat_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
@ -182,9 +186,11 @@ bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
// Initialize timers for the selected device
time_pos.init(*dev);
time_other.init(*dev);
time_q.init(*dev);
time_quat.init(*dev);
time_pos.zero();
time_other.zero();
time_q.zero();
time_quat.zero();
_time_cast=0.0;
#ifdef GPU_CAST
@ -252,12 +258,13 @@ void PairGPUAtomT::clear_resize() {
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
_gpu_bytes=0;
_max_gpu_bytes=0;
if (!_allocated)
return;
time_pos.clear();
time_other.clear();
time_q.clear();
time_quat.clear();
clear_resize();
#ifdef GPU_CAST

View File

@ -64,14 +64,15 @@ class PairGPUAtom {
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) {
_nall=nall;
if (nall>_max_atoms) {
clear_resize();
success = success && alloc(nall);
return true;
_resized=true;
}
return false;
return _resized;
}
/// If already initialized by another LAMMPS style, add fields as necessary
@ -94,26 +95,42 @@ class PairGPUAtom {
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
if (_other)
time_other.add_to_total();
if (_charge)
time_q.add_to_total();
if (_rot)
time_quat.add_to_total();
}
/// Add copy times to timers
inline void zero_timers() {
time_pos.zero();
if (_other)
time_other.zero();
if (_charge)
time_q.zero();
if (_rot)
time_quat.zero();
}
/// Return the total time for host/device data transfer
/** Zeros each timer's running total so that the atom transfer times are
  * only included once per output interval. **/
inline double transfer_time() {
  double total=time_pos.total_seconds();
  time_pos.zero_total();
  if (_charge) {
    total+=time_q.total_seconds();
    time_q.zero_total();
  }
  if (_rot) {
    // Fix: previously added time_q.total_seconds() here, which
    // double-counted the charge transfer and never reported the
    // quaternion transfer time.
    total+=time_quat.total_seconds();
    time_quat.zero_total();
  }
  return total;
}
/// Return the total time for data cast/pack
inline double cast_time() { return _time_cast; }
/** Zeros the time so that atom times are only included once **/
inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; }
/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>
@ -210,7 +227,7 @@ class PairGPUAtom {
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; }
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
/// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) {
@ -322,20 +339,9 @@ class PairGPUAtom {
}
}
/// Copy data other than pos and type to device
inline void add_other_data() {
if (_other) {
time_other.start();
if (_charge)
add_q_data();
if (_rot)
add_quat_data();
time_other.stop();
}
}
/// Return number of bytes used on device
inline double gpu_bytes() { return _gpu_bytes; }
/// Return the high-water mark of device bytes used and reset it
/** Zeros the stored maximum so each call reports the peak allocation
  * since the previous call (consumed by the timing/statistics output). **/
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
// ------------------------------ DATA ----------------------------------
@ -368,7 +374,7 @@ class PairGPUAtom {
UCL_D_Vec<int> dev_tag;
/// Device timers
UCL_Timer time_pos, time_other;
UCL_Timer time_pos, time_q, time_quat;
/// Geryon device
UCL_Device *dev;
@ -383,7 +389,7 @@ class PairGPUAtom {
bool _compiled;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail;
bool _x_avail, _q_avail, _quat_avail, _resized;
bool alloc(const int nall);
@ -392,7 +398,7 @@ class PairGPUAtom {
bool _gpu_nbor, _bonds;
double _time_cast;
double _gpu_bytes;
double _max_gpu_bytes;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;

View File

@ -117,10 +117,11 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
}
template <class numtyp, class acctyp>
bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial,
const bool gpu_nbor, const int gpu_host,
PairGPUNbor *nbor, const int maxspecial,
const bool gpu_nbor, const int gpu_host,
const int max_nbors, const double cell_size,
const bool pre_cut) {
if (!_device_init)
@ -133,12 +134,14 @@ bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
gpu_nbor && maxspecial>0))
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
return false;
} else
atom.add_fields(charge,rot);
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return false;
if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut))
return false;
@ -189,20 +192,21 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
const double avg_split,
void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes, FILE *screen) {
double single[5], times[5];
single[0]=atom.transfer_time();
single[0]=atom.transfer_time()+ans.transfer_time();
single[1]=nbor.time_nbor.total_seconds();
single[2]=nbor.time_kernel.total_seconds();
single[3]=time_pair.total_seconds();
single[4]=atom.cast_time();
single[4]=atom.cast_time()+ans.cast_time();
MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes;
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
@ -276,15 +280,5 @@ void lmp_clear_device() {
double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
if (pair_gpu_device.init_count()) {
pair_gpu_device.stop_host_timer();
pair_gpu_device.gpu->sync();
double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
pair_gpu_device.atom.get_answers(f,tor);
pair_gpu_device.atom.data_unavail();
return evdw;
}
return 0.0;
return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}

View File

@ -53,18 +53,19 @@ class PairGPUDevice {
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel **/
bool init(const bool charge, const bool rot, const int nlocal,
const int host_nlocal, const int nall, PairGPUNbor *nbor,
const int maxspecial, const bool gpu_nbor, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut);
bool init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial, const bool gpu_nbor,
const int gpu_host, const int max_nbors, const double cell_size,
const bool pre_cut);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);
/// Output a message with timing information
void output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
const double avg_split,
void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes, FILE *screen);
/// Clear all memory on host and device associated with atom and nbor data
@ -74,7 +75,24 @@ class PairGPUDevice {
void clear_device();
/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
inline void add_ans_object(PairGPUAns *ans) { ans_queue.push(ans); }
inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
{ ans_queue.push(ans); }
/// Add "answers" (force,energies,etc.) into LAMMPS structures
/** Drains every queued answer object, accumulating the van der Waals
  * energy returned by each; atom data is always marked stale for the
  * next timestep, and the host timer is stopped only when answers are
  * actually pending. **/
inline double fix_gpu(double **f, double **tor, double *eatom,
                      double **vatom, double *virial, double &ecoul) {
  atom.data_unavail();
  if (ans_queue.empty())
    return 0.0;
  stop_host_timer();
  double energy=0.0;
  do {
    energy+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
    ans_queue.pop();
  } while (!ans_queue.empty());
  return energy;
}
/// Start timer on host
inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
@ -139,7 +157,7 @@ class PairGPUDevice {
PairGPUNborShared _nbor_shared;
private:
std::queue<PairGPUAns *> ans_queue;
std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
int _init_count;
bool _device_init;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;