From 45971bf7329de2ac9bbae2c5dc9084cb56cae8cf Mon Sep 17 00:00:00 2001
From: "W. Michael Brown" <brownw@ornl.gov>
Date: Mon, 24 Jan 2011 17:33:38 -0500
Subject: [PATCH] Atom data will not be copied multiple times if multiple
 styles need it. Adding back missing extern from nbor.

---
 lib/gpu/pair_gpu_atom.cpp |   6 ++
 lib/gpu/pair_gpu_atom.h   | 161 ++++++++++++++++++++++----------------
 lib/gpu/pair_gpu_nbor.cpp |   6 ++
 3 files changed, 104 insertions(+), 69 deletions(-)
diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp
index 5eb655016c..46c9066e56 100644
--- a/lib/gpu/pair_gpu_atom.cpp
+++ b/lib/gpu/pair_gpu_atom.cpp
@@ -183,6 +183,9 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
   clear();
 
   bool success=true;
+  _x_avail=false;
+  _q_avail=false;
+  _quat_avail=false;
   _gpu_nbor=gpu_nbor;
   _bonds=bonds;
   _charge=charge;
@@ -516,6 +519,9 @@ double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::get_answers(double **f, double **tor) {
+  _x_avail=false;
+  _q_avail=false;
+  _quat_avail=false;
   acctyp *ap=host_ans.begin();
   if (_gpu_nbor) {
     for (int i=0; i<_inum; i++) {
diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h
index 2f816be037..562ca0846d 100644
--- a/lib/gpu/pair_gpu_atom.h
+++ b/lib/gpu/pair_gpu_atom.h
@@ -88,7 +88,10 @@ class PairGPUAtom {
   /// If already initialized by another LAMMPS style, add fields as necessary
   /** \param rot True if atom storage needs quaternions
     * \param gpu_nbor True if neighboring will be performed on device **/
-  bool add_fields(const bool charge, const bool rot);  
+  bool add_fields(const bool charge, const bool rot);
+  
+  /// True if charge data is available for kernels
+  bool charge_avail() const { return _charge; }
 
   /// Only free matrices of length inum or nall for resizing
   void clear_resize();
@@ -223,41 +226,46 @@ class PairGPUAtom {
 
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
-    double t=MPI_Wtime();
-    #ifdef GPU_CAST
-    memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
-    memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
-    #else
-    numtyp *_write_loc=host_x.begin();
-    for (int i=0; i<_nall; i++) {
-      *_write_loc=host_ptr[i][0];
-      _write_loc++;
-      *_write_loc=host_ptr[i][1];
-      _write_loc++;
-      *_write_loc=host_ptr[i][2];
-      _write_loc++;
-      *_write_loc=host_type[i];
-      _write_loc++;
+    if (_x_avail==false) {
+      double t=MPI_Wtime();
+      #ifdef GPU_CAST
+      memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
+      memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
+      #else
+      numtyp *_write_loc=host_x.begin();
+      for (int i=0; i<_nall; i++) {
+        *_write_loc=host_ptr[i][0];
+        _write_loc++;
+        *_write_loc=host_ptr[i][1];
+        _write_loc++;
+        *_write_loc=host_ptr[i][2];
+        _write_loc++;
+        *_write_loc=host_type[i];
+        _write_loc++;
+      }
+      #endif
+      _time_cast+=MPI_Wtime()-t;
     }
-    #endif
-    _time_cast+=MPI_Wtime()-t;
-  }      
+  }
 
   /// Copy positions and types to device asynchronously
   /** Copies nall() elements **/
   inline void add_x_data(double **host_ptr, int *host_type) { 
     time_pos.start();
-    #ifdef GPU_CAST
-    ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
-    ucl_copy(dev_type_cast,host_type_cast,_nall,true);
-    int block_size=64;
-    int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
-    k_cast_x.set_size(GX,block_size);
-    k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), 
-                 &_nall);
-    #else
-    ucl_copy(dev_x,host_x,_nall*4,true);
-    #endif
+    if (_x_avail==false) {
+      #ifdef GPU_CAST
+      ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
+      ucl_copy(dev_type_cast,host_type_cast,_nall,true);
+      int block_size=64;
+      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
+      k_cast_x.set_size(GX,block_size);
+      k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), 
+                   &_nall);
+      #else
+      ucl_copy(dev_x,host_x,_nall*4,true);
+      #endif
+      _x_avail=true;
+    }
     time_pos.stop();
   }
 
@@ -267,63 +275,75 @@ class PairGPUAtom {
     add_x_data(host_ptr,host_type);
   }
 
-  /// Cast charges to write buffer
+  /// Cast charges to write buffer (skipped while _q_avail is set)
   template<class cpytyp>
   inline void cast_q_data(cpytyp *host_ptr) {
-    double t=MPI_Wtime();
-    if (dev->device_type()==UCL_CPU) {
-      if (sizeof(numtyp)==sizeof(double)) {
-        host_q.view((numtyp*)host_ptr,_nall,*dev);
-        dev_q.view(host_q);
-      } else
-        for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
-    } else {
-      if (sizeof(numtyp)==sizeof(double))
-        memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
-      else
-        for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
+    if (_q_avail==false) {
+      double t=MPI_Wtime();
+      if (dev->device_type()==UCL_CPU) {
+        if (sizeof(numtyp)==sizeof(double)) {
+          host_q.view((numtyp*)host_ptr,_nall,*dev);
+          dev_q.view(host_q);
+        } else
+          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
+      } else {
+        if (sizeof(numtyp)==sizeof(double))
+          memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
+        else
+          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
+      }
+      _time_cast+=MPI_Wtime()-t;
     }
-    _time_cast+=MPI_Wtime()-t;
   }
 
-  /// Copy charges to device asynchronously
+  /// Copy charges to device asynchronously (skipped while _q_avail is set)
   inline void add_q_data() {
-    ucl_copy(dev_q,host_q,_nall,true);
+    if (_q_avail==false) {
+      ucl_copy(dev_q,host_q,_nall,true);
+      _q_avail=true;
+    }
   }
 
-  /// Cast quaternions to write buffer
+  /// Cast quaternions to write buffer (skipped while _quat_avail is set)
   template<class cpytyp>
   inline void cast_quat_data(cpytyp *host_ptr) {
-    double t=MPI_Wtime();
-    if (dev->device_type()==UCL_CPU) {
-      if (sizeof(numtyp)==sizeof(double)) {
-        host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
-        dev_quat.view(host_quat);
-      } else
-        for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
-    } else {
-      if (sizeof(numtyp)==sizeof(double))
-        memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
-      else
-        for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
+    if (_quat_avail==false) {
+      double t=MPI_Wtime();
+      if (dev->device_type()==UCL_CPU) {
+        if (sizeof(numtyp)==sizeof(double)) {
+          host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
+          dev_quat.view(host_quat);
+        } else
+          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
+      } else {
+        if (sizeof(numtyp)==sizeof(double))
+          memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
+        else
+          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
+      }
+      _time_cast+=MPI_Wtime()-t;
     }
-    _time_cast+=MPI_Wtime()-t;
   }
 
-  /// Copy quaternions to device
+  /// Copy quaternions to device (skipped while _quat_avail is set)
   /** Copies nall()*4 elements **/
   inline void add_quat_data() {
-    ucl_copy(dev_quat,host_quat,_nall*4,true);
+    if (_quat_avail==false) {
+      ucl_copy(dev_quat,host_quat,_nall*4,true);
+      _quat_avail=true;
+    }
   }
 
-  /// Copy data other than pos and data to device
+  /// Copy data other than pos and type to device
   inline void add_other_data() {
-    time_other.start();
-    if (_charge)
-      add_q_data();
-    if (_rot)
-      add_quat_data();
-    time_other.stop();
+    if (_other) {
+      time_other.start();
+      if (_charge)
+        add_q_data();
+      if (_rot)
+        add_quat_data();
+      time_other.stop();
+    }
   }
   
   /// Return number of bytes used on device
@@ -401,6 +421,9 @@ class PairGPUAtom {
   #endif
 
   bool _compiled;
+  
+  // Set once x/q/quat data is sent to the device; cleared in init() and get_answers()
+  bool _x_avail, _q_avail, _quat_avail;
 
   bool alloc(const int inum, const int nall);
   
diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp
index 0a71761958..648a6c2e2e 100644
--- a/lib/gpu/pair_gpu_nbor.cpp
+++ b/lib/gpu/pair_gpu_nbor.cpp
@@ -359,3 +359,9 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
     ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);
   time_nbor.stop();
 }
+
+template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>
+     (const int inum, const int host_inum, const int nall,
+      PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *boxlo, double *boxhi,
+      int *, int **, int **, bool &success, int &mn);
+