From 3803804d3806aaf643a1bc06c19c6e8aa887e167 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown" <brownw@ornl.gov>
Date: Sat, 3 Dec 2011 21:52:19 -0500
Subject: [PATCH] Removing the need for 2 allocations for fp on the host.

---
 lib/gpu/lal_eam.cpp      | 29 +++---------
 lib/gpu/lal_eam.h        | 25 ++---------
 lib/gpu/lal_eam_ext.cpp  | 25 ++++++-----
 src/GPU/pair_eam_gpu.cpp | 97 +++++++++++++++++++++++++---------------
 src/GPU/pair_eam_gpu.h   |  6 ++-
 src/MANYBODY/pair_eam.h  |  4 +-
 6 files changed, 93 insertions(+), 93 deletions(-)
diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp
index 7b76e6893e..d95ddc1a98 100644
--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@@ -236,7 +236,7 @@ void EAMT::compute(const int f_ago, const int inum_full,
                    const bool eflag, const bool vflag,
                    const bool eatom, const bool vatom,
                    int &host_start, const double cpu_time,
-                   bool &success, double *fp) {
+                   bool &success, void **fp_ptr) {
   this->acc_timers();
   
   if (this->device->time_device()) {
@@ -277,7 +277,8 @@ void EAMT::compute(const int f_ago, const int inum_full,
       dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);
     
     fp_tex.bind_float(dev_fp,1);
-  }  
+  }
+  *fp_ptr=host_fp.begin();  
 
   // -----------------------------------------------------------------
 
@@ -296,14 +297,6 @@ void EAMT::compute(const int f_ago, const int inum_full,
   time_fp1.start();
   ucl_copy(host_fp,dev_fp,false);
   time_fp1.stop();
-  
-  double t = MPI_Wtime();
-  numtyp *ap=host_fp.begin();
-  for (int i=0; i<inum; i++) {
-    fp[i]=*ap;
-    ap++;
-  }
-  this->atom->add_cast_time(MPI_Wtime() - t);
 }
 
 // ---------------------------------------------------------------------------
@@ -318,7 +311,7 @@ int** EAMT::compute(const int ago, const int inum_full,
                     const bool vatom, int &host_start,
                     int **ilist, int **jnum,
                     const double cpu_time, bool &success,
-                    double *fp, int &inum) {
+                    int &inum, void **fp_ptr) {
   this->acc_timers();
   
   if (this->device->time_device()) {
@@ -361,6 +354,7 @@ int** EAMT::compute(const int ago, const int inum_full,
     
     fp_tex.bind_float(dev_fp,1);
   }      
+  *fp_ptr=host_fp.begin();  
 
   // -----------------------------------------------------------------
 
@@ -384,14 +378,6 @@ int** EAMT::compute(const int ago, const int inum_full,
   ucl_copy(host_fp,dev_fp,false);
   time_fp1.stop();
   
-  double t = MPI_Wtime();
-  numtyp *ap=host_fp.begin();
-  for (int i=0; i<inum; i++) {
-    fp[i]=*ap;
-    ap++;
-  }
-  this->atom->add_cast_time(MPI_Wtime() - t);
-  
   return this->nbor->host_jlist.begin()-host_start;
 }
 
@@ -400,10 +386,9 @@ int** EAMT::compute(const int ago, const int inum_full,
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::compute2(int *ilist, const bool eflag, const bool vflag,
-                    const bool eatom, const bool vatom, double *host_fp) {
-  time_fp2.start();
-  this->cast_fp_data(host_fp);
+                    const bool eatom, const bool vatom) {
   this->hd_balancer.start_timer();
+  time_fp2.start();
   this->add_fp_data();
   time_fp2.stop();
   
diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h
index 100d850cd7..c07297da7c 100644
--- a/lib/gpu/lal_eam.h
+++ b/lib/gpu/lal_eam.h
@@ -47,24 +47,6 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *_screen);
   
-  // Cast fp to write buffer
-//  template<class cpytyp>
-  inline void cast_fp_data(double *host_ptr) {
-    int nall = this->atom->nall();
-    if (this->ucl_device->device_type()==UCL_CPU) {
-      if (sizeof(numtyp)==sizeof(double)) {
-        host_fp.view((numtyp*)host_ptr,nall,*(this->ucl_device));
-        dev_fp.view(host_fp);
-      } else
-        for (int i=0; i<nall; i++) host_fp[i]=host_ptr[i];
-    } else {
-      if (sizeof(numtyp)==sizeof(double))
-        memcpy(host_fp.begin(),host_ptr,nall*sizeof(numtyp));
-      else
-        for (int i=0; i<nall; i++) host_fp[i]=host_ptr[i];
-    }
-  }
-
   // Copy charges to device asynchronously
   inline void add_fp_data() {
     ucl_copy(dev_fp,host_fp,this->atom->nall(),true);
@@ -85,7 +67,8 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
                double **host_x, int *host_type, int *ilist, int *numj,
                int **firstneigh, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
-               const double cpu_time, bool &success, double *fp);
+               const double cpu_time, bool &success,
+               void **fp_ptr);
                
   /// Pair loop with device neighboring
   int** compute(const int ago, const int inum_full, const int nall,
@@ -94,11 +77,11 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
                 int **special, const bool eflag, const bool vflag, 
                 const bool eatom, const bool vatom, int &host_start, 
                 int **ilist, int **numj, const double cpu_time, bool &success,
-                double *fp, int &inum);
+                int &inum, void **fp_ptr);
 
   /// Pair loop with host neighboring
   void compute2(int *ilist, const bool eflag, const bool vflag,
-                    const bool eatom, const bool vatom, double *host_fp);
+                    const bool eatom, const bool vatom);
   
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Kernel k_energy;
diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp
index 5add042b83..0b4b155964 100644
--- a/lib/gpu/lal_eam_ext.cpp
+++ b/lib/gpu/lal_eam_ext.cpp
@@ -35,7 +35,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
                  int nrho, int nz2r, int nfrho, int nr, 
                  const int nlocal, const int nall, const int max_nbors, 
                  const int maxspecial, const double cell_size, 
-                 int &gpu_mode, FILE *screen) {
+                 int &gpu_mode, FILE *screen, int &fp_size) {
   EAMMF.clear();
   gpu_mode=EAMMF.device->gpu_mode();
   double gpu_split=EAMMF.device->particle_split();
@@ -49,6 +49,8 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
   if (gpu_split != 1.0) 
     return -8;
     
+  fp_size=sizeof(PRECISION);
+    
   EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu);
 
   bool message=false;
@@ -114,23 +116,24 @@ int ** eam_gpu_compute_energy_n(const int ago, const int inum_full,
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum,  const double cpu_time,
-                         bool &success, double *host_fp, double *boxlo,
-                         double *prd, int &inum) {
+                         bool &success, double *boxlo,
+                         double *prd, int &inum, void **fp_ptr) {
   return EAMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
-                        host_fp, inum);
+                        inum, fp_ptr);
 }  
 
 void eam_gpu_compute_energy(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
-                      const double cpu_time, bool &success, double *host_fp,
-                      const int nlocal, double *boxlo, double *prd) {
+                      const double cpu_time, bool &success,
+                      const int nlocal, double *boxlo, double *prd, 
+                      void **fp_ptr) {
   EAMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-                host_fp);
+                fp_ptr);
 }
 
 void eam_gpu_compute_n(const int ago, const int inum_full,
@@ -139,18 +142,18 @@ void eam_gpu_compute_n(const int ago, const int inum_full,
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum,  const double cpu_time,
-                         bool &success, double *host_fp, double *boxlo,
+                         bool &success, double *boxlo,
                          double *prd, int inum) {
-  EAMMF.compute2(NULL, eflag, vflag, eatom, vatom, host_fp);
+  EAMMF.compute2(NULL, eflag, vflag, eatom, vatom);
 }  
 			
 void eam_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
-                      const double cpu_time, bool &success, double *host_fp,
+                      const double cpu_time, bool &success,
                       const int nlocal, double *boxlo, double *prd) {
-  EAMMF.compute2(ilist, eflag, vflag, eatom, vatom, host_fp);
+  EAMMF.compute2(ilist, eflag, vflag, eatom, vatom);
 }
 
 double eam_gpu_bytes() {
diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp
index 3fefffbb2b..c68fe9e43c 100644
--- a/src/GPU/pair_eam_gpu.cpp
+++ b/src/GPU/pair_eam_gpu.cpp
@@ -33,9 +33,6 @@
 
 using namespace LAMMPS_NS;
 
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-
 #define MAXLINE 1024
 
 // External functions from cuda library for atom decomposition
@@ -49,7 +46,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
                  int nz2r, int nfrho, int nr,
                  const int nlocal, const int nall, const int max_nbors, 
                  const int maxspecial, const double cell_size, 
-                 int &gpu_mode, FILE *screen);
+                 int &gpu_mode, FILE *screen, int &fp_size);
 void eam_gpu_clear();
 int** eam_gpu_compute_energy_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
@@ -57,27 +54,28 @@ int** eam_gpu_compute_energy_n(const int ago, const int inum_full,
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum,  const double cpu_time,
-                         bool &success, double *host_fp, double *boxlo,
-                         double *prd, int &inum);
+                         bool &success, double *boxlo,
+			       double *prd, int &inum, void **fp_ptr);
 void eam_gpu_compute_energy(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
-                      const double cpu_time, bool &success, double *host_fp,
-                      const int nlocal, double *boxlo, double *prd);
+                      const double cpu_time, bool &success,
+			    const int nlocal, double *boxlo, double *prd,
+			    void **fp_ptr);
 void eam_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
                          double *sublo, double *subhi, int *tag, int **nspecial, 
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum,  const double cpu_time,
-                         bool &success, double *host_fp, double *boxlo,
+                         bool &success, double *boxlo,
                          double *prd, int inum);
 void eam_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
-                      const double cpu_time, bool &success, double *host_fp,
+                      const double cpu_time, bool &success,
                       const int nlocal, double *boxlo, double *prd);
 double eam_gpu_bytes();
 
@@ -117,28 +115,9 @@ void PairEAMGPU::compute(int eflag, int vflag)
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
  
-  // grow energy and fp arrays if necessary
-  // need to be atom->nmax in length
-    
-  if (atom->nmax > nmax) {
-    memory->destroy(rho);
-    memory->destroy(fp);
-    nmax = atom->nmax;
-    memory->create(rho,nmax,"pair:rho");
-    memory->create(fp,nmax,"pair:fp");
-  }
-
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
 
-  // zero out density
-
-  if (newton_pair) {
-    m = nlocal + atom->nghost;
-    for (i = 0; i < m; i++) rho[i] = 0.0; 
-  } else for (i = 0; i < nlocal; i++) rho[i] = 0.0; 
-
-  
   // compute density on each atom on GPU
 
   int nall = atom->nlocal + atom->nghost;  
@@ -154,8 +133,8 @@ void PairEAMGPU::compute(int eflag, int vflag)
              atom->tag, atom->nspecial, atom->special,
              eflag, vflag, eflag_atom, vflag_atom,
              host_start, &ilist, &numneigh, cpu_time,
-             success, fp, domain->boxlo, 
-             domain->prd, inum_dev);
+             success, domain->boxlo, 
+					  domain->prd, inum_dev, &fp_pinned);
   } else { // gpu_mode == GPU_FORCE
     inum = list->inum;
     ilist = list->ilist;
@@ -163,8 +142,9 @@ void PairEAMGPU::compute(int eflag, int vflag)
     firstneigh = list->firstneigh;
     eam_gpu_compute_energy(neighbor->ago, inum, nall, atom->x, atom->type,
 		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
-		    vflag_atom, host_start, cpu_time, success, fp,
-		    atom->nlocal, domain->boxlo, domain->prd);
+		    vflag_atom, host_start, cpu_time, success,
+			   atom->nlocal, domain->boxlo, domain->prd, 
+			   &fp_pinned);
   }
     
   if (!success)
@@ -189,12 +169,12 @@ void PairEAMGPU::compute(int eflag, int vflag)
 				   atom->tag, atom->nspecial, atom->special,
 				   eflag, vflag, eflag_atom, vflag_atom,
 				   host_start, &ilist, &numneigh, cpu_time,
-				   success, fp, domain->boxlo, 
+				   success, domain->boxlo, 
 				   domain->prd, inum_dev);
   } else { // gpu_mode == GPU_FORCE
     eam_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
 		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
-		    vflag_atom, host_start, cpu_time, success, fp,
+		    vflag_atom, host_start, cpu_time, success,
 		    atom->nlocal, domain->boxlo, domain->prd);
   }
   
@@ -404,12 +384,13 @@ void PairEAMGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
+  int fp_size;
   int success = eam_gpu_init(atom->ntypes+1, cutforcesq,
           type2rhor, type2z2r, type2frho,
           rhor_spline, z2r_spline, frho_spline,
           rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, atom->nlocal, 
           atom->nlocal+atom->nghost, 300, maxspecial,
-          cell_size, gpu_mode, screen);
+			     cell_size, gpu_mode, screen, fp_size);
   GPU_EXTRA::check_flag(success,error,world);
   
   if (gpu_mode == GPU_FORCE) {
@@ -417,8 +398,52 @@ void PairEAMGPU::init_style()
     neighbor->requests[irequest]->half = 0;
     neighbor->requests[irequest]->full = 1;
   }
+
+  if (fp_size == sizeof(double))
+    fp_single = false;
+  else
+    fp_single = true;
 }
 
+/* ---------------------------------------------------------------------- */
 
+int PairEAMGPU::pack_comm(int n, int *list, double *buf, int pbc_flag, 
+			  int *pbc)
+{ // gather fp_pinned[list[0..n-1]] into buf as doubles; pbc_flag/pbc are not read here
+  int i,j,m;
 
+  m = 0; // write cursor into buf
 
+  if (fp_single) { // fp_single is set in init_style() when fp_size != sizeof(double)
+    float *fp_ptr = (float *)fp_pinned; // fp_pinned is filled in by the eam_gpu_compute_energy* calls in compute()
+    for (i = 0; i < n; i++) {
+      j = list[i]; // index of the element to send
+      buf[m++] = static_cast<double>(fp_ptr[j]); // widen float to double for the comm buffer
+    }
+  } else {
+    double *fp_ptr = (double *)fp_pinned; // same untyped buffer, read as double
+    for (i = 0; i < n; i++) {
+      j = list[i]; // index of the element to send
+      buf[m++] = fp_ptr[j]; // no conversion needed
+    }
+  }
+
+  return 1; // constant 1, not m; presumably the number of values packed per atom -- TODO confirm against Pair::pack_comm
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMGPU::unpack_comm(int n, int first, double *buf)
+{ // scatter buf[0..n-1] into fp_pinned[first..first+n-1]
+  int i,m,last;
+
+  m = 0; // read cursor into buf
+  last = first + n; // one past the last slot written
+  if (fp_single) { // element type of fp_pinned is selected by fp_single (set in init_style())
+    float *fp_ptr = (float *)fp_pinned; // write through the untyped buffer as float
+    for (i = first; i < last; i++) fp_ptr[i] = buf[m++]; // implicit double -> float narrowing
+  } else {
+    double *fp_ptr = (double *)fp_pinned; // write through the untyped buffer as double
+    for (i = first; i < last; i++) fp_ptr[i] = buf[m++]; // straight copy
+  }
+}
diff --git a/src/GPU/pair_eam_gpu.h b/src/GPU/pair_eam_gpu.h
index b7eaffcfa1..7cdecb0105 100644
--- a/src/GPU/pair_eam_gpu.h
+++ b/src/GPU/pair_eam_gpu.h
@@ -36,13 +36,17 @@ class PairEAMGPU : public PairEAM {
   void init_style();
   double memory_usage();
 
+  int pack_comm(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+
  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
 
  private:
   int gpu_mode;
   double cpu_time;
   int *gpulist;
-  
+  void *fp_pinned;
+  bool fp_single;  
 };
 
 }
diff --git a/src/MANYBODY/pair_eam.h b/src/MANYBODY/pair_eam.h
index 1b7b1f1f00..677d83f4e7 100644
--- a/src/MANYBODY/pair_eam.h
+++ b/src/MANYBODY/pair_eam.h
@@ -53,8 +53,8 @@ class PairEAM : public Pair {
   double init_one(int, int);
   double single(int, int, int, int, double, double, double, double &);
 
-  int pack_comm(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
+  virtual int pack_comm(int, int *, double *, int, int *);
+  virtual void unpack_comm(int, int, double *);
   int pack_reverse_comm(int, int, double *);
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();