From a3d6cff836334746f233d9fa93b550c020fcd2a3 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown" <brownw@ornl.gov>
Date: Mon, 21 Feb 2011 11:35:26 -0500
Subject: [PATCH] CODE USED TO BENCHMARK CHARGE SPREADING FOR PAPER.

---
 lib/gpu/pair_gpu_device.cpp |   9 +-
 lib/gpu/pppm_gpu_memory.cpp | 224 ++++++++++++++++++------------------
 src/GPU/pppm_gpu.cpp        |  13 ++-
 src/lmptype.h               |   4 +-
 4 files changed, 130 insertions(+), 120 deletions(-)
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 550c1207b5..8e1651abba 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -266,9 +266,12 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
                                          const double max_bytes, FILE *screen) {
   double single[4], times[4];
 
-  single[0]=atom.transfer_time()+ans.transfer_time()+time_in.total_seconds()+
-            time_out.total_seconds();
-  single[1]=atom.cast_time()+ans.cast_time();
+//  single[0]=atom.transfer_time()+ans.transfer_time()+time_in.total_seconds()+
+//            time_out.total_seconds();
+//  single[1]=atom.cast_time()+ans.cast_time();
+
+  single[0]=time_out.total_seconds();
+  single[1]=0.0;
   single[2]=time_map.total_seconds();
   single[3]=time_rho.total_seconds();
 
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 222bea40b6..395389ffa8 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -147,12 +147,6 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   d_error_flag.zero();
   _max_bytes+=1;
   
-std::cout << "LO: " << _nxlo_out << " " << _nylo_out << " " << _nzlo_out << " " << _nlower << std::endl;
-std::cout << "HI: " << _nxhi_out << " " << _nyhi_out << " " << _nzhi_out << " " << _nupper << std::endl;
-std::cout << "pts: " << _npts_x << " " << _npts_y << " " << _npts_z << std::endl;
-std::cout << "local: " << _nlocal_x << " " << _nlocal_y << " " << _nlocal_z << std::endl;
-
-
   return h_brick.begin();
 }
 
@@ -188,7 +182,7 @@ void PPPMGPUMemoryT::clear() {
   device->clear();
 }
 
-/*
+
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
@@ -296,6 +290,113 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   
   return h_error_flag[0];
 }
+
+
+/*
+// ---------------------------------------------------------------------------
+// Copy nbor list from host if necessary and then calculate forces, virials,..
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
+                            double **host_x, int *host_type, bool &success,
+                            double *host_q, double *boxlo, 
+                            const double delxinv, const double delyinv,
+                            const double delzinv) {
+  acc_timers();
+  if (nlocal==0) {
+    zero_timers();
+    return 0;
+  }
+  
+  ans->inum(nlocal);
+
+  if (ago==0) {
+    resize_atom(nlocal,nall,success);
+    resize_local(nlocal,success);
+    if (!success)
+      return 0;
+
+    double bytes=ans->gpu_bytes();
+    if (bytes>_max_an_bytes)
+      _max_an_bytes=bytes;
+  }
+
+  atom->cast_x_data(host_x,host_type);
+  atom->cast_q_data(host_q);
+  atom->add_x_data(host_x,host_type);
+  atom->add_q_data();
+
+  time_map.start();
+
+  // Compute the block size and grid size to keep all cores busy
+  int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
+
+  int _max_atoms=10;
+  int ainum=this->ans->inum();
+  
+  // Boxlo adjusted to be upper left brick and shift for even stencil order
+  double shift=0.0;
+  if (_order % 2)
+    shift=0.5;
+  numtyp f_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
+  numtyp f_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
+  numtyp f_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
+  
+  numtyp f_delxinv=delxinv;
+  numtyp f_delyinv=delyinv;
+  numtyp f_delzinv=delzinv;
+  double delvolinv = delxinv*delyinv*delzinv;
+  numtyp f_delvolinv = delvolinv;
+
+  d_brick_counts.zero();
+  k_particle_map.set_size(GX,BX);
+  k_particle_map.run(&atom->dev_x.begin(), &ainum, &d_brick_counts.begin(),
+                     &d_brick_atoms.begin(), &f_brick_x, &f_brick_y, 
+                     &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x,
+                     &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
+                     &d_error_flag.begin());
+  time_map.stop();
+
+  time_rho.start();
+  if (_order % 2)
+    shift=0.0;
+  else
+    shift=0.5;
+  f_brick_x=boxlo[0]+(_nxlo_out-_nlower+shift)/delxinv;
+  f_brick_y=boxlo[1]+(_nylo_out-_nlower+shift)/delyinv;
+  f_brick_z=boxlo[2]+(_nzlo_out-_nlower+shift)/delzinv;
+
+  BX=block_size();
+  GX=_npts_x;
+  int GY=_npts_y;
+  k_make_rho.set_size(GX,GY,BX,1);
+  k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(),
+                 &d_brick_counts.begin(), &d_brick_atoms.begin(),
+                 &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x,
+                 &_npts_yx, &_npts_z, &_nlocal_x, &_nlocal_y, &_nlocal_z,
+                 &_order_m_1, &f_brick_x, &f_brick_y, &f_brick_z,
+                 &f_delxinv, &f_delyinv, &f_delzinv, &_order, &_order2,
+                 &f_delvolinv);
+  time_rho.stop();
+
+  time_out.start();
+  ucl_copy(h_brick,d_brick,true);
+  ucl_copy(h_error_flag,d_error_flag,false);
+  time_out.stop();
+  
+  if (h_error_flag[0]==2) {
+    // Not enough storage for atoms on the brick
+    _max_brick_atoms*=2;
+    d_brick_atoms.clear();
+    d_brick_atoms.alloc(_atom_stride*_max_atoms,*ucl_device);
+    _max_bytes+=d_brick_atoms.row_bytes();
+    return compute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, 
+                   delxinv,delyinv,delzinv);
+  }
+  
+  return h_error_flag[0];
+}
 */
 
 /*
@@ -332,115 +433,14 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   atom->add_x_data(host_x,host_type);
   atom->add_q_data();
 
-  // Compute the block size and grid size to keep all cores busy
-  int BX=this->block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
-
-  int _max_atoms=10;
-  int ainum=this->ans->inum();
-  
-  // Boxlo adjusted to be upper left brick and shift for even stencil order
-  double shift=0.0;
-  if (_order % 2)
-    shift=0.5;
-  numtyp f_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
-  numtyp f_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
-  numtyp f_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
-  
-  numtyp f_delxinv=delxinv;
-  numtyp f_delyinv=delyinv;
-  numtyp f_delzinv=delzinv;
-  double delvolinv = delxinv*delyinv*delzinv;
-  numtyp f_delvolinv = delvolinv;
-
   time_map.start();
-  d_brick_counts.zero();
-  k_particle_map.set_size(GX,BX);
-  k_particle_map.run(&atom->dev_x.begin(), &ainum, &d_brick_counts.begin(),
-                     &d_brick_atoms.begin(), &f_brick_x, &f_brick_y, 
-                     &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x,
-                     &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
-                     &d_error_flag.begin());
   time_map.stop();
 
-  if (_order % 2)
-    shift=0.0;
-  else
-    shift=0.5;
-  f_brick_x=boxlo[0]+(_nxlo_out-_nlower+shift)/delxinv;
-  f_brick_y=boxlo[1]+(_nylo_out-_nlower+shift)/delyinv;
-  f_brick_z=boxlo[2]+(_nzlo_out-_nlower+shift)/delzinv;
-
   time_rho.start();
-  BX=block_size();
-  GX=_npts_x;
-  int GY=_npts_y;
-  k_make_rho.set_size(GX,GY,BX,1);
-  k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(),
-                 &d_brick_counts.begin(), &d_brick_atoms.begin(),
-                 &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x,
-                 &_npts_yx, &_npts_z, &_nlocal_x, &_nlocal_y, &_nlocal_z,
-                 &_order_m_1, &f_brick_x, &f_brick_y, &f_brick_z,
-                 &f_delxinv, &f_delyinv, &f_delzinv, &_order, &_order2,
-                 &f_delvolinv);
-  time_rho.stop();
-
-  time_out.start();
-  ucl_copy(h_brick,d_brick,true);
-  ucl_copy(h_error_flag,d_error_flag,false);
-  time_out.stop();
-  
-  if (h_error_flag[0]==2) {
-    // Not enough storage for atoms on the brick
-    _max_brick_atoms*=2;
-    d_brick_atoms.clear();
-    d_brick_atoms.alloc(_atom_stride*_max_atoms,*ucl_device);
-    _max_bytes+=d_brick_atoms.row_bytes();
-    return compute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, 
-                   delxinv,delyinv,delzinv);
-  }
-  
-  return h_error_flag[0];
-}
-*/
-
-// ---------------------------------------------------------------------------
-// Copy nbor list from host if necessary and then calculate forces, virials,..
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
-                            double **host_x, int *host_type, bool &success,
-                            double *host_q, double *boxlo, 
-                            const double delxinv, const double delyinv,
-                            const double delzinv) {
-  acc_timers();
-  if (nlocal==0) {
-    zero_timers();
-    return 0;
-  }
-  
-  ans->inum(nlocal);
-
-  if (ago==0) {
-    resize_atom(nlocal,nall,success);
-    resize_local(nlocal,success);
-    if (!success)
-      return 0;
-
-    double bytes=ans->gpu_bytes();
-    if (bytes>_max_an_bytes)
-      _max_an_bytes=bytes;
-  }
-
-  atom->cast_x_data(host_x,host_type);
-  atom->cast_q_data(host_q);
-  atom->add_x_data(host_x,host_type);
-  atom->add_q_data();
 
   // Compute the block size and grid size to keep all cores busy
   int BX=this->block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
-
   int ainum=this->ans->inum();
   
   // Boxlo adjusted to be upper left brick and shift for even stencil order
@@ -457,10 +457,6 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   double delvolinv = delxinv*delyinv*delzinv;
   numtyp f_delvolinv = delvolinv;
   
-  time_map.start();
-  time_map.stop();
-
-  time_rho.start();
   d_brick.zero();
   k_make_rho.set_size(GX,BX);
   k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
@@ -477,7 +473,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   
   return h_error_flag[0];
 }
-
+*/
 
 template <class numtyp, class acctyp>
 double PPPMGPUMemoryT::host_memory_usage() const {
@@ -495,7 +491,7 @@ void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) {
   pppm_program=new UCL_Program(dev);
   pppm_program->load_string(pppm_gpu_kernel,flags.c_str());
   k_particle_map.set_function(*pppm_program,"particle_map");
-  k_make_rho.set_function(*pppm_program,"make_rho3");
+  k_make_rho.set_function(*pppm_program,"make_rho");
   pos_tex.get_texture(*pppm_program,"pos_tex");
   q_tex.get_texture(*pppm_program,"q_tex");
 
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index 33c92e9ed3..5f3dc5e688 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -105,7 +105,14 @@ PPPMGPU::~PPPMGPU()
   deallocate();
   memory->destroy_2d_int_array(part2grid);
   pppm_gpu_clear();
-std::cout << "DEBUG_TIMES: " << time1 << " " << time2 << " " << time3 
+double total1, total2, total3;
+int rank;
+MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+MPI_Allreduce(&time1,&total1,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+MPI_Allreduce(&time2,&total2,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+MPI_Allreduce(&time3,&total3,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+if (rank==0)
+std::cout << "DEBUG_TIMES: " << total1 << " " << total2 << " " << total3 
           << std::endl;
 }
 
@@ -702,6 +709,8 @@ double t1=MPI_Wtime();
   make_rho();
 time1+=MPI_Wtime()-t1;
 
+
+/*
 double max_error=0;
 int _npts_x=nxhi_out-nxlo_out+1;
 int _npts_y=nyhi_out-nylo_out+1;
@@ -732,6 +741,8 @@ for (int i=0; i<iend; i++) {
   }
 }
 std::cout << "Maximum relative error: " << max_error*100.0 << "%\n";
+*/
+
 
   // all procs communicate density values from their ghost cells
   //   to fully sum contribution in their 3d bricks
diff --git a/src/lmptype.h b/src/lmptype.h
index af2bf465d3..880f8c45d9 100644
--- a/src/lmptype.h
+++ b/src/lmptype.h
@@ -47,6 +47,7 @@ namespace LAMMPS_NS {
 // default settings
 // 32-bit smallint and tagint, 64-bit bigint
 
+/*
 typedef int smallint;
 typedef int tagint;
 typedef int64_t bigint;
@@ -63,6 +64,7 @@ typedef int64_t bigint;
 
 #define ATOTAGINT atoi
 #define ATOBIGINT atoll
+*/
 
 // for molecular problems that exceed 2 billion (2^31) atoms
 // 32-bit smallint, 64-bit tagint and bigint
@@ -90,7 +92,6 @@ typedef int64_t bigint;
 // for machines that do not support 64-bit ints
 // 32-bit smallint and tagint and bigint
 
-/*
 typedef int smallint;
 typedef int tagint;
 typedef int bigint;
@@ -107,7 +108,6 @@ typedef int bigint;
 
 #define ATOTAGINT atoi
 #define ATOBIGINT atoi
-*/
 
 }