git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8693 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2012-08-21 13:57:32 +00:00
parent 9a99e27552
commit 31551d81fd
85 changed files with 2630 additions and 2172 deletions
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@ -51,10 +51,14 @@ bool AtomT::alloc(const int nall) {
  bool success=true;
  
  // Ignore host/device transfers?
-  bool cpuview=false;
-  if (dev->device_type()==UCL_CPU)
-    cpuview=true;
-    
+  _host_view=false;
+  if (dev->shared_memory()) {
+    _host_view=true;
+    #ifdef GPU_CAST
+    assert(0==1);
+    #endif
+  }
+      
  // Allocate storage for CUDPP sort
  #ifdef USE_CUDPP
  if (_gpu_nbor==1) {
@ -64,63 +68,101 @@ bool AtomT::alloc(const int nall) {
  }
  #endif

-  // --------------------------   Host allocations
-  // Get a host write only buffer
-  #ifdef GPU_CAST
-  success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
-                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  success=success && (host_type_cast.alloc(_max_atoms,*dev,
-                                           UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  #else
-  success=success && (host_x.alloc(_max_atoms*4,*dev,
-                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  #endif                      
-  // Buffer for casting only if different precisions
-  if (_charge)
-    success=success && (host_q.alloc(_max_atoms,*dev,
-                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  // Buffer for casting only if different precisions
-  if (_rot)
-    success=success && (host_quat.alloc(_max_atoms*4,*dev,
-                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-
-    
  // ---------------------------  Device allocations
  int gpu_bytes=0;
-  if (cpuview) {
-    #ifdef GPU_CAST
-    assert(0==1);
-    #else
-    dev_x.view(host_x);
-    #endif
-    if (_rot)
-      dev_quat.view(host_quat);
-    if (_charge)
-      dev_q.view(host_q);
-  } else {
-    #ifdef GPU_CAST
-    success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
-    success=success && (UCL_SUCCESS==
-                        dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
-    success=success && (UCL_SUCCESS==
-                        dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
-    gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
-    #else
-    success=success && (UCL_SUCCESS==
-                        dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
-    #endif
-    if (_charge) {
-      success=success && (dev_q.alloc(_max_atoms,*dev,
-                                      UCL_READ_ONLY)==UCL_SUCCESS);
-      gpu_bytes+=dev_q.row_bytes();
+  success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+                              UCL_READ_ONLY)==UCL_SUCCESS);
+  #ifdef GPU_CAST
+  success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)==
+                      UCL_SUCCESS);
+  success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)==
+                      UCL_SUCCESS);
+  gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes();
+  #endif
+
+  if (_charge && _host_view==false) {
+    success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
+                                UCL_READ_ONLY)==UCL_SUCCESS);
+    gpu_bytes+=q.device.row_bytes();
+  }
+  if (_rot && _host_view==false) {
+    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+                                   UCL_READ_ONLY)==UCL_SUCCESS);
+    gpu_bytes+=quat.device.row_bytes();
+  }
+
+  if (_gpu_nbor>0) {
+    if (_bonds) {
+      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      gpu_bytes+=dev_tag.row_bytes();
    }
-    if (_rot) {
-      success=success && (dev_quat.alloc(_max_atoms*4,*dev,
-                                      UCL_READ_ONLY)==UCL_SUCCESS);
-      gpu_bytes+=dev_quat.row_bytes();
+    if (_gpu_nbor==1) {
+      success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      gpu_bytes+=dev_cell_id.row_bytes();
+    } else {
+      success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && 
+             (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
+    }
+    if (_gpu_nbor==2 && _host_view)
+      dev_particle_id.view(host_particle_id);
+    else
+      success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+    gpu_bytes+=dev_particle_id.row_bytes();
+  }
+
+  gpu_bytes+=x.device.row_bytes();
+  if (gpu_bytes>_max_gpu_bytes)
+    _max_gpu_bytes=gpu_bytes;
+  
+  _allocated=true;  
+  return success;
+}
+
+template <class numtyp, class acctyp>
+bool AtomT::add_fields(const bool charge, const bool rot,
+                       const int gpu_nbor, const bool bonds) {
+  bool success=true;
+  // Accumulates row_bytes() of the buffers allocated below
+  int gpu_bytes=0;
+  
+  if (charge && _charge==false) {
+    _charge=true;
+    _other=true;
+    if (_host_view==false) {
+      success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
+                                  UCL_READ_ONLY)==UCL_SUCCESS);
+      gpu_bytes+=q.device.row_bytes();
    }
  }
-  if (_gpu_nbor>0) {
+
+  if (rot && _rot==false) {
+    _rot=true;
+    _other=true;
+    if (_host_view==false) {
+      success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+                                     UCL_READ_ONLY)==UCL_SUCCESS);
+      gpu_bytes+=quat.device.row_bytes();
+    }
+  }
+
+  if (bonds && _bonds==false) {
+    _bonds=true;
+    if (_bonds && _gpu_nbor>0) {
+      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      gpu_bytes+=dev_tag.row_bytes();
+    }
+  }
+
+  if (gpu_nbor>0 && _gpu_nbor==0) {
+    _gpu_nbor=gpu_nbor;
+    #ifdef USE_CUDPP
+    if (_gpu_nbor==1) {
+      CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);  
+      if (CUDPP_SUCCESS != result)
+        return false;
+    }
+    #endif
    success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
    gpu_bytes+=dev_particle_id.row_bytes();
    if (_bonds) {
@ -137,43 +179,9 @@ bool AtomT::alloc(const int nall) {
    }             
  }

-  gpu_bytes+=dev_x.row_bytes();
-  if (gpu_bytes>_max_gpu_bytes)
-    _max_gpu_bytes=gpu_bytes;
-  
-  _allocated=true;  
  return success;
 }

-template <class numtyp, class acctyp>
-bool AtomT::add_fields(const bool charge, const bool rot,
-                       const int gpu_nbor, const bool bonds) {
-  bool realloc=false;
-  if (charge && _charge==false) {
-    _charge=true;
-    realloc=true;
-  }
-  if (rot && _rot==false) {
-    _rot=true;
-    realloc=true;
-  }
-  if (gpu_nbor>0 && _gpu_nbor==0) {
-    _gpu_nbor=gpu_nbor;
-    realloc=true;
-  }
-  if (bonds && _bonds==false) {
-    _bonds=true;
-    realloc=true;
-  }
-  if (realloc) {
-    _other=_charge || _rot;
-    int max_atoms=_max_atoms;
-    clear_resize();
-    return alloc(max_atoms);
-  }
-  return true;
-}
-
 template <class numtyp, class acctyp>
 bool AtomT::init(const int nall, const bool charge, const bool rot,
                 UCL_Device &devi, const int gpu_nbor, const bool bonds) {
@ -219,27 +227,18 @@ void AtomT::clear_resize() {
    return;
  _allocated=false;

-  dev_x.clear();
-  if (_charge) { 
-    dev_q.clear();
-    host_q.clear();
-  }
-  if (_rot) {
-    dev_quat.clear();
-    host_quat.clear();
-  }
-  #ifndef GPU_CAST
-  host_x.clear();
-  #else
-  host_x_cast.clear();
-  host_type_cast.clear();
-  #endif
+  x.clear();
+  if (_charge)
+    q.clear();
+  if (_rot)
+    quat.clear();
+
  dev_cell_id.clear();
  dev_particle_id.clear();
  dev_tag.clear();
  #ifdef GPU_CAST
-  dev_x_cast.clear();
-  dev_type_cast.clear();
+  x_cast.clear();
+  type_cast.clear();
  #endif

  #ifdef USE_CUDPP
@ -279,8 +278,7 @@ double AtomT::host_memory_usage() const {
    atom_bytes+=1;
  if (_rot) 
    atom_bytes+=4;
-  return _max_atoms*atom_bytes*sizeof(numtyp)+
-         sizeof(Atom<numtyp,acctyp>);
+  return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
 }
  
 // Sort arrays for neighbor list calculation
@ -292,16 +290,18 @@ void AtomT::sort_neighbor(const int num_atoms) {
                                 8*sizeof(unsigned), num_atoms);
  if (CUDPP_SUCCESS != result) {
    printf("Error in cudppSort\n");
-    NVD_GERYON_EXIT;
+    UCL_GERYON_EXIT;
  }
  #endif
 }

 #ifdef GPU_CAST
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "atom_cl.h"
+#elif defined(USE_CUDART)
+const char *atom=0;
 #else
-#include "atom_ptx.h"
+#include "atom_cubin.h"
 #endif

 template <class numtyp, class acctyp>
@ -316,3 +316,4 @@ void AtomT::compile_kernels(UCL_Device &dev) {
 #endif

 template class Atom<PRECISION,ACC_PRECISION>;
+