git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8699 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2012-08-23 13:03:38 +00:00
parent 2512429ccf
commit c81a8f1bc9
3 changed files with 0 additions and 493 deletions
--- a/lib/gpu/geryon/nvc_device.h
+++ b/lib/gpu/geryon/nvc_device.h
@ -1,346 +0,0 @@
-/***************************************************************************
-                                nvc_device.h
-                             -------------------
-                               W. Michael Brown
-
-  Utilities for dealing with cuda devices
-
- __________________________________________________________________________
-    This file is part of the Geryon Unified Coprocessor Library (UCL)
- __________________________________________________________________________
-
-    begin                : Wed Jan 28 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : brownw@ornl.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the Simplified BSD License.
-   ----------------------------------------------------------------------- */
-
-#ifndef NVC_DEVICE
-#define NVC_DEVICE
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <cstdlib>
-#include "nvc_macros.h"
-#include "ucl_types.h"
-
-namespace ucl_cudart {
-
-// --------------------------------------------------------------------------
-// - COMMAND QUEUE STUFF
-// --------------------------------------------------------------------------
-typedef cudaStream_t command_queue; 
-
-inline void ucl_sync(cudaStream_t &stream) {
-  CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
-}
-
-/// Class for looking at device properties
-/** \note Calls to change the device outside of the class results in incorrect
-  *       behavior 
-  * \note There is no error checking for indexing past the number of devices **/
-class UCL_Device {
- public:
-  /// Collect properties for every GPU on the node
-  /** \note You must set the active GPU with set() before using the device **/
-  UCL_Device();
-  
-  ~UCL_Device();
-
-  /// Returns 1 (For compatibility with OpenCL)
-  inline int num_platforms() { return 1; }
-
-  /// Return a string with name and info of the current platform
-  std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA"; }
-
-  /// Return the number of devices that support CUDA
-  inline int num_devices() { return _properties.size(); }
-
-  /// Set the CUDA device to the specified device number
-  /** Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
-    * be allocated for use **/
-  int set(int num);
-
-  /// Get the current device number
-  inline int device_num() { return _device; }
-
-  /// Returns the default stream for the current device
-  inline command_queue & cq() { return cq(0); }
-  
-  /// Returns the stream indexed by i
-  inline command_queue & cq(const int i) { return _cq[i]; }
-
-  /// Set the default command queue (by default this is the null stream)
-  /** \param i index of the command queue (as added by push_command_queue()) 
-      If i is 0, the default command queue is set to the null stream **/
-  inline void set_command_queue(const int i) {
-    if (i==0) _cq[0]=0;
-    else _cq[0]=_cq[i];
-  }
-  
-  /// Block until all commands in the default stream have completed
-  inline void sync() { sync(0); }
-  
-  /// Block until all commands in the specified stream have completed
-  inline void sync(const int i) { ucl_sync(cq(i)); }
-  
-  /// Get the number of command queues currently available on device
-  inline int num_queues() 
-    { if (_device==-1) return 0; else return _cq.size(); }
-  
-  /// Add a stream for device computations
-  inline void push_command_queue() {
-    _cq.push_back(cudaStream_t()); 
-    CUDA_SAFE_CALL_NS(cudaStreamCreate(&_cq.back())); 
-  }
-
-  /// Remove a stream for device computations
-  /** \note You cannot delete the default stream **/
-  inline void pop_command_queue() {
-    if (_cq.size()<2) return;
-    CUDA_DESTRUCT_CALL_NS(cudaStreamDestroy(_cq.back()));
-    _cq.pop_back();
-  }
-  
-  /// Get the current CUDA device name
-  inline std::string name() { return name(_device); }
-  /// Get the CUDA device name
-  inline std::string name(const int i) 
-    { return std::string(_properties[i].name); }
-
-  /// Get a string telling the type of the current device
-  inline std::string device_type_name() { return device_type_name(_device); }
-  /// Get a string telling the type of the device
-  inline std::string device_type_name(const int i) { return "GPU"; }
-
-  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
-  inline int device_type() { return device_type(_device); }
-  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
-  inline int device_type(const int i) { return UCL_GPU; }
-  
-  /// Returns true if double precision is support for the current device
-  bool double_precision() { return double_precision(_device); }
-  /// Returns true if double precision is support for the device
-  bool double_precision(const int i) {return arch(i)>=1.3;}
-  
-  /// Get the number of cores in the current device
-  inline unsigned cores() { return cores(_device); }
-  /// Get the number of cores
-  inline unsigned cores(const int i) 
-    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; 
-      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
-      else return _properties[i].multiProcessorCount*192; }
-  
-  /// Get the gigabytes of global memory in the current device
-  inline double gigabytes() { return gigabytes(_device); }
-  /// Get the gigabytes of global memory
-  inline double gigabytes(const int i) 
-    { return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
-  
-  /// Get the bytes of global memory in the current device
-  inline size_t bytes() { return bytes(_device); }
-  /// Get the bytes of global memory
-  inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
-
-  /// Return the GPGPU compute capability for current device
-  inline double arch() { return arch(_device); }
-  /// Return the GPGPU compute capability
-  inline double arch(const int i) 
-    { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
-  
-  /// Clock rate in GHz for current device
-  inline double clock_rate() { return clock_rate(_device); }
-  /// Clock rate in GHz
-  inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;}
-               
-  /// Get the maximum number of threads per block
-  inline size_t group_size() { return group_size(_device); }
-  /// Get the maximum number of threads per block
-  inline size_t group_size(const int i) 
-    { return _properties[i].maxThreadsPerBlock; }
-  
-  /// Return the maximum memory pitch in bytes for current device
-  inline size_t max_pitch() { return max_pitch(_device); }
-  /// Return the maximum memory pitch in bytes
-  inline size_t max_pitch(const int i) { return _properties[i].memPitch; }
-
-  /// Returns false if accelerator cannot be shared by multiple processes
-  /** If it cannot be determined, true is returned **/
-  inline bool sharing_supported() { return sharing_supported(_device); }
-  /// Returns false if accelerator cannot be shared by multiple processes
-  /** If it cannot be determined, true is returned **/
-  inline bool sharing_supported(const int i)
-    { return (_properties[i].computeMode == cudaComputeModeDefault); }
-
-  /// List all devices along with all properties
-  void print_all(std::ostream &out);
-  
- private:
-  int _device, _num_devices;
-  std::vector<cudaDeviceProp> _properties;
-  std::vector<cudaStream_t> _cq;
-  std::vector<int> _device_ids;
-};
-
-// Grabs the properties for all devices
-inline UCL_Device::UCL_Device() {
-  CUDA_SAFE_CALL_NS(cudaGetDeviceCount(&_num_devices));
-  for (int dev=0; dev<_num_devices; ++dev) {
-    cudaDeviceProp deviceProp;
-    CUDA_SAFE_CALL_NS(cudaGetDeviceProperties(&deviceProp, dev));
-    if (deviceProp.major == 9999 && deviceProp.minor == 9999)
-      break;
-    _properties.push_back(deviceProp);
-    _device_ids.push_back(dev);
-  }
-  _device=-1;
-  _cq.push_back(cudaStream_t());
-  _cq.back()=0;
-}
-
-inline UCL_Device::~UCL_Device() {
-  for (int i=1; i<num_queues(); i++) pop_command_queue();
-}
-
-// Set the CUDA device to the specified device number
-inline int UCL_Device::set(int num) {
-  if (_device==num)
-    return UCL_SUCCESS;
-  for (int i=1; i<num_queues(); i++) pop_command_queue();
-  _cq[0]=0;
-  cudaThreadExit();
-  cudaError err=cudaSetDevice(_device_ids[num]);
-  if (err!=cudaSuccess) {
-    #ifndef UCL_NO_EXIT
-    std::cerr << "UCL Error: Could not access accelerator number " << num
-              << " for use.\n";
-    UCL_GERYON_EXIT;
-    #endif
-    return UCL_ERROR;
-  }
-  _device=num;
-  return UCL_SUCCESS;
-}
-
-// List all devices along with all properties
-inline void UCL_Device::print_all(std::ostream &out) {
-  #if CUDART_VERSION >= 2020
-  int driver_version, runtime_version;
-  cudaDriverGetVersion(&driver_version);
-  out << "CUDA Driver Version:                           "
-      << driver_version/1000 << "." << driver_version%100
-		  << std::endl;
-  cudaRuntimeGetVersion(&runtime_version);
-	out << "CUDA Runtime Version:                          "
-	    << runtime_version/1000 << "." << runtime_version%100
-	    << std::endl;
-  #endif
-
-  if (num_devices() == 0)
-    out << "There is no device supporting CUDA\n";
-  for (int i=0; i<num_devices(); ++i) {
-    out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
-    out << "  Type of device:                                "
-        << device_type_name(i).c_str() << std::endl;
-    out << "  Compute capability:                            "
-        << arch(i) << std::endl;
-    out << "  Double precision support:                      ";
-    if (double_precision(i))
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Total amount of global memory:                 "
-        << gigabytes(i) << " GB\n";
-    #if CUDART_VERSION >= 2000
-    out << "  Number of compute units/multiprocessors:       "
-        << _properties[i].multiProcessorCount << std::endl;
-    out << "  Number of cores:                               "
-        << cores(i) << std::endl;
-    #endif
-    out << "  Total amount of constant memory:               "
-        << _properties[i].totalConstMem << " bytes\n";
-    out << "  Total amount of local/shared memory per block: "
-        << _properties[i].sharedMemPerBlock << " bytes\n";
-    out << "  Total number of registers available per block: "
-        << _properties[i].regsPerBlock << std::endl;
-    out << "  Warp size:                                     "
-        << _properties[i].warpSize << std::endl;
-    out << "  Maximum number of threads per block:           "
-        << _properties[i].maxThreadsPerBlock << std::endl;
-    out << "  Maximum group size (# of threads per block)    "
-        << _properties[i].maxThreadsDim[0] << " x "
-        << _properties[i].maxThreadsDim[1] << " x "
-        << _properties[i].maxThreadsDim[2] << std::endl;
-    out << "  Maximum item sizes (# threads for each dim)    "
-        << _properties[i].maxGridSize[0] << " x "
-        << _properties[i].maxGridSize[1] << " x "
-        << _properties[i].maxGridSize[2] << std::endl;
-    out << "  Maximum memory pitch:                          "
-        << max_pitch(i) << " bytes\n";
-    out << "  Texture alignment:                             "
-        << _properties[i].textureAlignment << " bytes\n";
-    out << "  Clock rate:                                    "
-        << clock_rate(i) << " GHz\n";
-    #if CUDART_VERSION >= 2000
-    out << "  Concurrent copy and execution:                 ";
-    if (_properties[i].deviceOverlap)
-      out << "Yes\n";
-    else
-      out << "No\n";
-    #endif
-    #if CUDART_VERSION >= 2020
-    out << "  Run time limit on kernels:                     ";
-    if (_properties[i].kernelExecTimeoutEnabled)
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Integrated:                                    ";
-    if (_properties[i].integrated)
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Support host page-locked memory mapping:       ";
-    if (_properties[i].canMapHostMemory)
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Compute mode:                                  ";
-    if (_properties[i].computeMode == cudaComputeModeDefault)
-      out << "Default\n"; // multiple threads can use device
-    else if (_properties[i].computeMode == cudaComputeModeExclusive)
-      out << "Exclusive\n"; // only thread can use device
-    else if (_properties[i].computeMode == cudaComputeModeProhibited)
-      out << "Prohibited\n"; // no thread can use device
-    #if CUDART_VERSION >= 4000
-    else if (_properties[i].computeMode == cudaComputeModeExclusiveProcess)
-      out << "Exclusive Process\n"; // multiple threads 1 process
-    #endif
-    else
-      out << "Unknown\n";
-    #endif
-    #if CUDART_VERSION >= 3010
-    out << "  Concurrent kernel execution:                   ";
-    if (_properties[i].concurrentKernels)
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Device has ECC support enabled:                ";
-    if (_properties[i].ECCEnabled)
-      out << "Yes\n";
-    else
-      out << "No\n";
-    #endif
-  }
-}
-
-}
-
-#endif
-
--- a/lib/gpu/geryon/nvc_macros.h
+++ b/lib/gpu/geryon/nvc_macros.h
@ -1,78 +0,0 @@
-#ifndef NVC_MACROS_H
-#define NVC_MACROS_H
-
-#if defined(__APPLE__)
-#if _GLIBCXX_ATOMIC_BUILTINS == 1
-#undef _GLIBCXX_ATOMIC_BUILTINS
-#endif // _GLIBCXX_ATOMIC_BUILTINS
-#endif // __APPLE__
-
-#include <stdio.h>
-#include <cassert>
-#include <cuda_runtime.h>
-
-#ifdef MPI_GERYON
-#include "mpi.h"
-#define NVC_GERYON_EXIT MPI_Abort(MPI_COMM_WORLD,-1)
-#else
-#define NVC_GERYON_EXIT assert(0==1)
-#endif
-
-#ifndef UCL_GERYON_EXIT
-#define UCL_GERYON_EXIT NVC_GERYON_EXIT
-#endif
-
-#ifdef UCL_DEBUG
-#define UCL_SYNC_DEBUG
-#define UCL_DESTRUCT_CHECK
-#endif
-
-#ifndef UCL_NO_API_CHECK
-
-#define CUDA_SAFE_CALL_NS( call) do {                                        \
-    cudaError err = call;                                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error in call at file '%s' in line %i : %s.\n", \
-                __FILE__, __LINE__, cudaGetErrorString( err) );              \
-        NVC_GERYON_EXIT;                                                     \
-    } } while (0)
-
-#ifdef UCL_SYNC_DEBUG
-
-#define CUDA_SAFE_CALL( call) do {                                           \
-    CUDA_SAFE_CALL_NS( call);                                                \
-    cudaError err=cudaThreadSynchronize();                                   \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
-                __FILE__, __LINE__, cudaGetErrorString( err) );              \
-        NVC_GERYON_EXIT;                                                     \
-    } } while (0)
-
-#else
-
-#define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NS( call)
-
-#endif
-
-#else  // not DEBUG
-
-// void macros for performance reasons
-#define CUDA_SAFE_CALL( call) call
-#define CUDA_SAFE_CALL_NS( call) call
-
-#endif
-
-#ifdef UCL_DESTRUCT_CHECK
-
-#define CUDA_DESTRUCT_CALL( call) CUDA_SAFE_CALL( call)
-#define CUDA_DESTRUCT_CALL_NS( call) CUDA_SAFE_CALL_NS( call)
-
-#else
-
-#define CUDA_DESTRUCT_CALL( call) call
-#define CUDA_DESTRUCT_CALL_NS( call) call
-
-#endif
-
-#endif
-
--- a/lib/gpu/geryon/nvc_texture.h
+++ b/lib/gpu/geryon/nvc_texture.h
@ -1,69 +0,0 @@
-/***************************************************************************
-                                nvc_texture.h
-                             -------------------
-                               W. Michael Brown
-
-  Utilities for dealing with CUDA Runtime textures
-
- __________________________________________________________________________
-    This file is part of the Geryon Unified Coprocessor Library (UCL)
- __________________________________________________________________________
-
-    begin                : Fri Jul 2 2010
-    copyright            : (C) 2010 by W. Michael Brown
-    email                : brownw@ornl.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2010) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the Simplified BSD License.
-   ----------------------------------------------------------------------- */
-
-#ifndef NVC_TEXTURE
-#define NVC_TEXTURE
-
-#include "nvc_mat.h"
-
-namespace ucl_cudart {
-    
-/// Class storing a texture reference
-class UCL_Texture {
- public:
-  UCL_Texture() {}
-  ~UCL_Texture() {}
-  /// Construct with a specified texture reference
-  inline UCL_Texture(textureReference *t) { get_texture(t); }
-  /// Set the texture reference for this object
-  inline void get_texture(textureReference *t) { _tex_ptr=t; }
-
-  /// Bind a float array where each fetch grabs a vector of length numel
-  template<class mat_typ>
-  inline void bind_float(mat_typ &vec, const unsigned numel) {
-    #ifdef UCL_DEBUG
-    assert(numel!=0 && numel<5);
-    #endif
-    int bits[4]={0,0,0,0};
-    for (int i=0; i<numel; i++) bits[i]=32;
-    _channel = cudaCreateChannelDesc(bits[0], bits[1], bits[2], bits[3], 
-                                     cudaChannelFormatKindFloat);
-    (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
-    (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
-    (*_tex_ptr).filterMode = cudaFilterModePoint;
-    (*_tex_ptr).normalized = false;
-    CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,vec.cbegin(),&_channel));
-  }
-
-  /// Unbind the texture reference from the memory allocation
-  inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
-
- private:
-  textureReference *_tex_ptr;
-  cudaChannelFormatDesc _channel;
-};
-
-} // namespace
-
-#endif
-