git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2016-07-01 23:27:26 +00:00
parent 8366b35459
commit 9656958169
245 changed files with 4890 additions and 4832 deletions

View File

@ -1,6 +1,6 @@
# Settings that the LAMMPS build will import when this package library is used # Settings that the LAMMPS build will import when this package library is used
# settings for OpenCL builds # settings for OpenCL builds
gpu_SYSINC = gpu_SYSINC =
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
gpu_SYSPATH = gpu_SYSPATH =

View File

@ -7,7 +7,7 @@
EXTRAMAKE = Makefile.lammps.standard EXTRAMAKE = Makefile.lammps.standard
ifeq($(CUDA_HOME),) ifeq ($(CUDA_HOME),)
CUDA_HOME = /usr/local/cuda CUDA_HOME = /usr/local/cuda
endif endif

View File

@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \ -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-I$(CUDA_HOME)/include -I$(CUDA_HOME)/include
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32 OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
OCL_PREC = -D_SINGLE_DOUBLE OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL OCL_TUNE = -DFERMI_OCL
EXTRAMAKE = Makefile.lammps.mingw-cross EXTRAMAKE = Makefile.lammps.mingw-cross

View File

@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \ -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
-I../../tools/mingw-cross/mpich2-win32/include/ \ -I../../tools/mingw-cross/mpich2-win32/include/ \
-DMPICH_IGNORE_CXX_SEEK -DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \ OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../tools/mingw-cross/mpich2-win32/lib -lmpi -L../../tools/mingw-cross/mpich2-win32/lib -lmpi
OCL_PREC = -D_SINGLE_DOUBLE OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL OCL_TUNE = -DFERMI_OCL

View File

@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \ -msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-I$(CUDA_HOME)/include -I$(CUDA_HOME)/include
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \ OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../src/STUBS -lmpi_mingw64 -L../../src/STUBS -lmpi_mingw64
OCL_PREC = -D_SINGLE_DOUBLE OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL OCL_TUNE = -DFERMI_OCL

View File

@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-I../../tools/mingw-cross/mpich2-win64/include/ \ -I../../tools/mingw-cross/mpich2-win64/include/ \
-DMPICH_IGNORE_CXX_SEEK -DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \ OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../tools/mingw-cross/mpich2-win64/lib -lmpi -L../../tools/mingw-cross/mpich2-win64/lib -lmpi
OCL_PREC = -D_SINGLE_DOUBLE OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL OCL_TUNE = -DFERMI_OCL

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -35,7 +35,7 @@ namespace ucl_cudadr {
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF // - COMMAND QUEUE STUFF
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
typedef CUstream command_queue; typedef CUstream command_queue;
inline void ucl_sync(CUstream &stream) { inline void ucl_sync(CUstream &stream) {
CU_SAFE_CALL(cuStreamSynchronize(stream)); CU_SAFE_CALL(cuStreamSynchronize(stream));
@ -59,21 +59,21 @@ struct NVDProperties {
/// Class for looking at device properties /// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect /** \note Calls to change the device outside of the class result in incorrect
* behavior * behavior
* \note There is no error checking for indexing past the number of devices **/ * \note There is no error checking for indexing past the number of devices **/
class UCL_Device { class UCL_Device {
public: public:
/// Collect properties for every GPU on the node /// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/ /** \note You must set the active GPU with set() before using the device **/
inline UCL_Device(); inline UCL_Device();
inline ~UCL_Device(); inline ~UCL_Device();
/// Returns 1 (For compatibility with OpenCL) /// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; } inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform /// Return a string with name and info of the current platform
inline std::string platform_name() inline std::string platform_name()
{ return "NVIDIA Corporation NVIDIA CUDA Driver"; } { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
/// Delete any contexts/data and set the platform number to be used /// Delete any contexts/data and set the platform number to be used
@ -97,24 +97,24 @@ class UCL_Device {
/// Returns the default stream for the current device /// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); } inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i /// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; } inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed /// Block until all commands in the default stream have completed
inline void sync() { sync(0); } inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed /// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); } inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device /// Get the number of command queues currently available on device
inline int num_queues() inline int num_queues()
{ return _cq.size(); } { return _cq.size(); }
/// Add a stream for device computations /// Add a stream for device computations
inline void push_command_queue() { inline void push_command_queue() {
_cq.push_back(CUstream()); _cq.push_back(CUstream());
CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0)); CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
} }
/// Remove a stream for device computations /// Remove a stream for device computations
@ -124,19 +124,19 @@ class UCL_Device {
CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back())); CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back()));
_cq.pop_back(); _cq.pop_back();
} }
/// Set the default command queue (by default this is the null stream) /// Set the default command queue (by default this is the null stream)
/** \param i index of the command queue (as added by push_command_queue()) /** \param i index of the command queue (as added by push_command_queue())
If i is 0, the default command queue is set to the null stream **/ If i is 0, the default command queue is set to the null stream **/
inline void set_command_queue(const int i) { inline void set_command_queue(const int i) {
if (i==0) _cq[0]=0; if (i==0) _cq[0]=0;
else _cq[0]=_cq[i]; else _cq[0]=_cq[i];
} }
/// Get the current CUDA device name /// Get the current CUDA device name
inline std::string name() { return name(_device); } inline std::string name() { return name(_device); }
/// Get the CUDA device name /// Get the CUDA device name
inline std::string name(const int i) inline std::string name(const int i)
{ return std::string(_properties[i].name); } { return std::string(_properties[i].name); }
/// Get a string telling the type of the current device /// Get a string telling the type of the current device
@ -148,38 +148,38 @@ class UCL_Device {
inline int device_type() { return device_type(_device); } inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; } inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); } inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device /// Returns true if double precision is supported for the current device
inline bool double_precision() { return double_precision(_device); } inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device /// Returns true if double precision is supported for the device
inline bool double_precision(const int i) {return arch(i)>=1.3;} inline bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of compute units on the current device /// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); } inline unsigned cus() { return cus(_device); }
/// Get the number of compute units /// Get the number of compute units
inline unsigned cus(const int i) inline unsigned cus(const int i)
{ return _properties[i].multiProcessorCount; } { return _properties[i].multiProcessorCount; }
/// Get the number of cores in the current device /// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); } inline unsigned cores() { return cores(_device); }
/// Get the number of cores /// Get the number of cores
inline unsigned cores(const int i) inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32; else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48; else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
else return _properties[i].multiProcessorCount*192; } else return _properties[i].multiProcessorCount*192; }
/// Get the gigabytes of global memory in the current device /// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); } inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory /// Get the gigabytes of global memory
inline double gigabytes(const int i) inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; } { return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
/// Get the bytes of global memory in the current device /// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); } inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory /// Get the bytes of global memory
@ -188,13 +188,13 @@ class UCL_Device {
// Get the gigabytes of free memory in the current device // Get the gigabytes of free memory in the current device
inline double free_gigabytes() { return free_gigabytes(_device); } inline double free_gigabytes() { return free_gigabytes(_device); }
// Get the gigabytes of free memory // Get the gigabytes of free memory
inline double free_gigabytes(const int i) inline double free_gigabytes(const int i)
{ return static_cast<double>(free_bytes(i))/1073741824; } { return static_cast<double>(free_bytes(i))/1073741824; }
// Get the bytes of free memory in the current device // Get the bytes of free memory in the current device
inline size_t free_bytes() { return free_bytes(_device); } inline size_t free_bytes() { return free_bytes(_device); }
// Get the bytes of free memory // Get the bytes of free memory
inline size_t free_bytes(const int i) { inline size_t free_bytes(const int i) {
CUDA_INT_TYPE dfree, dtotal; CUDA_INT_TYPE dfree, dtotal;
CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal)); CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal));
return static_cast<size_t>(dfree); return static_cast<size_t>(dfree);
@ -203,21 +203,21 @@ class UCL_Device {
/// Return the GPGPU compute capability for current device /// Return the GPGPU compute capability for current device
inline double arch() { return arch(_device); } inline double arch() { return arch(_device); }
/// Return the GPGPU compute capability /// Return the GPGPU compute capability
inline double arch(const int i) inline double arch(const int i)
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;} { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device /// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); } inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz /// Clock rate in GHz
inline double clock_rate(const int i) inline double clock_rate(const int i)
{ return _properties[i].p.clockRate*1e-6;} { return _properties[i].p.clockRate*1e-6;}
/// Get the maximum number of threads per block /// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); } inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block /// Get the maximum number of threads per block
inline size_t group_size(const int i) inline size_t group_size(const int i)
{ return _properties[i].p.maxThreadsPerBlock; } { return _properties[i].p.maxThreadsPerBlock; }
/// Return the maximum memory pitch in bytes for current device /// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); } inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes /// Return the maximum memory pitch in bytes
@ -242,7 +242,7 @@ class UCL_Device {
{ return fission_by_counts(_device); } { return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported /// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i) inline bool fission_by_counts(const int i)
{ return false; } { return false; }
/// True if splitting device into subdevices by affinity domains supported /// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity() inline bool fission_by_affinity()
{ return fission_by_affinity(_device); } { return fission_by_affinity(_device); }
@ -259,7 +259,7 @@ class UCL_Device {
/// List all devices along with all properties /// List all devices along with all properties
inline void print_all(std::ostream &out); inline void print_all(std::ostream &out);
private: private:
int _device, _num_devices; int _device, _num_devices;
std::vector<NVDProperties> _properties; std::vector<NVDProperties> _properties;
@ -279,16 +279,16 @@ UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m)); CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
if (major==9999) if (major==9999)
continue; continue;
_properties.push_back(NVDProperties()); _properties.push_back(NVDProperties());
_properties.back().device_id=dev; _properties.back().device_id=dev;
_properties.back().major=major; _properties.back().major=major;
_properties.back().minor=minor; _properties.back().minor=minor;
char namecstr[1024]; char namecstr[1024];
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m)); CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
_properties.back().name=namecstr; _properties.back().name=namecstr;
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m)); CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount, CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@ -296,23 +296,23 @@ UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m)); CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
#if CUDA_VERSION >= 2020 #if CUDA_VERSION >= 2020
CU_SAFE_CALL_NS(cuDeviceGetAttribute( CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().kernelExecTimeoutEnabled, &_properties.back().kernelExecTimeoutEnabled,
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev)); CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute( CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().integrated, &_properties.back().integrated,
CU_DEVICE_ATTRIBUTE_INTEGRATED, dev)); CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute( CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().canMapHostMemory, &_properties.back().canMapHostMemory,
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev)); CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode, CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode,
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev)); CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
#endif #endif
#if CUDA_VERSION >= 3010 #if CUDA_VERSION >= 3010
CU_SAFE_CALL_NS(cuDeviceGetAttribute( CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().concurrentKernels, &_properties.back().concurrentKernels,
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev)); CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute( CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().ECCEnabled, &_properties.back().ECCEnabled,
CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev)); CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
#endif #endif
} }
@ -365,7 +365,7 @@ void UCL_Device::print_all(std::ostream &out) {
cuDriverGetVersion(&driver_version); cuDriverGetVersion(&driver_version);
out << "CUDA Driver Version: " out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100 << driver_version/1000 << "." << driver_version%100
<< std::endl; << std::endl;
#endif #endif
if (num_devices() == 0) if (num_devices() == 0)

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -35,15 +35,15 @@ template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector; template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix; template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256 #define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file /// Class storing 1 or more kernel functions from a single string or file
class UCL_Program { class UCL_Program {
public: public:
inline UCL_Program(UCL_Device &device) { _cq=device.cq(); } inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
inline UCL_Program(UCL_Device &device, const void *program, inline UCL_Program(UCL_Device &device, const void *program,
const char *flags="", std::string *log=NULL) { const char *flags="", std::string *log=NULL) {
_cq=device.cq(); _cq=device.cq();
init(device); init(device);
load_string(program,flags,log); load_string(program,flags,log);
} }
@ -61,20 +61,20 @@ class UCL_Program {
std::string *log=NULL) { std::string *log=NULL) {
std::ifstream in(filename); std::ifstream in(filename);
if (!in || in.is_open()==false) { if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: " std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl; << filename << std::endl;
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
#endif #endif
return UCL_FILE_NOT_FOUND; return UCL_FILE_NOT_FOUND;
} }
std::string program((std::istreambuf_iterator<char>(in)), std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>()); std::istreambuf_iterator<char>());
in.close(); in.close();
return load_string(program.c_str(),flags,log); return load_string(program.c_str(),flags,log);
} }
/// Load a program from a string and compile with flags /// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="", inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) { std::string *log=NULL) {
@ -94,12 +94,12 @@ class UCL_Program {
CUresult err=cuModuleLoadDataEx(&_module,program,num_opts, CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
options,(void **)values); options,(void **)values);
if (log!=NULL) if (log!=NULL)
*log=std::string(clog); *log=std::string(clog);
if (err != CUDA_SUCCESS) { if (err != CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << std::endl std::cerr << std::endl
<< "----------------------------------------------------------\n" << "----------------------------------------------------------\n"
<< " UCL Error: Error compiling PTX Program...\n" << " UCL Error: Error compiling PTX Program...\n"
@ -108,24 +108,24 @@ class UCL_Program {
#endif #endif
return UCL_COMPILE_ERROR; return UCL_COMPILE_ERROR;
} }
return UCL_SUCCESS; return UCL_SUCCESS;
} }
/// Load a precompiled program from a file /// Load a precompiled program from a file
inline int load_binary(const char *filename) { inline int load_binary(const char *filename) {
CUmodule _module; CUmodule _module;
CUresult err = cuModuleLoad(&_module,filename); CUresult err = cuModuleLoad(&_module,filename);
if (err==301) { if (err==301) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open binary kernel file: " std::cerr << "UCL Error: Could not open binary kernel file: "
<< filename << std::endl; << filename << std::endl;
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
#endif #endif
return UCL_FILE_NOT_FOUND; return UCL_FILE_NOT_FOUND;
} else if (err!=CUDA_SUCCESS) { } else if (err!=CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Error loading binary kernel file: " std::cerr << "UCL Error: Error loading binary kernel file: "
<< filename << std::endl; << filename << std::endl;
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
#endif #endif
@ -138,7 +138,7 @@ class UCL_Program {
// return UCL_ERROR; // return UCL_ERROR;
return UCL_SUCCESS; return UCL_SUCCESS;
} }
friend class UCL_Kernel; friend class UCL_Kernel;
private: private:
CUmodule _module; CUmodule _module;
@ -149,23 +149,23 @@ class UCL_Program {
/// Class for dealing with CUDA Driver kernels /// Class for dealing with CUDA Driver kernels
class UCL_Kernel { class UCL_Kernel {
public: public:
UCL_Kernel() : _dimensions(1), _num_args(0) { UCL_Kernel() : _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000 #if CUDA_VERSION < 4000
_param_size=0; _param_size=0;
#endif #endif
_num_blocks[0]=0; _num_blocks[0]=0;
} }
UCL_Kernel(UCL_Program &program, const char *function) : UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0) { _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000 #if CUDA_VERSION < 4000
_param_size=0; _param_size=0;
#endif #endif
_num_blocks[0]=0; _num_blocks[0]=0;
set_function(program,function); set_function(program,function);
_cq=program._cq; _cq=program._cq;
} }
~UCL_Kernel() {} ~UCL_Kernel() {}
/// Clear any function associated with the kernel /// Clear any function associated with the kernel
@ -189,7 +189,7 @@ class UCL_Kernel {
/// Set the kernel argument. /// Set the kernel argument.
/** If not a device pointer, this must be repeated each time the argument /** If not a device pointer, this must be repeated each time the argument
* changes * changes
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/ * \note To set kernel parameter i (i>0), parameter i-1 must be set **/
template <class dtype> template <class dtype>
inline void set_arg(const unsigned index, const dtype * const arg) { inline void set_arg(const unsigned index, const dtype * const arg) {
@ -202,27 +202,27 @@ class UCL_Kernel {
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype))); CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
#endif #endif
else else
assert(0==1); // Must add kernel parameters in sequential order assert(0==1); // Must add kernel parameters in sequential order
} }
/// Set a geryon container as a kernel argument. /// Set a geryon container as a kernel argument.
template <class numtyp> template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg) inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); } { set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument. /// Set a geryon container as a kernel argument.
template <class numtyp> template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg) inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); } { set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument. /// Set a geryon container as a kernel argument.
template <class hosttype, class devtype> template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg) inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); } { set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument. /// Set a geryon container as a kernel argument.
template <class hosttype, class devtype> template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg) inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); } { set_arg(&arg->device.begin()); }
/// Add a kernel argument. /// Add a kernel argument.
@ -257,37 +257,37 @@ class UCL_Kernel {
/// Add a geryon container as a kernel argument. /// Add a geryon container as a kernel argument.
template <class numtyp> template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg) inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); } { add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument. /// Add a geryon container as a kernel argument.
template <class numtyp> template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg) inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); } { add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument. /// Add a geryon container as a kernel argument.
template <class hosttype, class devtype> template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg) inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); } { add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument. /// Add a geryon container as a kernel argument.
template <class hosttype, class devtype> template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg) inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); } { add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block /// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added /** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/ \note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) { inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1; _dimensions=1;
_num_blocks[0]=num_blocks; _num_blocks[0]=num_blocks;
_num_blocks[1]=1; _num_blocks[1]=1;
_num_blocks[2]=1; _num_blocks[2]=1;
#if CUDA_VERSION >= 4000 #if CUDA_VERSION >= 4000
_block_size[0]=block_size; _block_size[0]=block_size;
_block_size[1]=1; _block_size[1]=1;
_block_size[2]=1; _block_size[2]=1;
#else #else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1)); CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
#endif #endif
} }
@ -303,43 +303,43 @@ class UCL_Kernel {
/** \note This should be called before any arguments have been added /** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/ \note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) { const size_t block_size_x, const size_t block_size_y) {
_dimensions=2; _dimensions=2;
_num_blocks[0]=num_blocks_x; _num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y; _num_blocks[1]=num_blocks_y;
_num_blocks[2]=1; _num_blocks[2]=1;
#if CUDA_VERSION >= 4000 #if CUDA_VERSION >= 4000
_block_size[0]=block_size_x; _block_size[0]=block_size_x;
_block_size[1]=block_size_y; _block_size[1]=block_size_y;
_block_size[2]=1; _block_size[2]=1;
#else #else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1)); CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
#endif #endif
} }
/// Set the number of thread blocks and the number of threads in each block /// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added /** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/ \note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y, const size_t block_size_x, const size_t block_size_y,
command_queue &cq) command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);} {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block /// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added /** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/ \note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) { const size_t block_size_y, const size_t block_size_z) {
_dimensions=2; _dimensions=2;
_num_blocks[0]=num_blocks_x; _num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y; _num_blocks[1]=num_blocks_y;
_num_blocks[2]=1; _num_blocks[2]=1;
#if CUDA_VERSION >= 4000 #if CUDA_VERSION >= 4000
_block_size[0]=block_size_x; _block_size[0]=block_size_x;
_block_size[1]=block_size_y; _block_size[1]=block_size_y;
_block_size[2]=block_size_z; _block_size[2]=block_size_z;
#else #else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y, CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
block_size_z)); block_size_z));
#endif #endif
@ -352,10 +352,10 @@ class UCL_Kernel {
const size_t block_size_x, const size_t block_size_y, const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) { const size_t block_size_z, command_queue &cq) {
_cq=cq; _cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z); block_size_z);
} }
/// Run the kernel in the default command queue /// Run the kernel in the default command queue
inline void run() { inline void run() {
#if CUDA_VERSION >= 4000 #if CUDA_VERSION >= 4000
@ -367,12 +367,12 @@ class UCL_Kernel {
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq)); CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
#endif #endif
} }
/// Clear any arguments associated with the kernel /// Clear any arguments associated with the kernel
inline void clear_args() { inline void clear_args() {
_num_args=0; _num_args=0;
#if CUDA_VERSION < 4000 #if CUDA_VERSION < 4000
_offsets.clear(); _offsets.clear();
_param_size=0; _param_size=0;
#endif #endif
} }
@ -390,7 +390,7 @@ class UCL_Kernel {
unsigned _num_blocks[3]; unsigned _num_blocks[3];
unsigned _num_args; unsigned _num_args;
friend class UCL_Texture; friend class UCL_Texture;
#if CUDA_VERSION >= 4000 #if CUDA_VERSION >= 4000
unsigned _block_size[3]; unsigned _block_size[3];
void * _kernel_args[UCL_MAX_KERNEL_ARGS]; void * _kernel_args[UCL_MAX_KERNEL_ARGS];

View File

@ -17,12 +17,12 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
/*! \file */ /*! \file */
#ifndef NVD_MAT_H #ifndef NVD_MAT_H
#define NVD_MAT_H #define NVD_MAT_H
@ -52,6 +52,6 @@ namespace ucl_cudadr {
#include "ucl_print.h" #include "ucl_print.h"
#undef UCL_PRINT_ALLOW #undef UCL_PRINT_ALLOW
} // namespace ucl_cudadr } // namespace ucl_cudadr
#endif #endif

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -46,7 +46,7 @@ typedef CUdeviceptr device_ptr;
// - HOST MEMORY ALLOCATION ROUTINES // - HOST MEMORY ALLOCATION ROUTINES
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
template <class mat_type, class copy_type> template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS; CUresult err=CUDA_SUCCESS;
if (kind==UCL_NOT_PINNED) if (kind==UCL_NOT_PINNED)
@ -62,7 +62,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
} }
template <class mat_type> template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS; CUresult err=CUDA_SUCCESS;
if (kind==UCL_NOT_PINNED) if (kind==UCL_NOT_PINNED)
@ -95,7 +95,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (mat.kind()==UCL_WRITE_ONLY) else if (mat.kind()==UCL_WRITE_ONLY)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else else
err=cuMemAllocHost((void **)mat.host_ptr(),n); err=cuMemAllocHost((void **)mat.host_ptr(),n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
@ -130,30 +130,30 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols, size_t &pitch, const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) { const enum UCL_MEMOPT kind) {
CUresult err; CUresult err;
CUDA_INT_TYPE upitch; CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch, err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16); cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch); pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS) if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
mat.cq()=cm.cq(); mat.cq()=cm.cq();
return UCL_SUCCESS; return UCL_SUCCESS;
} }
template <class mat_type, class copy_type> template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows, inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols, size_t &pitch, const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) { const enum UCL_MEMOPT kind) {
CUresult err; CUresult err;
unsigned upitch; unsigned upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch, err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16); cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch); pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS) if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
mat.cq()=d.cq(); mat.cq()=d.cq();
return UCL_SUCCESS; return UCL_SUCCESS;
} }
template <class mat_type> template <class mat_type>
inline void _device_free(mat_type &mat) { inline void _device_free(mat_type &mat) {
@ -175,33 +175,33 @@ inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) { const size_t cols, size_t &pitch) {
_device_free(mat); _device_free(mat);
CUresult err; CUresult err;
CUDA_INT_TYPE upitch; CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch, err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16); cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch); pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS) if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
return UCL_SUCCESS; return UCL_SUCCESS;
} }
/// Make *ptr refer to the same device address as in (no copy is made)
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
  ptr[0]=in;
}
template <class numtyp> template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in) { inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
*ptr=0; *ptr=0;
} }
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in, inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
const size_t offset, const size_t numsize) { const size_t offset, const size_t numsize) {
*ptr=in+offset*numsize; *ptr=in+offset*numsize;
} }
template <class numtyp> template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in, inline void _device_view(CUdeviceptr *ptr, numtyp *in,
const size_t offset, const size_t numsize) { const size_t offset, const size_t numsize) {
*ptr=0; *ptr=0;
} }
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
@ -211,13 +211,13 @@ template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows, inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols) { const size_t cols) {
assert(0==1); assert(0==1);
} }
/// Image allocation entry point taking a device; always trips an assertion
/** The body only contains a failing assert, so calling it aborts when
  * assertions are enabled and does nothing otherwise. Presumably image
  * allocation is not supported by this backend - confirm. **/
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d,
                                const size_t rows, const size_t cols) {
  assert(0==1);
}
template <class mat_type> template <class mat_type>
inline void _device_image_free(mat_type &mat) { inline void _device_image_free(mat_type &mat) {
@ -245,7 +245,7 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
// - HELPER FUNCTIONS FOR MEMCPY ROUTINES // - HELPER FUNCTIONS FOR MEMCPY ROUTINES
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch, inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
ins.srcXInBytes=0; ins.srcXInBytes=0;
@ -257,13 +257,13 @@ inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
ins.WidthInBytes=cols; ins.WidthInBytes=cols;
ins.Height=rows; ins.Height=rows;
} }
template <int mem> struct _nvd_set_2D_mem; template <int mem> struct _nvd_set_2D_mem;
template <> struct _nvd_set_2D_mem<1> template <> struct _nvd_set_2D_mem<1>
{ static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } }; { static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
template <> struct _nvd_set_2D_mem<2> template <> struct _nvd_set_2D_mem<2>
{ static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } }; { static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
template <int mem> struct _nvd_set_2D_mem template <int mem> struct _nvd_set_2D_mem
{ static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } }; { static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };
@ -285,7 +285,7 @@ template<> struct _ucl_memcpy<2,2> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -297,7 +297,7 @@ template<> struct _ucl_memcpy<2,2> {
CU_SAFE_CALL(cuMemcpy2D(&ins)); CU_SAFE_CALL(cuMemcpy2D(&ins));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) { const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -322,7 +322,7 @@ template<> struct _ucl_memcpy<2,0> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -334,7 +334,7 @@ template<> struct _ucl_memcpy<2,0> {
CU_SAFE_CALL(cuMemcpy2D(&ins)); CU_SAFE_CALL(cuMemcpy2D(&ins));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) { const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -359,7 +359,7 @@ template<> struct _ucl_memcpy<2,1> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -371,7 +371,7 @@ template<> struct _ucl_memcpy<2,1> {
CU_SAFE_CALL(cuMemcpy2D(&ins)); CU_SAFE_CALL(cuMemcpy2D(&ins));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) { const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -396,7 +396,7 @@ template<> struct _ucl_memcpy<0,2> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -408,7 +408,7 @@ template<> struct _ucl_memcpy<0,2> {
CU_SAFE_CALL(cuMemcpy2D(&ins)); CU_SAFE_CALL(cuMemcpy2D(&ins));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) { const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -433,7 +433,7 @@ template<> struct _ucl_memcpy<1,2> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -445,7 +445,7 @@ template<> struct _ucl_memcpy<1,2> {
CU_SAFE_CALL(cuMemcpy2D(&ins)); CU_SAFE_CALL(cuMemcpy2D(&ins));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) { const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -470,7 +470,7 @@ template <> struct _ucl_memcpy<1,0> {
CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq)); CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -482,7 +482,7 @@ template <> struct _ucl_memcpy<1,0> {
CU_SAFE_CALL(cuMemcpy2D(&ins)); CU_SAFE_CALL(cuMemcpy2D(&ins));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) { const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -507,7 +507,7 @@ template <> struct _ucl_memcpy<0,1> {
CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq)); CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -519,7 +519,7 @@ template <> struct _ucl_memcpy<0,1> {
CU_SAFE_CALL(cuMemcpy2D(&ins)); CU_SAFE_CALL(cuMemcpy2D(&ins));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) { const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -542,7 +542,7 @@ template <> struct _ucl_memcpy<1,1> {
CUstream &cq) CUstream &cq)
{ memcpy(dst.begin(),src.begin(),n); } { memcpy(dst.begin(),src.begin(),n); }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -554,7 +554,7 @@ template <> struct _ucl_memcpy<1,1> {
CU_SAFE_CALL(cuMemcpy2D(&ins)); CU_SAFE_CALL(cuMemcpy2D(&ins));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) { const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
@ -579,18 +579,18 @@ template <int mem1, int mem2> struct _ucl_memcpy {
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq)); CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
if (p1::PADDED==0 || p2::PADDED==0) { if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0; size_t src_offset=0, dst_offset=0;
for (size_t i=0; i<rows; i++) { for (size_t i=0; i<rows; i++) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset, CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols)); src.cbegin()+src_offset,cols));
src_offset+=spitch; src_offset+=spitch;
dst_offset+=dpitch; dst_offset+=dpitch;
} }
} else { } else {
CUDA_MEMCPY2D ins; CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a(); ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
@ -601,12 +601,12 @@ template <int mem1, int mem2> struct _ucl_memcpy {
} }
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) { const size_t rows, CUstream &cq) {
if (p1::PADDED==0 || p2::PADDED==0) { if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0; size_t src_offset=0, dst_offset=0;
for (size_t i=0; i<rows; i++) { for (size_t i=0; i<rows; i++) {
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset, CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols,cq)); src.cbegin()+src_offset,cols,cq));
src_offset+=spitch; src_offset+=spitch;
@ -636,22 +636,22 @@ inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
} }
template<class mat1, class mat2> template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols, _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows); rows);
} }
template<class mat1, class mat2> template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows,CUstream &cq) { const size_t rows,CUstream &cq) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols, _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows,cq); rows,cq);
} }
} // namespace ucl_cudadr
#endif #endif

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -28,7 +28,7 @@
#include "nvd_mat.h" #include "nvd_mat.h"
namespace ucl_cudadr { namespace ucl_cudadr {
/// Class storing a texture reference /// Class storing a texture reference
class UCL_Texture { class UCL_Texture {
public: public:
@ -38,39 +38,39 @@ class UCL_Texture {
/// Construct a texture object bound to the reference texture_name in prog
inline UCL_Texture(UCL_Program &prog, const char *texture_name) {
  get_texture(prog,texture_name);
}
/// Look up the texture reference named texture_name in prog's module
/** The handle is stored in this object for later binding. **/
inline void get_texture(UCL_Program &prog, const char *texture_name) {
  CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name));
}
/// Bind a float array where each fetch grabs a vector of length numel /// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp> template<class numtyp>
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel) inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); } { _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel /// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp> template<class numtyp>
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel) inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); } { _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel /// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp> template<class numtyp, class devtyp>
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel) inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); } { _bind_float(vec.device,numel); }
/// Bind a float array where each fetch grabs a vector of length numel /// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp> template<class numtyp, class devtyp>
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel) inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); } { _bind_float(vec.device,numel); }
/// Unbind the texture reference from the memory allocation
/** The body is empty: this call currently does nothing. **/
inline void unbind() {
}
/// Make this texture reference available to kernel
/** Only the pre-4.0 driver API path registers the texture with the kernel;
  * for CUDA_VERSION >= 4000 this function compiles to nothing. **/
inline void allow(UCL_Kernel &kernel) {
#if CUDA_VERSION < 4000
  CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
#endif
}
private: private:
CUtexref _tex; CUtexref _tex;
friend class UCL_Kernel; friend class UCL_Kernel;
@ -80,7 +80,7 @@ class UCL_Texture {
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
assert(numel!=0 && numel<5); assert(numel!=0 && numel<5);
#endif #endif
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(), CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
vec.numel()*vec.element_size())); vec.numel()*vec.element_size()));
if (vec.element_size()==sizeof(float)) if (vec.element_size()==sizeof(float))
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel)); CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -41,7 +41,7 @@ class UCL_Timer {
/// Clear any data associated with timer /// Clear any data associated with timer
/** \note init() must be called to reuse timer after a clear() **/ /** \note init() must be called to reuse timer after a clear() **/
inline void clear() { inline void clear() {
if (_initialized) { if (_initialized) {
CU_DESTRUCT_CALL(cuEventDestroy(start_event)); CU_DESTRUCT_CALL(cuEventDestroy(start_event));
CU_DESTRUCT_CALL(cuEventDestroy(stop_event)); CU_DESTRUCT_CALL(cuEventDestroy(stop_event));
_initialized=false; _initialized=false;
@ -63,16 +63,16 @@ class UCL_Timer {
/// Record the start event on the timer's command queue
inline void start() {
  CU_SAFE_CALL(cuEventRecord(start_event,_cq));
}
/// Record the stop event on the timer's command queue
inline void stop() {
  CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
}
/// Block until the start event has been reached on device /// Block until the start event has been reached on device
inline void sync_start() inline void sync_start()
{ CU_SAFE_CALL(cuEventSynchronize(start_event)); } { CU_SAFE_CALL(cuEventSynchronize(start_event)); }
/// Block until the stop event has been reached on device /// Block until the stop event has been reached on device
inline void sync_stop() inline void sync_stop()
{ CU_SAFE_CALL(cuEventSynchronize(stop_event)); } { CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
/// Set the time elapsed to zero (not the total_time) /// Set the time elapsed to zero (not the total_time)
@ -80,29 +80,29 @@ class UCL_Timer {
CU_SAFE_CALL(cuEventRecord(start_event,_cq)); CU_SAFE_CALL(cuEventRecord(start_event,_cq));
CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
} }
/// Reset the accumulated total time to zero
inline void zero_total() {
  _total_time=0.0;
}
/// Add time from previous start and stop to total /// Add time from previous start and stop to total
/** Forces synchronization **/ /** Forces synchronization **/
inline double add_to_total() inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; } { double t=time(); _total_time+=t; return t/1000.0; }
/// Add a caller-supplied time (ms) to the running total
inline void add_time_to_total(const double t) {
  _total_time+=t;
}
/// Return the time (ms) of last start to stop - Forces synchronization /// Return the time (ms) of last start to stop - Forces synchronization
inline double time() { inline double time() {
float timer; float timer;
CU_SAFE_CALL(cuEventSynchronize(stop_event)); CU_SAFE_CALL(cuEventSynchronize(stop_event));
CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) ); CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) );
return timer; return timer;
} }
/// Return the time (s) between the last start and stop events
/** Forces synchronization (through time()) **/
inline double seconds() {
  const double elapsed_ms=time();
  return elapsed_ms/1000.0;
}
/// Return the accumulated total time in ms
inline double total_time() {
  return _total_time;
}

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -40,13 +40,13 @@
#include "ucl_types.h" #include "ucl_types.h"
namespace ucl_opencl { namespace ucl_opencl {
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF // - COMMAND QUEUE STUFF
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
typedef cl_command_queue command_queue; typedef cl_command_queue command_queue;
typedef cl_context context_type; typedef cl_context context_type;
/// Block the host until every command issued to cq has finished
inline void ucl_sync(cl_command_queue &cq)
{
  CL_SAFE_CALL(clFinish(cq));
}
@ -76,19 +76,19 @@ struct OCLProperties {
/// Class for looking at data parallel device properties /// Class for looking at data parallel device properties
/** \note Calls to change the device outside of the class results in incorrect /** \note Calls to change the device outside of the class results in incorrect
* behavior * behavior
* \note There is no error checking for indexing past the number of devices **/ * \note There is no error checking for indexing past the number of devices **/
class UCL_Device { class UCL_Device {
public: public:
/// Collect properties for every device on the node /// Collect properties for every device on the node
/** \note You must set the active GPU with set() before using the device **/ /** \note You must set the active GPU with set() before using the device **/
inline UCL_Device(); inline UCL_Device();
inline ~UCL_Device(); inline ~UCL_Device();
/// Number of platforms found (0 on error or when none are available)
inline int num_platforms() {
  return _num_platforms;
}
/// Return a string with name and info of the current platform /// Return a string with name and info of the current platform
inline std::string platform_name(); inline std::string platform_name();
@ -104,38 +104,38 @@ class UCL_Device {
* be allocated for use. clear() is called to delete any contexts and * be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/ * associated data from previous calls to set(). **/
inline int set(int num); inline int set(int num);
/// Delete any context and associated data stored from a call to set() /// Delete any context and associated data stored from a call to set()
inline void clear(); inline void clear();
/// Index of the currently selected device
inline int device_num() {
  return _device;
}
/// Reference to the OpenCL context held for the current device
inline cl_context & context() {
  return _context;
}
/// Reference to the default command queue of the current device
inline command_queue & cq() {
  return cq(_default_cq);
}

/// Reference to the command queue stored at index i (no bounds check)
inline command_queue & cq(const int i) {
  return _cq[i];
}
/// Choose which command queue is the default
/** \param i index of the command queue (as added by push_command_queue());
  *          index 0 is the queue created with device initialization **/
inline void set_command_queue(const int i) {
  _default_cq=i;
}
/// Block until all commands in the default stream have completed /// Block until all commands in the default stream have completed
inline void sync() { sync(_default_cq); } inline void sync() { sync(_default_cq); }
/// Block until all commands in the specified stream have completed /// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); } inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device /// Get the number of command queues currently available on device
inline int num_queues() inline int num_queues()
{ return _cq.size(); } { return _cq.size(); }
/// Add a command queue for device computations (with profiling enabled) /// Add a command queue for device computations (with profiling enabled)
inline void push_command_queue() { inline void push_command_queue() {
cl_int errorv; cl_int errorv;
@ -143,7 +143,7 @@ class UCL_Device {
_cq.back()=clCreateCommandQueue(_context,_cl_device, _cq.back()=clCreateCommandQueue(_context,_cl_device,
CL_QUEUE_PROFILING_ENABLE,&errorv); CL_QUEUE_PROFILING_ENABLE,&errorv);
if (errorv!=CL_SUCCESS) { if (errorv!=CL_SUCCESS) {
std::cerr << "Could not create command queue on device: " << name() std::cerr << "Could not create command queue on device: " << name()
<< std::endl; << std::endl;
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
} }
@ -160,76 +160,76 @@ class UCL_Device {
/// Get the current OpenCL device name /// Get the current OpenCL device name
inline std::string name() { return name(_device); } inline std::string name() { return name(_device); }
/// Get the OpenCL device name /// Get the OpenCL device name
inline std::string name(const int i) inline std::string name(const int i)
{ return std::string(_properties[i].name); } { return std::string(_properties[i].name); }
/// String describing the type of the currently selected device
inline std::string device_type_name() {
  return device_type_name(_device);
}
/// Get a string telling the type of the device /// Get a string telling the type of the device
inline std::string device_type_name(const int i); inline std::string device_type_name(const int i);
/// Type of the current device (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() {
  return device_type(_device);
}
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i); inline int device_type(const int i);
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); } inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) inline bool shared_memory(const int i)
{ return _shared_mem_device(_properties[i].device_type); } { return _shared_mem_device(_properties[i].device_type); }
/// Returns true if double precision is support for the current device /// Returns true if double precision is support for the current device
inline bool double_precision() { return double_precision(_device); } inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is support for the device /// Returns true if double precision is support for the device
inline bool double_precision(const int i) inline bool double_precision(const int i)
{return _properties[i].double_precision;} {return _properties[i].double_precision;}
/// Get the number of compute units on the current device /// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); } inline unsigned cus() { return cus(_device); }
/// Get the number of compute units /// Get the number of compute units
inline unsigned cus(const int i) inline unsigned cus(const int i)
{ return _properties[i].compute_units; } { return _properties[i].compute_units; }
/// Get the gigabytes of global memory in the current device /// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); } inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory /// Get the gigabytes of global memory
inline double gigabytes(const int i) inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].global_mem)/1073741824; } { return static_cast<double>(_properties[i].global_mem)/1073741824; }
/// Global memory of the current device in bytes
inline size_t bytes() {
  return bytes(_device);
}

/// Global memory of device i in bytes
inline size_t bytes(const int i) {
  return _properties[i].global_mem;
}
/// Return the GPGPU revision number for current device /// Return the GPGPU revision number for current device
//inline double revision() { return revision(_device); } //inline double revision() { return revision(_device); }
/// Return the GPGPU revision number /// Return the GPGPU revision number
//inline double revision(const int i) //inline double revision(const int i)
// { return //static_cast<double>(_properties[i].minor)/10+_properties[i].major;} // { return //static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate of the current device in GHz
inline double clock_rate() {
  return clock_rate(_device);
}

/// Clock rate of device i in GHz (stored clock value scaled by 1e-3)
inline double clock_rate(const int i) {
  return _properties[i].clock*1e-3;
}
/// Address alignment of the current device in bytes
inline int alignment() {
  return alignment(_device);
}

/// Address alignment of device i in bytes
inline int alignment(const int i) {
  return _properties[i].alignment;
}
/// Return the timer resolution /// Return the timer resolution
inline size_t timer_resolution() { return timer_resolution(_device); } inline size_t timer_resolution() { return timer_resolution(_device); }
/// Return the timer resolution /// Return the timer resolution
inline size_t timer_resolution(const int i) inline size_t timer_resolution(const int i)
{ return _properties[i].timer_resolution; } { return _properties[i].timer_resolution; }
/// Get the maximum number of threads per block /// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); } inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block /// Get the maximum number of threads per block
inline size_t group_size(const int i) inline size_t group_size(const int i)
{ return _properties[i].work_group_size; } { return _properties[i].work_group_size; }
/// Maximum memory pitch in bytes for the current device
inline size_t max_pitch() {
  return max_pitch(_device);
}
/// Return the maximum memory pitch in bytes /// Return the maximum memory pitch in bytes
@ -254,7 +254,7 @@ class UCL_Device {
{ return fission_by_counts(_device); } { return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported /// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i) inline bool fission_by_counts(const int i)
{ return _properties[i].partition_counts; } { return _properties[i].partition_counts; }
/// True if splitting device into subdevices by affinity domains supported /// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity() inline bool fission_by_affinity()
{ return fission_by_affinity(_device); } { return fission_by_affinity(_device); }
@ -271,10 +271,10 @@ class UCL_Device {
/// List all devices along with all properties /// List all devices along with all properties
inline void print_all(std::ostream &out); inline void print_all(std::ostream &out);
/// Reference to the OpenCL device id of the current device
inline cl_device_id & cl_device() {
  return _cl_device;
}
private: private:
int _num_platforms; // Number of platforms int _num_platforms; // Number of platforms
int _platform; // UCL_Device ID for current platform int _platform; // UCL_Device ID for current platform
@ -287,7 +287,7 @@ class UCL_Device {
std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices
int _num_devices; // Number of devices int _num_devices; // Number of devices
std::vector<OCLProperties> _properties; // Properties for each device std::vector<OCLProperties> _properties; // Properties for each device
inline void add_properties(cl_device_id); inline void add_properties(cl_device_id);
inline int create_context(); inline int create_context();
int _default_cq; int _default_cq;
@ -300,7 +300,7 @@ UCL_Device::UCL_Device() {
// --- Get Number of Platforms // --- Get Number of Platforms
cl_uint nplatforms; cl_uint nplatforms;
cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms); cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms);
if (errorv!=CL_SUCCESS) { if (errorv!=CL_SUCCESS) {
_num_platforms=0; _num_platforms=0;
return; return;
@ -328,18 +328,18 @@ void UCL_Device::clear() {
int UCL_Device::set_platform(int pid) { int UCL_Device::set_platform(int pid) {
clear(); clear();
cl_int errorv; cl_int errorv;
_cl_device=0; _cl_device=0;
_device=-1; _device=-1;
_num_devices=0; _num_devices=0;
_default_cq=0; _default_cq=0;
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
assert(pid<num_platforms()); assert(pid<num_platforms());
#endif #endif
_platform=pid; _platform=pid;
_cl_platform=_cl_platforms[_platform]; _cl_platform=_cl_platforms[_platform];
// --- Get Number of Devices // --- Get Number of Devices
cl_uint n; cl_uint n;
errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n); errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n);
@ -351,7 +351,7 @@ int UCL_Device::set_platform(int pid) {
cl_device_id device_list[_num_devices]; cl_device_id device_list[_num_devices];
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list, CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
&n)); &n));
// --- Store properties for each device // --- Store properties for each device
for (int i=0; i<_num_devices; i++) { for (int i=0; i<_num_devices; i++) {
_cl_devices.push_back(device_list[i]); _cl_devices.push_back(device_list[i]);
@ -385,7 +385,7 @@ void UCL_Device::add_properties(cl_device_id device_list) {
OCLProperties op; OCLProperties op;
char buffer[1024]; char buffer[1024];
cl_bool ans_bool; cl_bool ans_bool;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
op.name=buffer; op.name=buffer;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE, CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE,
@ -409,8 +409,8 @@ void UCL_Device::add_properties(cl_device_id device_list) {
NULL)); NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN, CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(cl_uint),&op.alignment,NULL)); sizeof(cl_uint),&op.alignment,NULL));
op.alignment/=8; op.alignment/=8;
// Determine if double precision is supported // Determine if double precision is supported
cl_uint double_width; cl_uint double_width;
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_SAFE_CALL(clGetDeviceInfo(device_list,
@ -420,11 +420,11 @@ void UCL_Device::add_properties(cl_device_id device_list) {
op.double_precision=false; op.double_precision=false;
else else
op.double_precision=true; op.double_precision=true;
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PROFILING_TIMER_RESOLUTION, CL_DEVICE_PROFILING_TIMER_RESOLUTION,
sizeof(size_t),&op.timer_resolution,NULL)); sizeof(size_t),&op.timer_resolution,NULL));
op.ecc_support=false; op.ecc_support=false;
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_SAFE_CALL(clGetDeviceInfo(device_list,
@ -432,7 +432,7 @@ void UCL_Device::add_properties(cl_device_id device_list) {
sizeof(ans_bool),&ans_bool,NULL)); sizeof(ans_bool),&ans_bool,NULL));
if (ans_bool==CL_TRUE) if (ans_bool==CL_TRUE)
op.ecc_support=true; op.ecc_support=true;
op.c_version=""; op.c_version="";
op.partition_equal=false; op.partition_equal=false;
op.partition_counts=false; op.partition_counts=false;
@ -458,30 +458,30 @@ void UCL_Device::add_properties(cl_device_id device_list) {
else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN)
op.partition_affinity=true; op.partition_affinity=true;
} }
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PARTITION_MAX_SUB_DEVICES, CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
sizeof(cl_uint),&op.max_sub_devices,NULL)); sizeof(cl_uint),&op.max_sub_devices,NULL));
#endif #endif
_properties.push_back(op); _properties.push_back(op);
} }
std::string UCL_Device::platform_name() { std::string UCL_Device::platform_name() {
char info[1024]; char info[1024];
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info, CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
NULL)); NULL));
std::string ans=std::string(info)+' '; std::string ans=std::string(info)+' ';
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info, CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info,
NULL)); NULL));
ans+=std::string(info)+' '; ans+=std::string(info)+' ';
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info, CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info,
NULL)); NULL));
ans+=std::string(info); ans+=std::string(info);
return ans; return ans;
} }
@ -512,7 +512,7 @@ int UCL_Device::device_type(const int i) {
// Set the OpenCL device to the specified device number
int UCL_Device::set(int num) { int UCL_Device::set(int num) {
clear(); clear();
cl_device_id device_list[_num_devices]; cl_device_id device_list[_num_devices];
cl_uint n; cl_uint n;
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
@ -557,7 +557,7 @@ void UCL_Device::print_all(std::ostream &out) {
<< _properties[i].work_item_size[1] << " x " << _properties[i].work_item_size[1] << " x "
<< _properties[i].work_item_size[2] << std::endl; << _properties[i].work_item_size[2] << std::endl;
//out << " Maximum sizes of each dimension of a grid: " //out << " Maximum sizes of each dimension of a grid: "
// << _properties[i].maxGridSize[0] << " x " // << _properties[i].maxGridSize[0] << " x "
// << _properties[i].maxGridSize[1] << " x " // << _properties[i].maxGridSize[1] << " x "
// << _properties[i].maxGridSize[2] << std::endl; // << _properties[i].maxGridSize[2] << std::endl;
//out << " Maximum memory pitch: " //out << " Maximum memory pitch: "

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -28,7 +28,7 @@
#include <fstream> #include <fstream>
namespace ucl_opencl { namespace ucl_opencl {
class UCL_Texture; class UCL_Texture;
template <class numtyp> class UCL_D_Vec; template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat; template <class numtyp> class UCL_D_Mat;
@ -41,10 +41,10 @@ class UCL_Program {
public: public:
/// Construct an uninitialized program; init() must be called before use
inline UCL_Program()
  : _init_done(false)
{
}

/// Construct a program bound to the given device (calls init())
inline UCL_Program(UCL_Device &device)
  : _init_done(false)
{
  init(device);
}

/// Construct a program bound to the device and compile it from a string
/** \param program source passed on to load_string()
  * \param flags   compile flags passed on to load_string()
  * \param log     optional output string passed on to load_string()
  * \note the return value of load_string() is not inspected here **/
inline UCL_Program(UCL_Device &device, const void *program,
                   const char *flags="", std::string *log=NULL)
  : _init_done(false)
{
  init(device);
  load_string(program,flags,log);
}
@ -56,7 +56,7 @@ class UCL_Program {
_device=device.cl_device(); _device=device.cl_device();
_context=device.context(); _context=device.context();
_cq=device.cq(); _cq=device.cq();
CL_SAFE_CALL(clRetainContext(_context)); CL_SAFE_CALL(clRetainContext(_context));
CL_SAFE_CALL(clRetainCommandQueue(_cq)); CL_SAFE_CALL(clRetainCommandQueue(_cq));
_init_done=true; _init_done=true;
} }
@ -65,7 +65,7 @@ class UCL_Program {
/** \note Must call init() after each clear **/ /** \note Must call init() after each clear **/
inline void clear() { inline void clear() {
if (_init_done) { if (_init_done) {
CL_DESTRUCT_CALL(clReleaseProgram(_program)); CL_DESTRUCT_CALL(clReleaseProgram(_program));
CL_DESTRUCT_CALL(clReleaseContext(_context)); CL_DESTRUCT_CALL(clReleaseContext(_context));
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
_init_done=false; _init_done=false;
@ -77,20 +77,20 @@ class UCL_Program {
std::string *log=NULL) { std::string *log=NULL) {
std::ifstream in(filename); std::ifstream in(filename);
if (!in || in.is_open()==false) { if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: " std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl; << filename << std::endl;
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
#endif #endif
return UCL_FILE_NOT_FOUND; return UCL_FILE_NOT_FOUND;
} }
std::string program((std::istreambuf_iterator<char>(in)), std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>()); std::istreambuf_iterator<char>());
in.close(); in.close();
return load_string(program.c_str(),flags,log); return load_string(program.c_str(),flags,log);
} }
/// Load a program from a string and compile with flags /// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="", inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) { std::string *log=NULL) {
@ -103,23 +103,23 @@ class UCL_Program {
CL_CHECK_ERR(error_flag); CL_CHECK_ERR(error_flag);
cl_build_status build_status; cl_build_status build_status;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device, CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,
CL_PROGRAM_BUILD_STATUS, CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status),&build_status, sizeof(cl_build_status),&build_status,
NULL)); NULL));
if (build_status != CL_SUCCESS || log!=NULL) { if (build_status != CL_SUCCESS || log!=NULL) {
size_t ms; size_t ms;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
NULL, &ms)); NULL, &ms));
char build_log[ms]; char build_log[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms, CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms,
build_log, NULL)); build_log, NULL));
if (log!=NULL) if (log!=NULL)
*log=std::string(build_log); *log=std::string(build_log);
if (build_status != CL_SUCCESS) { if (build_status != CL_SUCCESS) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << std::endl std::cerr << std::endl
<< "----------------------------------------------------------\n" << "----------------------------------------------------------\n"
<< " UCL Error: Error compiling OpenCL Program (" << " UCL Error: Error compiling OpenCL Program ("
@ -130,10 +130,10 @@ class UCL_Program {
return UCL_COMPILE_ERROR; return UCL_COMPILE_ERROR;
} }
} }
return UCL_SUCCESS; return UCL_SUCCESS;
} }
/// Reference to the command queue stored with this program
inline command_queue & cq() {
  return _cq;
}
/// Change the default command queue associated with matrix /// Change the default command queue associated with matrix
@ -143,7 +143,7 @@ class UCL_Program {
private: private:
bool _init_done; bool _init_done;
cl_program _program; cl_program _program;
cl_device_id _device; cl_device_id _device;
cl_context _context; cl_context _context;
cl_command_queue _cq; cl_command_queue _cq;
}; };
@ -153,7 +153,7 @@ class UCL_Kernel {
public: public:
/// Construct a kernel with no function attached and no arguments
/** Starts as a 1-dimensional launch with zero block size and block count **/
UCL_Kernel()
  : _dimensions(1), _function_set(false), _num_args(0)
{
  _block_size[0]=0;
  _num_blocks[0]=0;
}

/// Construct a kernel and attach the named function from the program
inline UCL_Kernel(UCL_Program &program, const char *function)
  : _dimensions(1), _function_set(false), _num_args(0)
{
  _block_size[0]=0;
  _num_blocks[0]=0;
  set_function(program,function);
}
@ -178,48 +178,48 @@ class UCL_Kernel {
/** If not a device pointer, this must be repeated each time the argument /** If not a device pointer, this must be repeated each time the argument
* changes **/ * changes **/
template <class dtype> template <class dtype>
inline void set_arg(const cl_uint index, const dtype * const arg) { inline void set_arg(const cl_uint index, const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
if (index>_num_args) { if (index>_num_args) {
_num_args=index; _num_args=index;
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) { if (_num_args>_kernel_info_nargs) {
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl; << _kernel_info_name << std::endl;
assert(0==1); assert(0==1);
} }
#endif #endif
} }
} }
/// Set a geryon container as a kernel argument. /// Set a geryon container as a kernel argument.
template <class numtyp> template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg) inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); } { set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument. /// Set a geryon container as a kernel argument.
template <class numtyp> template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg) inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); } { set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument. /// Set a geryon container as a kernel argument.
template <class hosttype, class devtype> template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg) inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); } { set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument. /// Set a geryon container as a kernel argument.
template <class hosttype, class devtype> template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg) inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); } { set_arg(&arg->device.begin()); }
/// Add a kernel argument. /// Add a kernel argument.
template <class dtype> template <class dtype>
inline void add_arg(const dtype * const arg) { inline void add_arg(const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
_num_args++; _num_args++;
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) { if (_num_args>_kernel_info_nargs) {
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl; << _kernel_info_name << std::endl;
assert(0==1); assert(0==1);
} }
@ -228,31 +228,31 @@ class UCL_Kernel {
/// Add a geryon container as a kernel argument. /// Add a geryon container as a kernel argument.
template <class numtyp> template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg) inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); } { add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument. /// Add a geryon container as a kernel argument.
template <class numtyp> template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg) inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); } { add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument. /// Add a geryon container as a kernel argument.
template <class hosttype, class devtype> template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg) inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); } { add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument. /// Add a geryon container as a kernel argument.
template <class hosttype, class devtype> template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg) inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); } { add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block /// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added /** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/ \note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) { inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1; _dimensions=1;
_num_blocks[0]=num_blocks*block_size; _num_blocks[0]=num_blocks*block_size;
_block_size[0]=block_size; _block_size[0]=block_size;
} }
/// Set the number of thread blocks and the number of threads in each block /// Set the number of thread blocks and the number of threads in each block
@ -266,36 +266,36 @@ class UCL_Kernel {
/** \note This should be called before any arguments have been added /** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/ \note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) { const size_t block_size_x, const size_t block_size_y) {
_dimensions=2; _dimensions=2;
_num_blocks[0]=num_blocks_x*block_size_x; _num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x; _block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y; _num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y; _block_size[1]=block_size_y;
} }
/// Set the number of thread blocks and the number of threads in each block /// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added /** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/ \note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y, const size_t block_size_x, const size_t block_size_y,
command_queue &cq) command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);} {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block /// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added /** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/ \note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) { const size_t block_size_y, const size_t block_size_z) {
_dimensions=3; _dimensions=3;
const size_t num_blocks_z=1; const size_t num_blocks_z=1;
_num_blocks[0]=num_blocks_x*block_size_x; _num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x; _block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y; _num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y; _block_size[1]=block_size_y;
_num_blocks[2]=num_blocks_z*block_size_z; _num_blocks[2]=num_blocks_z*block_size_z;
_block_size[2]=block_size_z; _block_size[2]=block_size_z;
} }
/// Set the number of thread blocks and the number of threads in each block /// Set the number of thread blocks and the number of threads in each block
@ -305,13 +305,13 @@ class UCL_Kernel {
const size_t block_size_x, const size_t block_size_y, const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) { const size_t block_size_z, command_queue &cq) {
_cq=cq; _cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z); block_size_z);
} }
/// Run the kernel in the default command queue /// Run the kernel in the default command queue
inline void run(); inline void run();
/// Forget previously added arguments by resetting the argument counter to 0
inline void clear_args() {
  _num_args=0;
}
@ -320,7 +320,7 @@ class UCL_Kernel {
/// Replace the default command queue associated with this kernel
inline void cq(command_queue &cq_in) {
  _cq=cq_in;
}
#include "ucl_arg_kludge.h" #include "ucl_arg_kludge.h"
private: private:
cl_kernel _kernel; cl_kernel _kernel;
cl_program _program; cl_program _program;
@ -328,7 +328,7 @@ class UCL_Kernel {
size_t _block_size[3]; size_t _block_size[3];
size_t _num_blocks[3]; size_t _num_blocks[3];
bool _function_set; bool _function_set;
cl_command_queue _cq; // The default command queue for this kernel cl_command_queue _cq; // The default command queue for this kernel
unsigned _num_args; unsigned _num_args;
@ -348,7 +348,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
CL_SAFE_CALL(clRetainProgram(_program)); CL_SAFE_CALL(clRetainProgram(_program));
cl_int error_flag; cl_int error_flag;
_kernel=clCreateKernel(program._program,function,&error_flag); _kernel=clCreateKernel(program._program,function,&error_flag);
if (error_flag!=CL_SUCCESS) { if (error_flag!=CL_SUCCESS) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not find function: " << function std::cerr << "UCL Error: Could not find function: " << function
@ -357,7 +357,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
#endif #endif
return UCL_FUNCTION_NOT_FOUND; return UCL_FUNCTION_NOT_FOUND;
} }
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
_kernel_info_name=function; _kernel_info_name=function;
cl_uint nargs; cl_uint nargs;
@ -375,7 +375,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
#endif #endif
#endif #endif
return UCL_SUCCESS; return UCL_SUCCESS;
} }
void UCL_Kernel::run() { void UCL_Kernel::run() {

View File

@ -17,12 +17,12 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
/*! \file */ /*! \file */
#ifndef OCL_MAT_H #ifndef OCL_MAT_H
#define OCL_MAT_H #define OCL_MAT_H
@ -54,6 +54,6 @@ namespace ucl_opencl {
#include "ucl_print.h" #include "ucl_print.h"
#undef UCL_PRINT_ALLOW #undef UCL_PRINT_ALLOW
} // namespace ucl_opencl
#endif #endif

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -36,10 +36,10 @@ namespace ucl_opencl {
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
/// Triple of sizes (x, y, z) that can also be viewed as a size_t array.
/** Every component defaults to 1. The conversion operators expose the
  * object's own address as a pointer to size_t, so x, y and z are read
  * through it as consecutive elements. **/
struct ocl_kernel_dim {
  size_t x,y,z;

  ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1)
    : x(_x), y(_y), z(_z) {}

  /// Mutable view of the three components
  operator size_t * () {
    return reinterpret_cast<size_t *>(this);
  }

  /// Read-only view of the three components
  operator const size_t * () const {
    return reinterpret_cast<const size_t *>(this);
  }
};
typedef ocl_kernel_dim ucl_kernel_dim;
@ -53,13 +53,13 @@ typedef cl_mem device_ptr;
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
template <class mat_type, class copy_type> template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_int error_flag; cl_int error_flag;
cl_context context; cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL)); &context,NULL));
cl_mem_flags buffer_perm; cl_mem_flags buffer_perm;
cl_map_flags map_perm; cl_map_flags map_perm;
if (kind2==UCL_NOT_SPECIFIED) { if (kind2==UCL_NOT_SPECIFIED) {
@ -88,7 +88,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
else else
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
if (kind==UCL_READ_ONLY) { if (kind==UCL_READ_ONLY) {
#ifdef CL_VERSION_1_2 #ifdef CL_VERSION_1_2
buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY; buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
@ -102,9 +102,9 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
} else } else
map_perm=CL_MAP_READ | CL_MAP_WRITE; map_perm=CL_MAP_READ | CL_MAP_WRITE;
} }
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS) if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*) *mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
@ -125,7 +125,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags), CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
&orig_flags,NULL)); &orig_flags,NULL));
orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR; orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n, mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
*mat.host_ptr(), &error_flag); *mat.host_ptr(), &error_flag);
@ -135,7 +135,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
} }
template <class mat_type> template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_mem_flags buffer_perm; cl_mem_flags buffer_perm;
cl_map_flags map_perm; cl_map_flags map_perm;
@ -160,7 +160,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
cl_int error_flag; cl_int error_flag;
mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag); mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS) if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*) *mat.host_ptr() = (typename mat_type::data_type*)
@ -210,7 +210,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
map_perm=CL_MAP_READ | CL_MAP_WRITE; map_perm=CL_MAP_READ | CL_MAP_WRITE;
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS) if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*) *mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
@ -248,7 +248,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
else else
assert(0==1); assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS) if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
mat.cq()=cm.cq(); mat.cq()=cm.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@ -278,7 +278,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
assert(0==1); assert(0==1);
mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL, mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
&error_flag); &error_flag);
if (error_flag != CL_SUCCESS) if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
mat.cq()=dev.cq(); mat.cq()=dev.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@ -304,7 +304,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
if (dev.device_type()!=UCL_CPU && cols%256!=0) if (dev.device_type()!=UCL_CPU && cols%256!=0)
padded_cols+=256-cols%256; padded_cols+=256-cols%256;
pitch=padded_cols*sizeof(typename mat_type::data_type); pitch=padded_cols*sizeof(typename mat_type::data_type);
return _device_alloc(mat,dev,pitch*rows,kind); return _device_alloc(mat,dev,pitch*rows,kind);
} }
template <class mat_type> template <class mat_type>
@ -342,7 +342,7 @@ inline int _device_resize(mat_type &mat, const size_t n) {
else else
assert(0==1); assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS) if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
return UCL_SUCCESS; return UCL_SUCCESS;
} }
@ -380,7 +380,7 @@ inline int _device_resize(mat_type &mat, const size_t rows,
else else
assert(0==1); assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag); mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
if (error_flag != CL_SUCCESS) if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
return UCL_SUCCESS; return UCL_SUCCESS;
} }
@ -396,21 +396,21 @@ inline void _host_zero(void *ptr, const size_t n) {
inline void _ocl_build(cl_program &program, cl_device_id &device, inline void _ocl_build(cl_program &program, cl_device_id &device,
const char* options = "") { const char* options = "") {
clBuildProgram(program,1,&device,options,NULL,NULL); clBuildProgram(program,1,&device,options,NULL,NULL);
cl_build_status build_status; cl_build_status build_status;
CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status),&build_status, sizeof(cl_build_status),&build_status,
NULL)); NULL));
if (build_status == CL_SUCCESS) if (build_status == CL_SUCCESS)
return; return;
size_t ms; size_t ms;
CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0, CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
NULL, &ms)); NULL, &ms));
char build_log[ms]; char build_log[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms, CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms,
build_log, NULL)); build_log, NULL));
std::cerr << std::endl std::cerr << std::endl
<< "----------------------------------------------------------\n" << "----------------------------------------------------------\n"
<< " Error compiling OpenCL Program...\n" << " Error compiling OpenCL Program...\n"
@ -423,13 +423,13 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
cl_kernel &kernel, const char *function, cl_kernel &kernel, const char *function,
const char *options="") { const char *options="") {
cl_int error_flag; cl_int error_flag;
cl_program program=clCreateProgramWithSource(context,lines,source, cl_program program=clCreateProgramWithSource(context,lines,source,
NULL,&error_flag); NULL,&error_flag);
CL_CHECK_ERR(error_flag); CL_CHECK_ERR(error_flag);
_ocl_build(program,device,options); _ocl_build(program,device,options);
kernel=clCreateKernel(program,function,&error_flag); kernel=clCreateKernel(program,function,&error_flag);
CL_CHECK_ERR(error_flag); CL_CHECK_ERR(error_flag);
} }
template <class mat_type> template <class mat_type>
@ -452,17 +452,17 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
cl_device_id device; cl_device_id device;
CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES, CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES,
sizeof(cl_device_id),&device,NULL)); sizeof(cl_device_id),&device,NULL));
const char * szero[3]={ const char * szero[3]={
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void _device_zero(__global NUMTYP *a, const int offset)", "__kernel void _device_zero(__global NUMTYP *a, const int offset)",
" { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }" " { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
}; };
cl_kernel kzero; cl_kernel kzero;
_ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero", _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
_UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag()); _UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());
cl_int offset=mat.offset(); cl_int offset=mat.offset();
CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin())); CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset)); CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
@ -486,7 +486,7 @@ template<> struct _ucl_memcpy<2,2> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq, const size_t rows, cl_command_queue &cq,
const cl_bool block, const cl_bool block,
@ -504,7 +504,7 @@ template<> struct _ucl_memcpy<2,0> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq, const size_t rows, cl_command_queue &cq,
const cl_bool block, const cl_bool block,
@ -522,7 +522,7 @@ template<> struct _ucl_memcpy<2,1> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq, const size_t rows, cl_command_queue &cq,
const cl_bool block, const cl_bool block,
@ -540,7 +540,7 @@ template<> struct _ucl_memcpy<0,2> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq, const size_t rows, cl_command_queue &cq,
const cl_bool block, const cl_bool block,
@ -558,7 +558,7 @@ template<> struct _ucl_memcpy<1,2> {
assert(0==1); assert(0==1);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq, const size_t rows, cl_command_queue &cq,
const cl_bool block, const cl_bool block,
@ -587,9 +587,9 @@ template <> struct _ucl_memcpy<1,0> {
dst.begin(),0,NULL,NULL)); dst.begin(),0,NULL,NULL));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq, const size_t rows, cl_command_queue &cq,
const cl_bool block, const cl_bool block,
size_t dst_offset, size_t src_offset) { size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) { if (src.cbegin()==dst.cbegin()) {
@ -602,20 +602,20 @@ template <> struct _ucl_memcpy<1,0> {
#ifdef UCL_DBG_MEM_TRACE #ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 2NS\n"; std::cerr << "UCL_COPY 2NS\n";
#endif #endif
if (spitch==dpitch && dst.cols()==src.cols() && if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size()) src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset, CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
spitch*rows, spitch*rows,
(char *)dst.begin()+dst_offset,0,NULL, (char *)dst.begin()+dst_offset,0,NULL,
NULL)); NULL));
else else
for (size_t i=0; i<rows; i++) { for (size_t i=0; i<rows; i++) {
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,cols, CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,cols,
(char *)dst.begin()+dst_offset,0,NULL, (char *)dst.begin()+dst_offset,0,NULL,
NULL)); NULL));
src_offset+=spitch; src_offset+=spitch;
dst_offset+=dpitch; dst_offset+=dpitch;
} }
} }
}; };
@ -630,7 +630,7 @@ template <> struct _ucl_memcpy<0,1> {
#ifdef UCL_DBG_MEM_TRACE #ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3S\n"; std::cerr << "UCL_COPY 3S\n";
#endif #endif
return; return;
} }
#ifdef UCL_DBG_MEM_TRACE #ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3NS\n"; std::cerr << "UCL_COPY 3NS\n";
@ -639,9 +639,9 @@ template <> struct _ucl_memcpy<0,1> {
src.begin(),0,NULL,NULL)); src.begin(),0,NULL,NULL));
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq, const size_t rows, cl_command_queue &cq,
const cl_bool block, const cl_bool block,
size_t dst_offset, size_t src_offset) { size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) { if (src.cbegin()==dst.cbegin()) {
@ -649,12 +649,12 @@ template <> struct _ucl_memcpy<0,1> {
#ifdef UCL_DBG_MEM_TRACE #ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4S\n"; std::cerr << "UCL_COPY 4S\n";
#endif #endif
return; return;
} }
#ifdef UCL_DBG_MEM_TRACE #ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4NS\n"; std::cerr << "UCL_COPY 4NS\n";
#endif #endif
if (spitch==dpitch && dst.cols()==src.cols() && if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size()) src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset, CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
spitch*rows, spitch*rows,
@ -667,7 +667,7 @@ template <> struct _ucl_memcpy<0,1> {
NULL)); NULL));
src_offset+=spitch; src_offset+=spitch;
dst_offset+=dpitch; dst_offset+=dpitch;
} }
} }
}; };
@ -687,33 +687,33 @@ template <int mem1, int mem2> struct _ucl_memcpy {
#ifdef UCL_DBG_MEM_TRACE #ifdef UCL_DBG_MEM_TRACE
else std::cerr << "UCL_COPY 6S\n"; else std::cerr << "UCL_COPY 6S\n";
#endif #endif
if (block==CL_TRUE) ucl_sync(cq); if (block==CL_TRUE) ucl_sync(cq);
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq, const size_t rows, cl_command_queue &cq,
const cl_bool block, const cl_bool block,
size_t dst_offset, size_t src_offset) { size_t dst_offset, size_t src_offset) {
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) { if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
#ifdef UCL_DBG_MEM_TRACE #ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7NS\n"; std::cerr << "UCL_COPY 7NS\n";
#endif #endif
if (spitch==dpitch && dst.cols()==src.cols() && if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size()) src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset, CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
dst_offset,spitch*rows,0,NULL,NULL)); dst_offset,spitch*rows,0,NULL,NULL));
else else
for (size_t i=0; i<rows; i++) { for (size_t i=0; i<rows; i++) {
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(), CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),
src_offset,dst_offset,cols,0, src_offset,dst_offset,cols,0,
NULL,NULL)); NULL,NULL));
src_offset+=spitch; src_offset+=spitch;
dst_offset+=dpitch; dst_offset+=dpitch;
} }
} }
#ifdef UCL_DBG_MEM_TRACE #ifdef UCL_DBG_MEM_TRACE
else std::cerr << "UCL_COPY 7S\n"; else std::cerr << "UCL_COPY 7S\n";
#endif #endif
@ -736,8 +736,8 @@ inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
} }
template<class mat1, class mat2> template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows) { const size_t rows) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols, _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows,dst.cq(),CL_TRUE, rows,dst.cq(),CL_TRUE,
@ -745,15 +745,15 @@ inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
} }
template<class mat1, class mat2> template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols, const size_t spitch, const size_t cols,
const size_t rows,cl_command_queue &cq) { const size_t rows,cl_command_queue &cq) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols, _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows,cq,CL_FALSE, rows,cq,CL_FALSE,
dst.byteoff(),src.byteoff()); dst.byteoff(),src.byteoff());
} }
} // namespace ucl_opencl
#endif #endif

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -28,7 +28,7 @@
#include "ocl_mat.h" #include "ocl_mat.h"
namespace ucl_opencl { namespace ucl_opencl {
/// Class storing a texture reference /// Class storing a texture reference
class UCL_Texture { class UCL_Texture {
public: public:
@ -46,9 +46,9 @@ class UCL_Texture {
/// Unbind the texture reference from the memory allocation /// Unbind the texture reference from the memory allocation
inline void unbind() { } inline void unbind() { }
/// Make a texture reference available to kernel /// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) { } inline void allow(UCL_Kernel &kernel) { }
private: private:
friend class UCL_Kernel; friend class UCL_Kernel;
}; };

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -67,33 +67,33 @@ class UCL_Timer {
clRetainCommandQueue(_cq); clRetainCommandQueue(_cq);
_initialized=true; _initialized=true;
} }
/// Start timing on default command queue /// Start timing on default command queue
inline void start() { UCL_OCL_MARKER(_cq,&start_event); } inline void start() { UCL_OCL_MARKER(_cq,&start_event); }
/// Stop timing on default command queue /// Stop timing on default command queue
inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); } inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); }
/// Block until the start event has been reached on device /// Block until the start event has been reached on device
inline void sync_start() inline void sync_start()
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); } { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
/// Block until the stop event has been reached on device /// Block until the stop event has been reached on device
inline void sync_stop() inline void sync_stop()
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); } { CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
/// Set the time elapsed to zero (not the total_time) /// Set the time elapsed to zero (not the total_time)
inline void zero() inline void zero()
{ UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); } { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
/// Set the total time to zero /// Set the total time to zero
inline void zero_total() { _total_time=0.0; } inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total /// Add time from previous start and stop to total
/** Forces synchronization **/ /** Forces synchronization **/
inline double add_to_total() inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; } { double t=time(); _total_time+=t; return t/1000.0; }
/// Add a user specified time to the total (ms) /// Add a user specified time to the total (ms)
inline void add_time_to_total(const double t) { _total_time+=t; } inline void add_time_to_total(const double t) { _total_time+=t; }
@ -107,12 +107,12 @@ class UCL_Timer {
CL_SAFE_CALL(clGetEventProfilingInfo(start_event, CL_SAFE_CALL(clGetEventProfilingInfo(start_event,
CL_PROFILING_COMMAND_END, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &tstart, NULL)); sizeof(cl_ulong), &tstart, NULL));
return (tend-tstart)*t_factor; return (tend-tstart)*t_factor;
} }
/// Return the time (s) of last start to stop - Forces synchronization /// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; } inline double seconds() { return time()/1000.0; }
/// Return the total time in ms /// Return the total time in ms
inline double total_time() { return _total_time; } inline double total_time() { return _total_time; }

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -38,47 +38,47 @@
template <class t1, class t2, class t3, class t4, class t5> template <class t1, class t2, class t3, class t4, class t5>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
class t6> class t6>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) { t6 *a6) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a6);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7> class t6, class t7>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) { t6 *a6, t7 *a7) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a6); add_arg(a7);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8> class t6, class t7, class t8>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) { t6 *a6, t7 *a7, t8 *a8) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a6); add_arg(a7); add_arg(a8);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9> class t6, class t7, class t8, class t9>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) { t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10> class t6, class t7, class t8, class t9, class t10>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -87,9 +87,9 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) { t11 *a11) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a11);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -98,8 +98,8 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) { t11 *a11, t12 *a12) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a11); add_arg(a12);
} }
@ -109,9 +109,9 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) { t11 *a11, t12 *a12, t13 *a13) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a11); add_arg(a12); add_arg(a13);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -120,9 +120,9 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) { t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -131,9 +131,9 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) { t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -144,10 +144,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) { t16 *a16) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a16);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -158,10 +158,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) { t16 *a16, t17 *a17) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a16); add_arg(a17);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -172,10 +172,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) { t16 *a16, t17 *a17, t18 *a18) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a16); add_arg(a17); add_arg(a18);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -186,10 +186,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) { t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -200,10 +200,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) { t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -216,10 +216,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) { t21 *a21) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a21);
} }
@ -233,10 +233,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) { t21 *a21, t22 *a22) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a21); add_arg(a22);
} }
@ -250,10 +250,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) { t21 *a21, t22 *a22, t23 *a23) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a21); add_arg(a22); add_arg(a23);
} }
@ -267,10 +267,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) { t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
} }
@ -284,11 +284,11 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
} }
template <class t1, class t2, class t3, class t4, class t5, template <class t1, class t2, class t3, class t4, class t5,
@ -303,11 +303,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) { t26 *a26) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a26);
} }
@ -323,11 +323,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) { t26 *a26, t27 *a27) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a26); add_arg(a27);
} }
@ -343,11 +343,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) { t26 *a26, t27 *a27, t28 *a28) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a26); add_arg(a27); add_arg(a28);
} }
@ -363,11 +363,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) { t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
} }
@ -383,12 +383,12 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
} }
@ -425,7 +425,7 @@
template <class t1, class t2, class t3, class t4, class t5> template <class t1, class t2, class t3, class t4, class t5>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
run(); run();
} }
@ -434,8 +434,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) { t6 *a6) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a6);
run(); run();
} }
@ -444,8 +444,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) { t6 *a6, t7 *a7) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a6); add_arg(a7);
run(); run();
} }
@ -454,8 +454,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) { t6 *a6, t7 *a7, t8 *a8) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a6); add_arg(a7); add_arg(a8);
run(); run();
} }
@ -464,8 +464,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) { t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
run(); run();
} }
@ -474,8 +474,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
run(); run();
} }
@ -486,9 +486,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) { t11 *a11) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a11);
run(); run();
} }
@ -499,8 +499,8 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) { t11 *a11, t12 *a12) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a11); add_arg(a12);
run(); run();
} }
@ -512,9 +512,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) { t11 *a11, t12 *a12, t13 *a13) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a11); add_arg(a12); add_arg(a13);
run(); run();
} }
@ -525,9 +525,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) { t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
run(); run();
} }
@ -538,9 +538,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) { t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
run(); run();
} }
@ -553,10 +553,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) { t16 *a16) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a16);
run(); run();
} }
@ -569,10 +569,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) { t16 *a16, t17 *a17) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a16); add_arg(a17);
run(); run();
} }
@ -585,10 +585,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) { t16 *a16, t17 *a17, t18 *a18) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a16); add_arg(a17); add_arg(a18);
run(); run();
} }
@ -601,10 +601,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) { t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
run(); run();
} }
@ -617,10 +617,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) { t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
run(); run();
} }
@ -635,10 +635,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) { t21 *a21) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a21);
run(); run();
} }
@ -654,10 +654,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) { t21 *a21, t22 *a22) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a21); add_arg(a22);
run(); run();
} }
@ -673,10 +673,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) { t21 *a21, t22 *a22, t23 *a23) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a21); add_arg(a22); add_arg(a23);
run(); run();
} }
@ -692,10 +692,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) { t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
run(); run();
} }
@ -711,11 +711,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
run(); run();
} }
@ -732,11 +732,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) { t26 *a26) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a26);
run(); run();
} }
@ -754,11 +754,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) { t26 *a26, t27 *a27) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a26); add_arg(a27);
run(); run();
} }
@ -776,12 +776,12 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) { t26 *a26, t27 *a27, t28 *a28) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a26); add_arg(a27); add_arg(a28);
run(); run();
} }
@ -798,11 +798,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) { t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
run(); run();
} }
@ -820,11 +820,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
clear_args(); clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run(); run();
} }

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -52,10 +52,10 @@
/// Base class for vector/matrix containers /// Base class for vector/matrix containers
/** All containers are associated with a default command queue. /** All containers are associated with a default command queue.
* For CUDA, this is the default stream. * For CUDA, this is the default stream.
* *
  * The default queue is used for asynchronous operations on the container   * The default queue is used for asynchronous operations on the container
* that do not specify a queue. For OpenCL, this queue is also used in * that do not specify a queue. For OpenCL, this queue is also used in
* calls for reserving and copying memory **/ * calls for reserving and copying memory **/
class UCL_BaseMat { class UCL_BaseMat {
public: public:
UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { } UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { }
@ -68,8 +68,8 @@ class UCL_BaseMat {
inline void sync() { ucl_sync(_cq); } inline void sync() { ucl_sync(_cq); }
/// Return the type/permissions of memory allocation /// Return the type/permissions of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
* or UCL_VIEW **/ * or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; } inline enum UCL_MEMOPT kind() const { return _kind; }
inline bool shared_mem_device() { inline bool shared_mem_device() {
#ifdef _OCL_MAT #ifdef _OCL_MAT
@ -79,12 +79,12 @@ class UCL_BaseMat {
cl_device_type device_type; cl_device_type device_type;
CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
sizeof(device_type),&device_type,NULL)); sizeof(device_type),&device_type,NULL));
return _shared_mem_device(device_type); return _shared_mem_device(device_type);
#else #else
return false; return false;
#endif #endif
} }
protected: protected:
command_queue _cq; command_queue _cq;
enum UCL_MEMOPT _kind; enum UCL_MEMOPT _kind;

View File

@ -17,33 +17,33 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
/*************************************************************************** /***************************************************************************
The ucl_copy and ucl_cast_copy routines provide a general prototype for The ucl_copy and ucl_cast_copy routines provide a general prototype for
copying data between host and device memory (including texture memory) copying data between host and device memory (including texture memory)
for the matrix and vector types in nvc_memory. for the matrix and vector types in nvc_memory.
For host/host and host/device transfers, typecasting is performed For host/host and host/device transfers, typecasting is performed
automatically as necessary. automatically as necessary.
The routines are written so that all branches can be removed by the The routines are written so that all branches can be removed by the
compiler during template instantiation. compiler during template instantiation.
The routines currently assume row-major ordering for all types. The routines currently assume row-major ordering for all types.
 For asynchronous copy in the default command queue, set async to boolean true. For asynchronous copy in the default command queue, set async to boolean true.
 For asynchronous copy in a specified command queue, pass that command queue as async. For asynchronous copy in a specified command queue, pass that command queue as async.
 Otherwise, set async to boolean false. Otherwise, set async to boolean false.
When performing frequent data copies that require casting, it is more When performing frequent data copies that require casting, it is more
efficient to allocate a casting buffer once and then pass that buffer efficient to allocate a casting buffer once and then pass that buffer
to the copy routine. This can be accomplished with the ucl_cast_copy to the copy routine. This can be accomplished with the ucl_cast_copy
routines. routines.
Examples Examples
(x's represent alignment padding - to maintain alignment) (x's represent alignment padding - to maintain alignment)
(o's represent a larger matrix in memory) (o's represent a larger matrix in memory)
(vectors represented as single row) (vectors represented as single row)
@ -51,18 +51,18 @@
dst src command dst src command
---------------------------------------------------------------- ----------------------------------------------------------------
0 1 2 3 4 <-- 0 1 2 3 4 ucl_copy(dst,src,async) 0 1 2 3 4 <-- 0 1 2 3 4 ucl_copy(dst,src,async)
0 1 2 3 <-- 0 1 2 3 4 ucl_copy(dst,src,4,async) 0 1 2 3 <-- 0 1 2 3 4 ucl_copy(dst,src,4,async)
0 1 2 <-- 0 1 2 3 4 5 ucl_copy(dst,src,async) 0 1 2 <-- 0 1 2 3 4 5 ucl_copy(dst,src,async)
3 4 5 3 4 5
0 1 2 3 4 5 <-- 0 1 2 ucl_copy(dst,src,async) 0 1 2 3 4 5 <-- 0 1 2 ucl_copy(dst,src,async)
3 4 5 3 4 5
0 1 2 <-- 0 1 2 ucl_copy(dst,src,async) 0 1 2 <-- 0 1 2 ucl_copy(dst,src,async)
3 4 5 3 4 5 3 4 5 3 4 5
0 1 2 <-- 0 1 2 ucl_copy(dst,src,6,async) 0 1 2 <-- 0 1 2 ucl_copy(dst,src,6,async)
3 4 5 3 4 5 3 4 5 3 4 5
5 6 7 5 6 7
@ -70,33 +70,33 @@
0 1 2 <-- 0 1 2 3 ucl_copy(dst,src,2,3,async) 0 1 2 <-- 0 1 2 3 ucl_copy(dst,src,2,3,async)
4 5 6 4 5 6 7 4 5 6 4 5 6 7
8 9 10 11 8 9 10 11
0 1 2 x x <-- 0 1 2 ucl_copy(dst,src,async) 0 1 2 x x <-- 0 1 2 ucl_copy(dst,src,async)
3 4 5 x x 3 4 5 3 4 5 x x 3 4 5
0 1 2 <-- 0 1 2 x x ucl_copy(dst,src,async) 0 1 2 <-- 0 1 2 x x ucl_copy(dst,src,async)
3 4 5 3 4 5 x x 3 4 5 3 4 5 x x
0 1 2 o o <-- 0 1 2 ucl_copy(dst,src,2,3,async) 0 1 2 o o <-- 0 1 2 ucl_copy(dst,src,2,3,async)
3 4 5 o o 3 4 5 3 4 5 o o 3 4 5
o o o o o o o o o o
0 1 2 o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,3,async) 0 1 2 o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,3,async)
3 4 5 o o 3 4 5 o o
o o o o o o o o o o
0 1 o o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,2,async) 0 1 o o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,2,async)
2 3 o o o 2 3 o o o
o o o o o o o o o o
0 1 2 o o <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async) 0 1 2 o o <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async)
5 6 7 o o 5 6 7 8 9 5 6 7 o o 5 6 7 8 9
o o o o o 10 11 12 13 14 o o o o o 10 11 12 13 14
0 1 2 5 6 7 <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async) 0 1 2 5 6 7 <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async)
5 6 7 8 9 5 6 7 8 9
10 11 12 13 14 10 11 12 13 14
***************************************************************************/ ***************************************************************************/
// Only allow this file to be included by nvc_memory.h and ocl_memory.h // Only allow this file to be included by nvc_memory.h and ocl_memory.h
@ -124,7 +124,7 @@ inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
assert(0==1); assert(0==1);
} }
} }
} }
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES // - HOST-HOST COPY ROUTINES
@ -182,7 +182,7 @@ template <> struct _host_host_copy<1,1> {
return; return;
} }
#endif #endif
#ifdef UCL_DBG_MEM_TRACE #ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 8NS\n"; std::cerr << "UCL_COPY 8NS\n";
#endif #endif
@ -212,7 +212,7 @@ template <int host_t1, int host_t2> struct _host_host_copy {
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) { const size_t cols) {
assert(0==1); assert(0==1);
} }
}; };
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
@ -242,20 +242,20 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) { const size_t cols, mat3 &cast_buffer) {
// Asynchronous currently pointless here // Asynchronous currently pointless here
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif #endif
if (mat1::VECTOR) { if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows); src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
for (size_t i=0; i<rows*cols; i++) for (size_t i=0; i<rows*cols; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]); dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
} else { } else {
if (mat2::VECTOR) if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type), cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows); cols*sizeof(typename mat2::data_type),rows);
@ -276,23 +276,23 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
} }
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer, const size_t cols, mat3 &cast_buffer,
command_queue &cq) { command_queue &cq) {
// Asynchronous currently pointless here // Asynchronous currently pointless here
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif #endif
if (mat1::VECTOR) { if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq); src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq);
cast_buffer.sync(); cast_buffer.sync();
for (size_t i=0; i<rows*cols; i++) for (size_t i=0; i<rows*cols; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]); dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
} else { } else {
if (mat2::VECTOR) if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type), cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows,cq); cols*sizeof(typename mat2::data_type),rows,cq);
@ -338,7 +338,7 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
if (mat3::VECTOR==0) { if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols); assert(dst.rows()>=rows && dst.cols()>=cols);
} }
@ -404,9 +404,9 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
if (mat3::VECTOR==0) { if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols); assert(dst.rows()>=rows && dst.cols()>=cols);
} }
@ -472,23 +472,23 @@ template <> struct _ucl_cast_copy<1,1> {
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) { mat3 &cast_buffer, command_queue &cq) {
assert(0==1); assert(0==1);
} }
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) { mat3 &cast_buffer) {
assert(0==1); assert(0==1);
} }
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) { const size_t cols, mat3 &cast_buffer) {
assert(0==1); assert(0==1);
} }
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer, const size_t cols, mat3 &cast_buffer,
command_queue &cq) { command_queue &cq) {
assert(0==1); assert(0==1);
} }
}; };
@ -497,23 +497,23 @@ template <> struct _ucl_cast_copy<0,0> {
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) { mat3 &cast_buffer, command_queue &cq) {
assert(0==1); assert(0==1);
} }
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) { mat3 &cast_buffer) {
assert(0==1); assert(0==1);
} }
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) { const size_t cols, mat3 &cast_buffer) {
assert(0==1); assert(0==1);
} }
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer, const size_t cols, mat3 &cast_buffer,
command_queue &cq) { command_queue &cq) {
assert(0==1); assert(0==1);
} }
}; };
@ -525,7 +525,7 @@ template <> struct _ucl_cast_copy<0,0> {
/** \param numel Number of elements (not bytes) to copy /** \param numel Number of elements (not bytes) to copy
* \param cast_buffer Buffer on host with enough storage for casting * \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is performed   * - If the data types for the two matrices are the same, no cast is performed
* - Padding for 2D matrices is not considered in this routine. * - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/ * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
@ -551,7 +551,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
* \param async Perform non-blocking copy on default stream * \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting * \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is performed   * - If the data types for the two matrices are the same, no cast is performed
* - Padding for 2D matrices is not considered in this routine. * - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/ * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
@ -580,7 +580,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
* buffer is created for copy. When multiple casts occur, it is * buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can * more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine. * be passed to an alternative copy routine.
* - Padding for 2D matrices is not considered in this routine. * - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/ * - Currently does not handle textures **/
template <class mat1, class mat2> template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
@ -593,7 +593,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
#endif #endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel); _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) { if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer; UCL_H_Vec<typename mat2::data_type> cast_buffer;
@ -606,8 +606,8 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel, _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq); cast_buffer,cq);
} }
} else } else
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq); ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
} }
/// Copy matrix/vector (memory already allocated) /// Copy matrix/vector (memory already allocated)
@ -619,7 +619,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
* buffer is created for copy. When multiple casts occur, it is * buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can * more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine. * be passed to an alternative copy routine.
* - Padding for 2D matrices is not considered in this routine. * - Padding for 2D matrices is not considered in this routine.
* - The default stream is used for asynchronous copy * - The default stream is used for asynchronous copy
* - Currently does not handle textures **/ * - Currently does not handle textures **/
template <class mat1, class mat2> template <class mat1, class mat2>
@ -648,7 +648,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
cast_buffer); cast_buffer);
} }
} else } else
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type)); ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
} }
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
@ -659,11 +659,11 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
/** \param async Perform non-blocking copy on default stream /** \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting * \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy * - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix * - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy * - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix * - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types for the two matrices are same, no cast performed * - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine. * - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed * - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/ * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
@ -686,16 +686,16 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer) /// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting /** \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy * - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix * - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy * - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix * - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types for the two matrices are same, no cast performed * - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine. * - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed * - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/ * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer, const size_t cols, mat3 &cast_buffer,
command_queue &cq) { command_queue &cq) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,rows,cols,cq); ucl_copy(dst,src,rows,cols,cq);
@ -710,11 +710,11 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
/// Asynchronous copy of subset matrix rows,cols (memory already allocated) /// Asynchronous copy of subset matrix rows,cols (memory already allocated)
/** - If src is a vector, routine assumes row-major rows by cols copy /** - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix * - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy * - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix * - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types of the two matrices are not the same, * - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is * casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary * not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is * buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can * more efficient to create a permanent casting buffer that can
@ -730,7 +730,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
#endif #endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols); _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) { if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer; UCL_H_Vec<typename mat2::data_type> cast_buffer;
@ -773,9 +773,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
/// Copy subset of matrix rows,cols (memory already allocated) /// Copy subset of matrix rows,cols (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy) /** \param async Perform non-blocking copy (ignored for host to host copy)
* - If src is a vector, routine assumes row-major rows by cols copy * - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix * - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy * - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix * - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types of the two matrices are not the same, * - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is * casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary * not device to device. For host/device transfers, a temporary
@ -796,7 +796,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
ucl_copy(dst,src,rows,cols,dst.cq()); ucl_copy(dst,src,rows,cols,dst.cq());
else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols); _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) { if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer; UCL_H_Vec<typename mat2::data_type> cast_buffer;
@ -846,7 +846,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
* \param cast_buffer Buffer on host with enough storage for casting * \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed * - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data * - The number of bytes copied is determined by entire src data
* - Padding for 2D matrices is not considered in this routine. * - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed * - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/ * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
@ -866,7 +866,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
/** \param cast_buffer Buffer on host with enough storage for casting /** \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed * - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data * - The number of bytes copied is determined by entire src data
* - Padding for 2D matrices is not considered in this routine. * - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed * - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/ * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3> template <class mat1, class mat2, class mat3>
@ -885,7 +885,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
/// Asynchronous copy of matrix/vector (memory already allocated) /// Asynchronous copy of matrix/vector (memory already allocated)
/** - The number of bytes copied is determined by entire src data /** - The number of bytes copied is determined by entire src data
* - If the data types of the two matrices are not the same, * - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is * casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary * not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is * buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can * more efficient to create a permanent casting buffer that can
@ -924,7 +924,7 @@ template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) { inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
if (async) if (async)
ucl_copy(dst,src,dst.cq()); ucl_copy(dst,src,dst.cq());
else if (dst.row_bytes()==src.row_bytes() && else if (dst.row_bytes()==src.row_bytes() &&
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW && src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,src.row_size()*src.rows(),async); ucl_copy(dst,src,src.row_size()*src.rows(),async);

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -37,23 +37,23 @@ class UCL_D_Mat : public UCL_BaseMat {
ROW_MAJOR = 1, ROW_MAJOR = 1,
VECTOR = 0 VECTOR = 0
}; };
typedef numtyp data_type; typedef numtyp data_type;
/// Construct an empty matrix without allocating device memory
/** Every size member starts at zero so that queries made before the
  * first alloc() or view() (for example the comparisons in resize_ib())
  * never read an indeterminate value. **/
UCL_D_Mat() : _pitch(0), _row_size(0), _rows(0), _cols(0) {}

/// Hand the object to _device_free() on destruction
~UCL_D_Mat() { _device_free(*this); }

/// Construct with specified rows and cols
/** The size members are zeroed before alloc() runs, so they hold defined
  * values regardless of how alloc() finishes.
  * \sa alloc() **/
UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
          const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
  _pitch(0), _row_size(0), _rows(0), _cols(0)
  { alloc(rows,cols,device,kind); }
/// Row major matrix on device /// Row major matrix on device
/** The kind parameter controls memory optimizations as follows: /** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels
* \param cq Default command queue for operations copied from another mat * \param cq Default command queue for operations copied from another mat
* \note - Coalesced access using adjacent cols on same row * \note - Coalesced access using adjacent cols on same row
* UCL_D_Mat(row,col) given by array[row*row_size()+col] * UCL_D_Mat(row,col) given by array[row*row_size()+col]
* \return UCL_SUCCESS if the memory allocation is successful **/ * \return UCL_SUCCESS if the memory allocation is successful **/
@ -65,7 +65,7 @@ class UCL_D_Mat : public UCL_BaseMat {
int err=_device_alloc(*this,cq,rows,cols,_pitch,kind); int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
if (err!=UCL_SUCCESS) { if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n"; << rows*cols*sizeof(numtyp) << " bytes on device.\n";
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
#endif #endif
@ -82,9 +82,9 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT #ifdef _OCL_MAT
_offset=0; _offset=0;
#endif #endif
return err; return err;
} }
/// Row major matrix on device /// Row major matrix on device
/** The kind parameter controls memory optimizations as follows: /** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_READ_WRITE - Specify that you will read and write in kernels
@ -118,15 +118,15 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT #ifdef _OCL_MAT
_offset=0; _offset=0;
#endif #endif
return err; return err;
} }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ucl_type> template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols, inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { const size_t stride) {
@ -145,7 +145,7 @@ class UCL_D_Mat : public UCL_BaseMat {
#else #else
_device_view(&_array,input.begin()); _device_view(&_array,input.begin());
#endif #endif
#ifndef _UCL_DEVICE_PTR_MAT #ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols; _end=_array+_cols;
#endif #endif
@ -157,39 +157,39 @@ class UCL_D_Mat : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/ * allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
  // The row length reported by the source container becomes the stride.
  view(input,rows,cols,input.row_size());
}
/// View an existing Geryon allocation as a single row of cols elements
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols) {
  // Forward to the rows,cols overload with a row count of one.
  view(input,1,cols);
}
/// View an existing Geryon allocation using the input's own rows and cols
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view **/
template <class ucl_type>
inline void view(ucl_type &input) {
  // Dimensions are taken directly from the container being viewed.
  view(input,input.rows(),input.cols());
}
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ptr_type> template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols, inline void view(ptr_type input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) { const size_t stride, UCL_Device &dev) {
clear(); clear();
_kind=UCL_VIEW; _kind=UCL_VIEW;
_cols=cols; _cols=cols;
@ -215,7 +215,7 @@ class UCL_D_Mat : public UCL_BaseMat {
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
                 UCL_Device &dev) {
  // No explicit stride given: cols is passed as the stride as well.
  view(input,rows,cols,cols,dev);
}
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
@ -223,13 +223,13 @@ class UCL_D_Mat : public UCL_BaseMat {
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev) {
  // Forward to the rows,cols overload with a row count of one.
  view(input,1,cols,dev);
}
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ucl_type> template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) { const size_t cols, const size_t stride) {
@ -248,7 +248,7 @@ class UCL_D_Mat : public UCL_BaseMat {
#else #else
_device_view(&_array,input.begin(),offset,sizeof(numtyp)); _device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif #endif
#ifndef _UCL_DEVICE_PTR_MAT #ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols; _end=_array+_cols;
#endif #endif
@ -261,45 +261,45 @@ class UCL_D_Mat : public UCL_BaseMat {
* allocating container when using CUDA APIs **/ * allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                        const size_t cols) {
  // The row length reported by the source container becomes the stride.
  view_offset(offset,input,rows,cols,input.row_size());
}
/// View a single row of cols elements starting offset elements into input
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) {
  // Forward to the rows,cols overload with a row count of one.
  view_offset(offset,input,1,cols);
}
/// View the remainder of an existing Geryon allocation after an offset
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view
  * - For a single-row input the view has cols()-offset columns; otherwise
  *   the row count is reduced by offset/row_size() and cols() is kept **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) {
  if (input.rows()!=1) {
    // Multi-row input: drop the whole rows covered by the offset.
    view_offset(offset,input,input.rows()-offset/input.row_size(),
                input.cols());
    return;
  }
  // Single-row input: shorten the row by the offset.
  view_offset(offset,input,1,input.cols()-offset);
}
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ptr_type> template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows, inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols,const size_t stride, const size_t cols,const size_t stride,
UCL_Device &dev) { UCL_Device &dev) {
clear(); clear();
_kind=UCL_VIEW; _kind=UCL_VIEW;
_cols=cols; _cols=cols;
@ -307,7 +307,7 @@ class UCL_D_Mat : public UCL_BaseMat {
_pitch=stride*sizeof(numtyp); _pitch=stride*sizeof(numtyp);
_row_size=stride; _row_size=stride;
this->_cq=dev.cq(); this->_cq=dev.cq();
#ifdef _OCL_MAT #ifdef _OCL_MAT
_array=input; _array=input;
_offset=offset; _offset=offset;
@ -320,7 +320,7 @@ class UCL_D_Mat : public UCL_BaseMat {
_array=input+offset; _array=input+offset;
#endif #endif
#endif #endif
#ifndef _UCL_DEVICE_PTR_MAT #ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols; _end=_array+_cols;
#endif #endif
@ -332,20 +332,20 @@ class UCL_D_Mat : public UCL_BaseMat {
* allocating container when using CUDA APIs **/ * allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                        const size_t cols, UCL_Device &dev) {
  // No explicit stride given: cols is passed as the stride as well.
  view_offset(offset,input,rows,cols,cols,dev);
}
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/ * allocating container when using CUDA APIs **/
template <class ptr_type> template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input, inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev) const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,1,cols,dev); } { view_offset(offset,input,1,cols,dev); }
/// Free memory and set size to 0 /// Free memory and set size to 0
inline void clear() inline void clear()
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; } { _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain rows x cols elements /// Resize the allocation to contain rows x cols elements
@ -356,7 +356,7 @@ class UCL_D_Mat : public UCL_BaseMat {
int err=_device_resize(*this,rows,cols,_pitch); int err=_device_resize(*this,rows,cols,_pitch);
if (err!=UCL_SUCCESS) { if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n"; << rows*cols*sizeof(numtyp) << " bytes on device.\n";
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
#endif #endif
@ -372,13 +372,13 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT #ifdef _OCL_MAT
_offset=0; _offset=0;
#endif #endif
return err; return err;
} }
/// Resize (only if bigger) the allocation to contain rows x cols elements /// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/ /** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols) inline int resize_ib(const int rows, const int cols)
{ if (cols>_cols || rows>_rows) return resize(rows,cols); { if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; } else return UCL_SUCCESS; }
/// Set each element to zero asynchronously in the default command_queue /// Set each element to zero asynchronously in the default command_queue
@ -386,10 +386,10 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Set first n elements to zero asynchronously in the default command_queue
inline void zero(const int n) {
  // _cq is the queue stored on this object.
  zero(n,_cq);
}
/// Set each element to zero asynchronously
/** The byte count handed to _device_zero() is row_bytes()*_rows. **/
inline void zero(command_queue &cq) {
  _device_zero(*this,row_bytes()*_rows,cq);
}
/// Set first n elements to zero asynchronously /// Set first n elements to zero asynchronously
inline void zero(const int n, command_queue &cq) inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); } { _device_zero(*this,n*sizeof(numtyp),cq); }
@ -445,7 +445,7 @@ class UCL_D_Mat : public UCL_BaseMat {
inline size_t row_bytes() const {
  // The stored pitch is the per-row size in bytes.
  return _pitch;
}
/// Size in bytes of a single element
/** \return sizeof(numtyp), converted to int **/
inline int element_size() const {
  return sizeof(numtyp);
}
#ifdef _OCL_MAT #ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts /// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/ /** \note Always 0 for host matrices and CUDA APIs **/
@ -459,7 +459,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Offset in bytes from the begin() pointer to where the data starts
/** Computed as the element offset (offset()) times sizeof(numtyp).
  * \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const {
  return offset()*sizeof(numtyp);
}
private: private:
size_t _pitch, _row_size, _rows, _cols; size_t _pitch, _row_size, _rows, _cols;

View File

@ -17,14 +17,14 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers // Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW #ifdef _UCL_MAT_ALLOW
/// Row vector on device /// Row vector on device
template <class numtyp> template <class numtyp>
class UCL_D_Vec : public UCL_BaseMat { class UCL_D_Vec : public UCL_BaseMat {
public: public:
@ -37,7 +37,7 @@ class UCL_D_Vec : public UCL_BaseMat {
ROW_MAJOR = 1, ROW_MAJOR = 1,
VECTOR = 1 VECTOR = 1
}; };
typedef numtyp data_type; typedef numtyp data_type;
/// Construct an empty vector (zero columns, nothing allocated here)
UCL_D_Vec() : _cols(0) {
}
/// Destructor: hands the container to _device_free()
~UCL_D_Vec() {
  _device_free(*this);
}
@ -45,7 +45,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Construct a device vector with n columns
/** Starts from an empty state (_cols = 0) and then delegates to alloc().
  * \param n Number of columns (elements) to allocate
  * \param device Device passed through to alloc()
  * \param kind Memory option passed through to alloc()
  *             (defaults to UCL_READ_WRITE)
  * \sa alloc() **/
UCL_D_Vec(const size_t n, UCL_Device &device,
          const enum UCL_MEMOPT kind=UCL_READ_WRITE) : _cols(0) {
  alloc(n,device,kind);
}
/// Set up host vector with 'cols' columns and reserve memory /// Set up host vector with 'cols' columns and reserve memory
@ -58,7 +58,7 @@ class UCL_D_Vec : public UCL_BaseMat {
template <class mat_type> template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq, inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) { const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear(); clear();
_row_bytes=cols*sizeof(numtyp); _row_bytes=cols*sizeof(numtyp);
@ -82,8 +82,8 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT #ifdef _OCL_MAT
_offset=0; _offset=0;
#endif #endif
return err; return err;
} }
/// Set up host vector with 'cols' columns and reserve memory /// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory optimizations as follows: /** The kind parameter controls memory optimizations as follows:
@ -116,7 +116,7 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT #ifdef _OCL_MAT
_offset=0; _offset=0;
#endif #endif
return err; return err;
} }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
@ -142,18 +142,18 @@ class UCL_D_Vec : public UCL_BaseMat {
#else #else
_device_view(&_array,input.begin()); _device_view(&_array,input.begin());
#endif #endif
#ifndef _UCL_DEVICE_PTR_MAT #ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols; _end=_array+_cols;
#endif #endif
} }
/// View an existing Geryon allocation instead of allocating memory
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * \param input Container whose storage is viewed
  * \param rows Row count, forwarded to view(input,rows,cols)
  * \param cols Number of columns in the view
  * \param stride Number of _elements_ between the start of each row;
  *               accepted but not forwarded by this overload **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
                 const size_t stride) {
  view(input,rows,cols);
}
@ -162,24 +162,24 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view
  * \param input Container whose storage is viewed
  * \param cols Number of columns in the view; the row count passed on is 1 **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols) {
  view(input,1,cols);
}
/// View the whole of an existing Geryon allocation
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view
  * \param input Container to view; the column count of the view is
  *              input.rows()*input.row_size() **/
template <class ucl_type>
inline void view(ucl_type &input) {
  const size_t total_elements = input.rows()*input.row_size();
  view(input,total_elements);
}
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
@ -205,15 +205,15 @@ class UCL_D_Vec : public UCL_BaseMat {
CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#endif #endif
} }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ptr_type> template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols, inline void view(ptr_type input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); } { view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
@ -223,7 +223,7 @@ class UCL_D_Vec : public UCL_BaseMat {
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev) {
  // Single-row case: delegate to view(input,rows,cols,dev) with rows = 1.
  view(input,1,cols,dev);
}
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
@ -248,45 +248,45 @@ class UCL_D_Vec : public UCL_BaseMat {
#else #else
_device_view(&_array,input.begin(),offset,sizeof(numtyp)); _device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif #endif
#ifndef _UCL_DEVICE_PTR_MAT #ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols; _end=_array+_cols;
#endif #endif
} }
/// View an existing Geryon allocation, starting at an offset
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * \param offset Offset forwarded to view_offset(offset,input,rows,cols)
  * \param input Container whose storage is viewed
  * \param rows Row count, forwarded unchanged
  * \param cols Number of columns in the view
  * \param stride Number of _elements_ between the start of each row;
  *               accepted but not forwarded by this overload **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                        const size_t cols, const size_t stride) {
  view_offset(offset,input,rows,cols);
}
/// View cols elements of an existing Geryon allocation, starting at an offset
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view
  * \param offset Offset forwarded to view_offset(offset,input,rows,cols)
  * \param input Container whose storage is viewed
  * \param cols Number of columns in the view; the row count passed on is 1 **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) {
  view_offset(offset,input,1,cols);
}
/// View everything from an offset to the end of an existing Geryon allocation
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view
  * \param offset Offset into input; also subtracted from the element count
  * \param input Container to view; the view length is
  *              input.rows()*input.row_size()-offset **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) {
  const size_t remaining = input.rows()*input.row_size()-offset;
  view_offset(offset,input,remaining);
}
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
@ -302,7 +302,7 @@ class UCL_D_Vec : public UCL_BaseMat {
_cols=cols; _cols=cols;
_row_bytes=_cols*sizeof(numtyp); _row_bytes=_cols*sizeof(numtyp);
this->_cq=dev.cq(); this->_cq=dev.cq();
#ifdef _OCL_MAT #ifdef _OCL_MAT
_array=input; _array=input;
_offset=offset; _offset=offset;
@ -315,20 +315,20 @@ class UCL_D_Vec : public UCL_BaseMat {
_array=input+offset; _array=input+offset;
#endif #endif
#endif #endif
#ifndef _UCL_DEVICE_PTR_MAT #ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols; _end=_array+_cols;
#endif #endif
} }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ptr_type> template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows, inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev) const size_t cols,const size_t stride,UCL_Device &dev)
{ view_offset(offset,input,rows,cols,stride); } { view_offset(offset,input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
@ -336,12 +336,12 @@ class UCL_D_Vec : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/ * allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
                        const size_t cols, UCL_Device &dev) {
  // Single-row case: delegate to the rows/cols/dev overload with rows = 1.
  view_offset(offset,input,1,cols,dev);
}
/// Free memory and set size to 0 /// Free memory and set size to 0
inline void clear() inline void clear()
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; } { _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain cols elements /// Resize the allocation to contain cols elements
@ -369,9 +369,9 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT #ifdef _OCL_MAT
_offset=0; _offset=0;
#endif #endif
return err; return err;
} }
/// Resize (only if bigger) the allocation to contain cols elements /// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/ /** \note Cannot be used on views **/
inline int resize_ib(const int cols) inline int resize_ib(const int cols)
@ -384,7 +384,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Zero every element asynchronously
/** Clears row_bytes() bytes.
  * \param cq Command queue handed to _device_zero() **/
inline void zero(command_queue &cq) {
  _device_zero(*this,row_bytes(),cq);
}
/// Set first n elements to zero asynchronously /// Set first n elements to zero asynchronously
inline void zero(const int n, command_queue &cq) inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); } { _device_zero(*this,n*sizeof(numtyp),cq); }
#ifdef _UCL_DEVICE_PTR_MAT #ifdef _UCL_DEVICE_PTR_MAT
@ -402,7 +402,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// For CUDA-RT, device pointer to one past the last element
/** \return The stored _end pointer **/
inline numtyp * end() const {
  return _end;
}
#endif #endif
#ifdef _UCL_DEVICE_PTR_MAT #ifdef _UCL_DEVICE_PTR_MAT
/// Returns an API specific device pointer /// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object /** - For OpenCL, returns a &cl_mem object
@ -427,10 +427,10 @@ class UCL_D_Vec : public UCL_BaseMat {
inline const numtyp ** cbegin() const {
  // Hand back the address of the stored _array member.
  return &_array;
}
/// For CUDA-RT, allocate a row vector and bind it to a texture
/** Runs three steps in order: alloc(cols,dev), assign_texture(t), bind().
  * \param cols Number of columns to allocate
  * \param dev Device passed to alloc()
  * \param t Texture reference passed to assign_texture() **/
inline void safe_alloc(const size_t cols, UCL_Device &dev,
                       textureReference *t) {
  alloc(cols,dev);
  assign_texture(t);
  bind();
}
/// For CUDA-RT, remember the texture reference to use for this container
/** \param t Texture reference stored in _tex_ptr **/
inline void assign_texture(textureReference *t) {
  _tex_ptr=t;
}
/// For CUDA-RT, bind to texture /// For CUDA-RT, bind to texture
inline void bind() { inline void bind() {
cuda_gb_get_channel<numtyp>(_channel); cuda_gb_get_channel<numtyp>(_channel);
@ -456,7 +456,7 @@ class UCL_D_Vec : public UCL_BaseMat {
inline size_t row_bytes() const {
  // The size of the row in bytes is kept in _row_bytes.
  return _row_bytes;
}
/// Size in bytes of a single element
/** \return sizeof(numtyp), converted to int **/
inline int element_size() const {
  return sizeof(numtyp);
}
#ifdef _OCL_MAT #ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts /// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/ /** \note Always 0 for host matrices and CUDA APIs **/
@ -473,7 +473,7 @@ class UCL_D_Vec : public UCL_BaseMat {
private: private:
size_t _row_bytes, _row_size, _rows, _cols; size_t _row_bytes, _row_size, _rows, _cols;
#ifdef _UCL_DEVICE_PTR_MAT #ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array; device_ptr _array;
#else #else

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -37,21 +37,21 @@ class UCL_H_Mat : public UCL_BaseMat {
ROW_MAJOR = 1, ROW_MAJOR = 1,
VECTOR = 0 VECTOR = 0
}; };
typedef numtyp data_type; typedef numtyp data_type;
/// Construct an empty host matrix (zero columns)
UCL_H_Mat() : _cols(0) {
  #ifdef _OCL_MAT
  // OpenCL builds also start with a null cl_mem handle.
  _carray=(cl_mem)(0);
  #endif
}
/// Destructor: hands the container to _host_free()
~UCL_H_Mat() {
  _host_free(*this);
}
/// Construct with specied number of rows and columns /// Construct with specied number of rows and columns
/** \sa alloc() **/ /** \sa alloc() **/
UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device, UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); } { _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
/// Set up host matrix with specified # of rows/cols and reserve memory /// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows: /** The kind parameter controls memory pinning as follows:
* - UCL_READ_WRITE - Specify that you will read and write from host * - UCL_READ_WRITE - Specify that you will read and write from host
@ -74,7 +74,7 @@ class UCL_H_Mat : public UCL_BaseMat {
<< " bytes on host.\n"; << " bytes on host.\n";
_row_bytes=0; _row_bytes=0;
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
#endif #endif
_row_bytes=0; _row_bytes=0;
return err; return err;
} }
@ -84,7 +84,7 @@ class UCL_H_Mat : public UCL_BaseMat {
_kind=kind; _kind=kind;
_end=_array+rows*cols; _end=_array+rows*cols;
return err; return err;
} }
/// Set up host matrix with specified # of rows/cols and reserve memory /// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows: /** The kind parameter controls memory pinning as follows:
@ -117,15 +117,15 @@ class UCL_H_Mat : public UCL_BaseMat {
_kind=kind; _kind=kind;
_end=_array+rows*cols; _end=_array+rows*cols;
return err; return err;
} }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported * - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ucl_type> template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols, inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { const size_t stride) {
@ -149,45 +149,45 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - Viewing a device container on the host is not supported
  * \param input Container whose storage is viewed
  * \param rows Number of rows in the view
  * \param cols Number of columns in the view; the stride passed on is
  *             input.row_size() **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
  const size_t stride = input.row_size();
  view(input,rows,cols,stride);
}
/// View an existing Geryon allocation as a single row of cols elements
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view
  * - Viewing a device container on the host is not supported
  * \param input Container whose storage is viewed
  * \param cols Number of columns in the view; the row count passed on is 1 **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols) {
  view(input,1,cols);
}
/// View an existing Geryon allocation using its own dimensions
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - If a matrix is used as input, all elements (including padding)
  *   will be used for view when using CUDA APIs
  * - Viewing a device container on the host is not supported
  * \param input Container to view; input.rows() and input.cols() give the
  *              dimensions of the view **/
template <class ucl_type>
inline void view(ucl_type &input) {
  const size_t nrows = input.rows();
  const size_t ncols = input.cols();
  view(input,nrows,ncols);
}
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported * - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ptr_type> template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols, inline void view(ptr_type *input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) { const size_t stride, UCL_Device &dev) {
assert(rows==1 || stride==cols); assert(rows==1 || stride==cols);
clear(); clear();
_kind=UCL_VIEW; _kind=UCL_VIEW;
@ -197,40 +197,40 @@ class UCL_H_Mat : public UCL_BaseMat {
this->_cq=dev.cq(); this->_cq=dev.cq();
_array=input; _array=input;
_end=_array+_cols; _end=_array+_cols;
#ifdef _OCL_MAT #ifdef _OCL_MAT
_host_view(*this,dev,_row_bytes*rows); _host_view(*this,dev,_row_bytes*rows);
#endif #endif
} }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/ * - Viewing a device pointer on the host is not supported **/
template <class ptr_type> template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols, inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); } UCL_Device &dev) { view(input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/ * - Viewing a device pointer on the host is not supported **/
template <class ptr_type> template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); } { view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported * - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ucl_type> template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) { const size_t cols, const size_t stride) {
assert(rows==1 || stride==cols); assert(rows==1 || stride==cols);
clear(); clear();
_kind=UCL_VIEW; _kind=UCL_VIEW;
@ -244,81 +244,81 @@ class UCL_H_Mat : public UCL_BaseMat {
_host_view(*this,input,_row_bytes*_rows); _host_view(*this,input,_row_bytes*_rows);
#endif #endif
} }
/// View rows x cols elements of an existing Geryon allocation from an offset
/** This function must be passed a Geryon vector or matrix container.
  * No memory is freed when the object is destructed.
  * - The view does not prevent the memory from being freed by the
  *   allocating container when using CUDA APIs
  * - Viewing a device container on the host is not supported
  * \param offset Offset forwarded to the stride-taking overload
  * \param input Container whose storage is viewed
  * \param rows Number of rows in the view
  * \param cols Number of columns in the view; the stride passed on is
  *             input.row_size() **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                        const size_t cols) {
  const size_t stride = input.row_size();
  view_offset(offset,input,rows,cols,stride);
}
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding) * - If a matrix is used a input, all elements (including padding)
* will be used for view * will be used for view
* - Viewing a device container on the host is not supported **/ * - Viewing a device container on the host is not supported **/
template <class ucl_type> template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); } { view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding) * - If a matrix is used a input, all elements (including padding)
* will be used for view * will be used for view
* - Viewing a device container on the host is not supported **/ * - Viewing a device container on the host is not supported **/
template <class ucl_type> template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) { inline void view_offset(const size_t offset, ucl_type &input) {
if (input.rows()==1) if (input.rows()==1)
view_offset(offset,input,1,input.cols()-offset); view_offset(offset,input,1,input.cols()-offset);
else else
view_offset(offset,input,input.rows()-offset/input.row_size(), view_offset(offset,input,input.rows()-offset/input.row_size(),
input.cols()); input.cols());
} }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container * allocating container
* - Viewing a device pointer on the host is not supported **/ * - Viewing a device pointer on the host is not supported **/
template <class ptr_type> template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev) const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); } { view(input+offset,rows,cols,dev); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported * - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ptr_type> template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev) const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); } { view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/ * - Viewing a device pointer on the host is not supported **/
template <class ptr_type> template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input, inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev) const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); } { view(input+offset,1,cols,dev); }
/// Free memory and set size to 0 /// Free memory and set size to 0
inline void clear() inline void clear()
{ _host_free(*this); _cols=0; _kind=UCL_VIEW; } { _host_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to rows x cols elements /// Resize the allocation to rows x cols elements
/** \note Cannot be used on views **/ /** \note Cannot be used on views **/
@ -333,7 +333,7 @@ class UCL_H_Mat : public UCL_BaseMat {
<< " bytes on host.\n"; << " bytes on host.\n";
_row_bytes=0; _row_bytes=0;
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
#endif #endif
_row_bytes=0; _row_bytes=0;
return err; return err;
} }
@ -347,7 +347,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Resize (only if bigger) the allocation to contain rows x cols elements /// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/ /** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols) inline int resize_ib(const int rows, const int cols)
{ if (cols>_cols || rows>_rows) return resize(rows,cols); { if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; } else return UCL_SUCCESS; }
/// Set each element to zero /// Set each element to zero
@ -376,21 +376,21 @@ class UCL_H_Mat : public UCL_BaseMat {
inline size_t row_bytes() const { return _row_bytes; } inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element /// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); } inline int element_size() const { return sizeof(numtyp); }
/// Get element at index i /// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; } inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i /// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; } inline const numtyp & operator[](const int i) const { return _array[i]; }
/// 2D access (row should always be 0) /// 2D access (row should always be 0)
inline numtyp & operator()(const int row, const int col) inline numtyp & operator()(const int row, const int col)
{ return _array[row*_cols+col]; } { return _array[row*_cols+col]; }
/// 2D access (row should always be 0) /// 2D access (row should always be 0)
inline const numtyp & operator()(const int row, const int col) const inline const numtyp & operator()(const int row, const int col) const
{ return _array[row*_cols+col]; } { return _array[row*_cols+col]; }
/// Returns pointer to memory pointer for allocation on host /// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; } inline numtyp ** host_ptr() { return &_array; }
/// Return the offset (in elements) from begin() pointer where data starts /// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/ /** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; } inline size_t offset() const { return 0; }
@ -409,14 +409,14 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA) /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline const void ** cbegin() const { return (const void **)&_array; } inline const void ** cbegin() const { return (const void **)&_array; }
#endif #endif
private: private:
numtyp *_array, *_end; numtyp *_array, *_end;
size_t _row_bytes, _rows, _cols; size_t _row_bytes, _rows, _cols;
#ifdef _OCL_MAT #ifdef _OCL_MAT
device_ptr _carray; device_ptr _carray;
#endif #endif
}; };
#endif #endif

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -37,21 +37,21 @@ class UCL_H_Vec : public UCL_BaseMat {
ROW_MAJOR = 1, ROW_MAJOR = 1,
VECTOR = 1 VECTOR = 1
}; };
typedef numtyp data_type; typedef numtyp data_type;
UCL_H_Vec() : _cols(0) { UCL_H_Vec() : _cols(0) {
#ifdef _OCL_MAT #ifdef _OCL_MAT
_carray=(cl_mem)(0); _carray=(cl_mem)(0);
#endif #endif
} }
~UCL_H_Vec() { _host_free(*this); } ~UCL_H_Vec() { _host_free(*this); }
/// Construct with n columns /// Construct with n columns
/** \sa alloc() **/ /** \sa alloc() **/
UCL_H_Vec(const size_t n, UCL_Device &device, UCL_H_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); } { _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
/// Set up host vector with 'cols' columns and reserve memory /// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows: /** The kind parameter controls memory pinning as follows:
* - UCL_READ_WRITE - Specify that you will read and write from host * - UCL_READ_WRITE - Specify that you will read and write from host
@ -84,7 +84,7 @@ class UCL_H_Vec : public UCL_BaseMat {
_kind=kind; _kind=kind;
_end=_array+cols; _end=_array+cols;
return err; return err;
} }
/// Set up host vector with 'cols' columns and reserve memory /// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows: /** The kind parameter controls memory pinning as follows:
@ -108,7 +108,7 @@ class UCL_H_Vec : public UCL_BaseMat {
<< " bytes on host.\n"; << " bytes on host.\n";
_row_bytes=0; _row_bytes=0;
UCL_GERYON_EXIT; UCL_GERYON_EXIT;
#endif #endif
_row_bytes=0; _row_bytes=0;
return err; return err;
} }
@ -118,13 +118,13 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+cols; _end=_array+cols;
return err; return err;
} }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/ * - Viewing a device container on the host is not supported **/
template <class ucl_type> template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) { inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
@ -143,14 +143,14 @@ class UCL_H_Vec : public UCL_BaseMat {
CL_SAFE_CALL(clRetainCommandQueue(input.cq())); CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#endif #endif
} }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported * - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ucl_type> template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols, inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { view(input,rows,cols); } const size_t stride) { view(input,rows,cols); }
@ -159,31 +159,31 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding) * - If a matrix is used a input, all elements (including padding)
* will be used for view * will be used for view
* - Viewing a device container on the host is not supported **/ * - Viewing a device container on the host is not supported **/
template <class ucl_type> template <class ucl_type>
inline void view(ucl_type &input, const size_t cols) inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); } { view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container * allocating container
* - If a matrix is used a input, all elements (including padding) * - If a matrix is used a input, all elements (including padding)
* will be used for view * will be used for view
* - Viewing a device container on the host is not supported **/ * - Viewing a device container on the host is not supported **/
template <class ucl_type> template <class ucl_type>
inline void view(ucl_type &input) inline void view(ucl_type &input)
{ view(input,input.rows()*input.row_size()); } { view(input,input.rows()*input.row_size()); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/ * - Viewing a device pointer on the host is not supported **/
template <class ptr_type> template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols, inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) { UCL_Device &dev) {
@ -197,38 +197,38 @@ class UCL_H_Vec : public UCL_BaseMat {
this->_cq=dev.cq(); this->_cq=dev.cq();
_array=input; _array=input;
_end=_array+_cols; _end=_array+_cols;
#ifdef _OCL_MAT #ifdef _OCL_MAT
_host_view(*this,dev,_row_bytes); _host_view(*this,dev,_row_bytes);
#endif #endif
} }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported * - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ptr_type> template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols, inline void view(ptr_type *input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); } { view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/ * - Viewing a device pointer on the host is not supported **/
template <class ptr_type> template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); } { view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/ * - Viewing a device container on the host is not supported **/
template <class ucl_type> template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) { const size_t cols) {
@ -246,76 +246,76 @@ class UCL_H_Vec : public UCL_BaseMat {
_host_view(*this,input,_row_bytes); _host_view(*this,input,_row_bytes);
#endif #endif
} }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported * - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ucl_type> template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) const size_t cols, const size_t stride)
{ view_offset(offset,input,rows,cols); } { view_offset(offset,input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding) * - If a matrix is used a input, all elements (including padding)
* will be used for view * will be used for view
* - Viewing a device container on the host is not supported **/ * - Viewing a device container on the host is not supported **/
template <class ucl_type> template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); } { view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon /// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container. /** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed. * No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding) * - If a matrix is used a input, all elements (including padding)
* will be used for view * will be used for view
* - Viewing a device container on the host is not supported **/ * - Viewing a device container on the host is not supported **/
template <class ucl_type> template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) inline void view_offset(const size_t offset, ucl_type &input)
{ view_offset(offset,input,input.rows()*input.row_size()-offset); } { view_offset(offset,input,input.rows()*input.row_size()-offset); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/ * - Viewing a device pointer on the host is not supported **/
template <class ptr_type> template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev) const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); } { view(input+offset,rows,cols,dev); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported * - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/ * \param stride Number of _elements_ between the start of each row **/
template <class ptr_type> template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev) const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); } { view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation /// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed. /** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the * - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs * allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/ * - Viewing a device pointer on the host is not supported **/
template <class ptr_type> template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input, inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev) const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); } { view(input+offset,1,cols,dev); }
/// Free memory and set size to 0 /// Free memory and set size to 0
inline void clear() inline void clear()
{ _host_free(*this); _kind=UCL_VIEW; _cols=0; } { _host_free(*this); _kind=UCL_VIEW; _cols=0; }
/// Resize the allocation to contain cols elements /// Resize the allocation to contain cols elements
@ -324,7 +324,7 @@ class UCL_H_Vec : public UCL_BaseMat {
assert(_kind!=UCL_VIEW); assert(_kind!=UCL_VIEW);
_row_bytes=cols*sizeof(numtyp); _row_bytes=cols*sizeof(numtyp);
int err=_host_resize(*this,_row_bytes); int err=_host_resize(*this,_row_bytes);
if (err!=UCL_SUCCESS) { if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes std::cerr << "UCL Error: Could not allocate " << _row_bytes
@ -340,7 +340,7 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+cols; _end=_array+cols;
return err; return err;
} }
/// Resize (only if bigger) the allocation to contain cols elements /// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/ /** \note Cannot be used on views **/
inline int resize_ib(const int cols) inline int resize_ib(const int cols)
@ -348,7 +348,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Set each element to zero /// Set each element to zero
inline void zero() { _host_zero(_array,row_bytes()); } inline void zero() { _host_zero(_array,row_bytes()); }
/// Set first n elements to zero /// Set first n elements to zero
inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); } inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }
@ -373,35 +373,35 @@ class UCL_H_Vec : public UCL_BaseMat {
inline size_t row_bytes() const { return _row_bytes; } inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element /// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); } inline int element_size() const { return sizeof(numtyp); }
/// Get element at index i /// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; } inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i /// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; } inline const numtyp & operator[](const int i) const { return _array[i]; }
/// 2D access (row should always be 0) /// 2D access (row should always be 0)
inline numtyp & operator()(const int row, const int col) inline numtyp & operator()(const int row, const int col)
{ return _array[col]; } { return _array[col]; }
/// 2D access (row should always be 0) /// 2D access (row should always be 0)
inline const numtyp & operator()(const int row, const int col) const inline const numtyp & operator()(const int row, const int col) const
{ return _array[col]; } { return _array[col]; }
/// Returns pointer to memory pointer for allocation on host /// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; } inline numtyp ** host_ptr() { return &_array; }
/// Return the offset (in elements) from begin() pointer where data starts /// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/ /** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; } inline size_t offset() const { return 0; }
/// Return the offset (in bytes) from begin() pointer where data starts /// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/ /** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return 0; } inline size_t byteoff() const { return 0; }
#ifdef _OCL_MAT #ifdef _OCL_MAT
/// For OpenCL, returns a reference to the cl_mem object /// For OpenCL, returns a reference to the cl_mem object
inline device_ptr & cbegin() { return _carray; } inline device_ptr & cbegin() { return _carray; }
/// For OpenCL, returns a reference to the cl_mem object /// For OpenCL, returns a reference to the cl_mem object
inline const device_ptr & cbegin() const { return _carray; } inline const device_ptr & cbegin() const { return _carray; }
#endif #endif
private: private:
numtyp *_array, *_end; numtyp *_array, *_end;
size_t _row_bytes, _cols; size_t _row_bytes, _cols;

View File

@ -34,25 +34,25 @@ class UCL_Matrix {
ROW_MAJOR = 1, ROW_MAJOR = 1,
VECTOR = 0 VECTOR = 0
}; };
typedef hosttype data_type; typedef hosttype data_type;
/// Host Allocation /// Host Allocation
UCL_H_Mat<hosttype> host; UCL_H_Mat<hosttype> host;
/// Device Allocation /// Device Allocation
UCL_D_Mat<devtype> device; UCL_D_Mat<devtype> device;
UCL_Matrix() { } UCL_Matrix() { }
~UCL_Matrix() { } ~UCL_Matrix() { }
/// Construct with specied number of rows and columns /// Construct with specied number of rows and columns
/** \sa alloc() **/ /** \sa alloc() **/
UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc, UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE) const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Set up host matrix with specied # of rows/cols and reserve memory /// Set up host matrix with specied # of rows/cols and reserve memory
/** The kind1 parameter controls memory access from the host /** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host * - UCL_READ_WRITE - Specify that you will read and write from host
@ -74,7 +74,7 @@ class UCL_Matrix {
const enum UCL_MEMOPT kind2=UCL_READ_WRITE) const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); } alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
/// Set up host matrix with specied # of rows/cols and reserve memory /// Set up host matrix with specied # of rows/cols and reserve memory
/** The kind1 parameter controls memory access from the host /** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host * - UCL_READ_WRITE - Specify that you will read and write from host
@ -92,9 +92,9 @@ class UCL_Matrix {
const enum UCL_MEMOPT kind2=UCL_READ_WRITE) const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Free memory and set size to 0 /// Free memory and set size to 0
inline void clear() inline void clear()
{ host.clear(); device.clear(); } { host.clear(); device.clear(); }
/// Resize the allocation to contain cols elements /// Resize the allocation to contain cols elements
@ -106,10 +106,10 @@ class UCL_Matrix {
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
dev_resize(device,host,_buffer,rows,cols); dev_resize(device,host,_buffer,rows,cols);
} }
/// Resize (only if bigger) the allocation to contain cols elements /// Resize (only if bigger) the allocation to contain cols elements
inline int resize_ib(const int new_rows, const int new_cols) inline int resize_ib(const int new_rows, const int new_cols)
{ if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols); { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
else return UCL_SUCCESS; } else return UCL_SUCCESS; }
/// Set each element to zero (asynchronously on device) /// Set each element to zero (asynchronously on device)
@ -118,14 +118,14 @@ class UCL_Matrix {
inline void zero(const int n) { zero(n,cq()); } inline void zero(const int n) { zero(n,cq()); }
/// Set each element to zero (asynchronously on device) /// Set each element to zero (asynchronously on device)
inline void zero(command_queue &cq) { inline void zero(command_queue &cq) {
host.zero(); host.zero();
if (device.kind()!=UCL_VIEW) device.zero(cq); if (device.kind()!=UCL_VIEW) device.zero(cq);
else if (_buffer.numel()>0) _buffer.zero(); else if (_buffer.numel()>0) _buffer.zero();
} }
/// Set first n elements to zero (asynchronously on device) /// Set first n elements to zero (asynchronously on device)
inline void zero(const int n, command_queue &cq) { inline void zero(const int n, command_queue &cq) {
host.zero(n); host.zero(n);
if (device.kind()!=UCL_VIEW) device.zero(n,cq); if (device.kind()!=UCL_VIEW) device.zero(n,cq);
else if (_buffer.numel()>0) _buffer.zero(); else if (_buffer.numel()>0) _buffer.zero();
} }
@ -136,26 +136,26 @@ class UCL_Matrix {
/// Get the number of columns /// Get the number of columns
inline size_t cols() const { return host.cols(); } inline size_t cols() const { return host.cols(); }
/// Get the memory usage (bytes) of the s-object (including any buffers) /// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t host_mem_usage() inline size_t host_mem_usage()
{ return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); } { return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); }
/// Get the memory usage (bytes) of the s-object (including any buffers) /// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t device_mem_usage() inline size_t device_mem_usage()
{ return device.row_bytes()*device.rows(); } { return device.row_bytes()*device.rows(); }
/// Get element at index i /// Get element at index i
inline hosttype & operator[](const int i) { return host[i]; } inline hosttype & operator[](const int i) { return host[i]; }
/// Get element at index i /// Get element at index i
inline const hosttype & operator[](const int i) const { return host[i]; } inline const hosttype & operator[](const int i) const { return host[i]; }
/// 2D access (row should always be 0) /// 2D access (row should always be 0)
inline hosttype & operator()(const int row, const int col) inline hosttype & operator()(const int row, const int col)
{ return host(row,col); } { return host(row,col); }
/// 2D access (row should always be 0) /// 2D access (row should always be 0)
inline const hosttype & operator()(const int row, const int col) const inline const hosttype & operator()(const int row, const int col) const
{ return host(row,col); } { return host(row,col); }
/// Returns pointer to memory pointer for allocation on host /// Returns pointer to memory pointer for allocation on host
inline hosttype ** host_ptr() { return host.host_ptr(); } inline hosttype ** host_ptr() { return host.host_ptr(); }
/// Return the default command queue/stream associated with this data /// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); } inline command_queue & cq() { return host.cq(); }
/// Change the default command queue associated with this data /// Change the default command queue associated with this data
@ -172,7 +172,7 @@ class UCL_Matrix {
/// Update the allocation on the host asynchronously /// Update the allocation on the host asynchronously
inline void update_host() inline void update_host()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,true); } copy(host,device,_buffer,true); }
/// Update the allocation on the host (true for asynchronous copy) /// Update the allocation on the host (true for asynchronous copy)
@ -202,7 +202,7 @@ class UCL_Matrix {
/// Update the allocation on the device asynchronously /// Update the allocation on the device asynchronously
inline void update_device() inline void update_device()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,true); } copy(device,host,_buffer,true); }
/// Update the allocation on the device (true for asynchronous copy) /// Update the allocation on the device (true for asynchronous copy)

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -53,9 +53,9 @@ typedef struct _double4 double4;
#define BLOCK_SIZE_Y blockDim.y #define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__ #define __kernel extern "C" __global__
#define __local __shared__ #define __local __shared__
#define __global #define __global
#define atom_add atomicAdd #define atom_add atomicAdd
#define ucl_inline static __inline__ __device__ #define ucl_inline static __inline__ __device__
#endif #endif

View File

@ -17,10 +17,10 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
// Only allow this file to be included by nvc_memory.h and ocl_memory.h // Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_PRINT_ALLOW #ifdef UCL_PRINT_ALLOW
@ -40,7 +40,7 @@ template <> struct _ucl_print<1> {
} }
template <class mat_type> template <class mat_type>
static inline void p(mat_type &mat, const size_t rows, const size_t cols, static inline void p(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim, std::ostream &out, const std::string delim,
const std::string row_delim) { const std::string row_delim) {
int offset=0; int offset=0;
int row_size=cols; int row_size=cols;
@ -58,12 +58,12 @@ template <> struct _ucl_print<1> {
} }
template <class mat_type> template <class mat_type>
static inline void p(const mat_type &mat,const size_t rows,const size_t cols, static inline void p(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out,const std::string delim, std::ostream &out,const std::string delim,
const std::string row_delim, UCL_Device &dev) { const std::string row_delim, UCL_Device &dev) {
p(mat,rows,cols,out,delim,row_delim); p(mat,rows,cols,out,delim,row_delim);
} }
}; };
template <int mem> struct _ucl_print { template <int mem> struct _ucl_print {
template <class mat_type> template <class mat_type>
static inline void p(mat_type &mat, const size_t n, std::ostream &out, static inline void p(mat_type &mat, const size_t n, std::ostream &out,
@ -83,7 +83,7 @@ template <int mem> struct _ucl_print {
} }
template <class mat_type> template <class mat_type>
static inline void p(mat_type &mat, const size_t rows, const size_t cols, static inline void p(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim, std::ostream &out, const std::string delim,
const std::string row_delim) { const std::string row_delim) {
UCL_H_Vec<typename mat_type::data_type> temp; UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(mat.rows()*mat.cols(),mat); temp.alloc(mat.rows()*mat.cols(),mat);
@ -91,12 +91,12 @@ template <int mem> struct _ucl_print {
ucl_copy(temp,mat,rows*cols,false); ucl_copy(temp,mat,rows*cols,false);
else else
ucl_copy(temp,mat,rows,cols,false); ucl_copy(temp,mat,rows,cols,false);
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
} }
template <class mat_type> template <class mat_type>
static inline void p(const mat_type &mat, const size_t rows, static inline void p(const mat_type &mat, const size_t rows,
const size_t cols,std::ostream &out, const size_t cols,std::ostream &out,
const std::string delim, const std::string delim,
const std::string row_delim, UCL_Device &dev) { const std::string row_delim, UCL_Device &dev) {
UCL_H_Vec<typename mat_type::data_type> temp; UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(mat.rows()*mat.cols(),dev); temp.alloc(mat.rows()*mat.cols(),dev);
@ -104,9 +104,9 @@ template <int mem> struct _ucl_print {
ucl_copy(temp,mat,rows*cols,false); ucl_copy(temp,mat,rows*cols,false);
else else
ucl_copy(temp,mat,rows,cols,false); ucl_copy(temp,mat,rows,cols,false);
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
} }
}; };
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
// - Non-const routines that do not require a device object // - Non-const routines that do not require a device object
@ -123,13 +123,13 @@ inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out,
} }
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim); _ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim);
} }
/// Outputs n elements of mat delimited by a space /// Outputs n elements of mat delimited by a space
template <class mat_type> template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) { inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) {
ucl_print(mat,n,out," "); ucl_print(mat,n,out," ");
} }
/// Outputs n elements of mat delimited by a space to standard out /// Outputs n elements of mat delimited by a space to standard out
template <class mat_type> template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n) { inline void ucl_print(mat_type &mat, const size_t n) {
@ -139,8 +139,8 @@ inline void ucl_print(mat_type &mat, const size_t n) {
/// Outputs upper left rows and cols of mat delimited by the string delim /// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type> template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim, std::ostream &out, const std::string delim,
const std::string row_delim) { const std::string row_delim) {
if (rows*cols>mat.numel()) { if (rows*cols>mat.numel()) {
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix " std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
<< "that only has " << mat.numel() << " elements."; << "that only has " << mat.numel() << " elements.";
@ -148,17 +148,17 @@ inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
} }
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim); _ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim);
} }
/// Outputs upper left rows and cols of mat delimited by a space /// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type> template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out) { std::ostream &out) {
ucl_print(mat,rows,cols,out," ","\n"); ucl_print(mat,rows,cols,out," ","\n");
} }
/// Outputs upper left rows and cols of mat delimited by a space to std out /// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type> template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, inline void ucl_print(mat_type &mat, const size_t rows,
const size_t cols) { const size_t cols) {
ucl_print(mat,rows,cols,std::cout," ","\n"); ucl_print(mat,rows,cols,std::cout," ","\n");
} }
@ -177,7 +177,7 @@ inline void ucl_print(mat_type &mat, std::ostream &out) {
else else
ucl_print(mat,mat.rows(),mat.cols(),out," ","\n"); ucl_print(mat,mat.rows(),mat.cols(),out," ","\n");
} }
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
// - Const routines that do not require a device object // - Const routines that do not require a device object
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
@ -193,14 +193,14 @@ inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
} }
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim,dev); _ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim,dev);
} }
/// Outputs n elements of mat delimited by a space /// Outputs n elements of mat delimited by a space
template <class mat_type> template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out, inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
UCL_Device &dev) { UCL_Device &dev) {
ucl_print(mat,n,out," ",dev); ucl_print(mat,n,out," ",dev);
} }
/// Outputs n elements of mat delimited by a space to standard out /// Outputs n elements of mat delimited by a space to standard out
template <class mat_type> template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n, inline void ucl_print(const mat_type &mat, const size_t n,
@ -211,7 +211,7 @@ inline void ucl_print(const mat_type &mat, const size_t n,
/// Outputs upper left rows and cols of mat delimited by the string delim /// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type> template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out, const std::string delim, std::ostream &out, const std::string delim,
const std::string row_delim, UCL_Device &dev) { const std::string row_delim, UCL_Device &dev) {
if (rows*cols>mat.numel()) { if (rows*cols>mat.numel()) {
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix " std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
@ -220,17 +220,17 @@ inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
} }
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim,dev); _ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim,dev);
} }
/// Outputs upper left rows and cols of mat delimited by a space /// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type> template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out, UCL_Device &dev) { std::ostream &out, UCL_Device &dev) {
ucl_print(mat,rows,cols,out," ","\n",dev); ucl_print(mat,rows,cols,out," ","\n",dev);
} }
/// Outputs upper left rows and cols of mat delimited by a space to std out /// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type> template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t rows, inline void ucl_print(const mat_type &mat, const size_t rows,
const size_t cols, UCL_Device &dev) { const size_t cols, UCL_Device &dev) {
ucl_print(mat,rows,cols,std::cout," ","\n",dev); ucl_print(mat,rows,cols,std::cout," ","\n",dev);
} }
@ -256,27 +256,27 @@ inline void ucl_print(const mat_type &mat, std::ostream &out, UCL_Device &dev) {
template <class numtyp> template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Vec<numtyp> &mat) inline std::ostream & operator << (std::ostream &out, UCL_H_Vec<numtyp> &mat)
{ ucl_print(mat,out); return out; } { ucl_print(mat,out); return out; }
template <class numtyp> template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Mat<numtyp> &mat) inline std::ostream & operator << (std::ostream &out, UCL_H_Mat<numtyp> &mat)
{ ucl_print(mat,out); return out; } { ucl_print(mat,out); return out; }
template <class numtyp> template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Vec<numtyp> &mat) inline std::ostream & operator << (std::ostream &out, UCL_D_Vec<numtyp> &mat)
{ ucl_print(mat,out); return out; } { ucl_print(mat,out); return out; }
template <class numtyp> template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat) inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
{ ucl_print(mat,out); return out; } { ucl_print(mat,out); return out; }
template <class t1, class t2> template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Vector<t1,t2> &mat) inline std::ostream & operator << (std::ostream &out, UCL_Vector<t1,t2> &mat)
{ ucl_print(mat.host,out); return out; } { ucl_print(mat.host,out); return out; }
template <class t1, class t2> template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Matrix<t1,t2> &mat) inline std::ostream & operator << (std::ostream &out, UCL_Matrix<t1,t2> &mat)
{ ucl_print(mat.host,out); return out; } { ucl_print(mat.host,out); return out; }
#endif #endif

View File

@ -3,7 +3,7 @@
------------------- -------------------
W. Michael Brown W. Michael Brown
Helper routines for allocating memory for s-objects and performing Helper routines for allocating memory for s-objects and performing
host/device updates. (Different routines depending on whether the host/device updates. (Different routines depending on whether the
same type is used on the host and device). same type is used on the host and device).
@ -141,29 +141,29 @@ template <> struct _ucl_s_obj_help<1> {
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) { const bool async) {
ucl_copy(dst,src,cols,async); ucl_copy(dst,src,cols,async);
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) { command_queue &cq) {
ucl_copy(dst,src,cols,cq); ucl_copy(dst,src,cols,cq);
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) { t3 &buffer, const bool async) {
ucl_copy(dst,src,rows,cols,async); ucl_copy(dst,src,rows,cols,async);
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) { t3 &buffer, command_queue &cq) {
ucl_copy(dst,src,rows,cols,cq); ucl_copy(dst,src,rows,cols,cq);
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
if (device.kind()==UCL_VIEW) { if (device.kind()==UCL_VIEW) {
@ -181,7 +181,7 @@ template <> struct _ucl_s_obj_help<1> {
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
const int cols) { const int cols) {
if (device.kind()==UCL_VIEW) { if (device.kind()==UCL_VIEW) {
device.view(host); device.view(host);
@ -255,7 +255,7 @@ template <int st> struct _ucl_s_obj_help {
e1=_buffer.alloc(cols,cq,kind1); e1=_buffer.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS) if (e1!=UCL_SUCCESS)
return e1; return e1;
return device.alloc(cols,cq,kind2); return device.alloc(cols,cq,kind2);
} }
} }
@ -314,7 +314,7 @@ template <int st> struct _ucl_s_obj_help {
e1=_buffer.alloc(rows,cols,cq,kind1); e1=_buffer.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS) if (e1!=UCL_SUCCESS)
return e1; return e1;
return device.alloc(rows,cols,cq,kind2); return device.alloc(rows,cols,cq,kind2);
} }
} }
@ -329,25 +329,25 @@ template <int st> struct _ucl_s_obj_help {
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) { const bool async) {
ucl_cast_copy(dst,src,cols,buffer,async); ucl_cast_copy(dst,src,cols,buffer,async);
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) { command_queue &cq) {
ucl_cast_copy(dst,src,cols,buffer,cq); ucl_cast_copy(dst,src,cols,buffer,cq);
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) { t3 &buffer, const bool async) {
ucl_cast_copy(dst,src,rows,cols,buffer,async); ucl_cast_copy(dst,src,rows,cols,buffer,async);
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) { t3 &buffer, command_queue &cq) {
ucl_cast_copy(dst,src,rows,cols,buffer,cq); ucl_cast_copy(dst,src,rows,cols,buffer,cq);
} }
@ -373,7 +373,7 @@ template <int st> struct _ucl_s_obj_help {
} }
template <class t1, class t2, class t3> template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
const int cols) { const int cols) {
int err=buff.resize(rows,cols); int err=buff.resize(rows,cols);
if (err!=UCL_SUCCESS) if (err!=UCL_SUCCESS)

View File

@ -17,7 +17,7 @@
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the Simplified BSD License. the Simplified BSD License.
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */
@ -26,65 +26,65 @@
// Assign an integer id based on the data type: (int, float, double, etc) // Assign an integer id based on the data type: (int, float, double, etc)
template <class eltype> struct _UCL_DATA_ID; template <class eltype> struct _UCL_DATA_ID;
template <> struct _UCL_DATA_ID<double> { template <> struct _UCL_DATA_ID<double> {
enum { id=1 }; enum { id=1 };
static inline const char * name() { return "double"; } static inline const char * name() { return "double"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=double"; } static inline const char * numtyp_flag() { return "-D NUMTYP=double"; }
}; };
template <> struct _UCL_DATA_ID<float> { template <> struct _UCL_DATA_ID<float> {
enum { id=2 }; enum { id=2 };
static inline const char * name() { return "float"; } static inline const char * name() { return "float"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=float"; } static inline const char * numtyp_flag() { return "-D NUMTYP=float"; }
}; };
template <> struct _UCL_DATA_ID<unsigned> { template <> struct _UCL_DATA_ID<unsigned> {
enum { id=3 }; enum { id=3 };
static inline const char * name() { return "unsigned"; } static inline const char * name() { return "unsigned"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; } static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; }
}; };
template <> struct _UCL_DATA_ID<int> { template <> struct _UCL_DATA_ID<int> {
enum { id=4 }; enum { id=4 };
static inline const char * name() { return "int"; } static inline const char * name() { return "int"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=int"; } static inline const char * numtyp_flag() { return "-D NUMTYP=int"; }
}; };
template <> struct _UCL_DATA_ID<char> { template <> struct _UCL_DATA_ID<char> {
enum { id=5 }; enum { id=5 };
static inline const char * name() { return "char"; } static inline const char * name() { return "char"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=char"; } static inline const char * numtyp_flag() { return "-D NUMTYP=char"; }
}; };
template <> struct _UCL_DATA_ID<unsigned char> { template <> struct _UCL_DATA_ID<unsigned char> {
enum { id=6 }; enum { id=6 };
static inline const char * name() { return "unsigned char"; } static inline const char * name() { return "unsigned char"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; } static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; }
}; };
template <> struct _UCL_DATA_ID<short> { template <> struct _UCL_DATA_ID<short> {
enum { id=7 }; enum { id=7 };
static inline const char * name() { return "short"; } static inline const char * name() { return "short"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=short"; } static inline const char * numtyp_flag() { return "-D NUMTYP=short"; }
}; };
template <> struct _UCL_DATA_ID<unsigned short> { template <> struct _UCL_DATA_ID<unsigned short> {
enum { id=8 }; enum { id=8 };
static inline const char * name() { return "unsigned short"; } static inline const char * name() { return "unsigned short"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; } static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; }
}; };
template <> struct _UCL_DATA_ID<long> { template <> struct _UCL_DATA_ID<long> {
enum { id=9 }; enum { id=9 };
static inline const char * name() { return "long"; } static inline const char * name() { return "long"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=long"; } static inline const char * numtyp_flag() { return "-D NUMTYP=long"; }
}; };
template <> struct _UCL_DATA_ID<unsigned long> { template <> struct _UCL_DATA_ID<unsigned long> {
enum { id=10 }; enum { id=10 };
static inline const char * name() { return "unsigned long"; } static inline const char * name() { return "unsigned long"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; } static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; }
}; };
template <> struct _UCL_DATA_ID<long double> { template <> struct _UCL_DATA_ID<long double> {
enum { id=11 }; enum { id=11 };
static inline const char * name() { return "long double"; } static inline const char * name() { return "long double"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; } static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; }
}; };
template <class eltype> struct _UCL_DATA_ID { template <class eltype> struct _UCL_DATA_ID {
enum { id=0 }; enum { id=0 };
static inline const char * name() { return "error_type"; } static inline const char * name() { return "error_type"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; } static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; }
}; };
// Host memory allocation types // Host memory allocation types
@ -97,7 +97,7 @@ enum UCL_MEMOPT {
UCL_NOT_SPECIFIED UCL_NOT_SPECIFIED
}; };
enum UCL_DEVICE_TYPE { enum UCL_DEVICE_TYPE {
UCL_DEFAULT, ///< Unknown device type UCL_DEFAULT, ///< Unknown device type
UCL_CPU, ///< Device is a CPU UCL_CPU, ///< Device is a CPU
UCL_GPU, ///< Device is a GPU UCL_GPU, ///< Device is a GPU
@ -111,7 +111,7 @@ enum UCL_ERROR_FLAG {
UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found
UCL_COMPILE_ERROR, ///< Error compiling kernel UCL_COMPILE_ERROR, ///< Error compiling kernel
UCL_MEMORY_ERROR UCL_MEMORY_ERROR
}; };
template <class numtyp> template <class numtyp>
const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); } const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }

View File

@ -34,25 +34,25 @@ class UCL_Vector {
ROW_MAJOR = 1, ROW_MAJOR = 1,
VECTOR = 1 VECTOR = 1
}; };
typedef hosttype data_type; typedef hosttype data_type;
/// Host Allocation /// Host Allocation
UCL_H_Vec<hosttype> host; UCL_H_Vec<hosttype> host;
/// Device Allocation /// Device Allocation
UCL_D_Vec<devtype> device; UCL_D_Vec<devtype> device;
UCL_Vector() { } UCL_Vector() { }
~UCL_Vector() { } ~UCL_Vector() { }
/// Construct with n columns /// Construct with n columns
/** \sa alloc() **/ /** \sa alloc() **/
UCL_Vector(const size_t cols, UCL_Device &acc, UCL_Vector(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE) const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); } alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Set up the vector with 'cols' columns and reserve memory /// Set up the vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory access from the host /** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host * - UCL_READ_WRITE - Specify that you will read and write from host
@ -89,12 +89,12 @@ class UCL_Vector {
* \return UCL_SUCCESS if the memory allocation is successful **/ * \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &acc, inline int alloc(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE) const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); } alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Free memory and set size to 0 /// Free memory and set size to 0
inline void clear() inline void clear()
{ host.clear(); device.clear(); } { host.clear(); device.clear(); }
/// Resize the allocation to contain cols elements /// Resize the allocation to contain cols elements
@ -106,7 +106,7 @@ class UCL_Vector {
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
dev_resize(device,host,_buffer,cols); dev_resize(device,host,_buffer,cols);
} }
/// Resize (only if bigger) the allocation to contain cols elements /// Resize (only if bigger) the allocation to contain cols elements
inline int resize_ib(const int new_cols) inline int resize_ib(const int new_cols)
{ if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; } { if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
@ -117,14 +117,14 @@ class UCL_Vector {
inline void zero(const int n) { zero(n,cq()); } inline void zero(const int n) { zero(n,cq()); }
/// Set each element to zero (asynchronously on device) /// Set each element to zero (asynchronously on device)
inline void zero(command_queue &cq) { inline void zero(command_queue &cq) {
host.zero(); host.zero();
if (device.kind()!=UCL_VIEW) device.zero(cq); if (device.kind()!=UCL_VIEW) device.zero(cq);
else if (_buffer.numel()>0) _buffer.zero(); else if (_buffer.numel()>0) _buffer.zero();
} }
/// Set first n elements to zero (asynchronously on device) /// Set first n elements to zero (asynchronously on device)
inline void zero(const int n, command_queue &cq) { inline void zero(const int n, command_queue &cq) {
host.zero(n); host.zero(n);
if (device.kind()!=UCL_VIEW) device.zero(n,cq); if (device.kind()!=UCL_VIEW) device.zero(n,cq);
else if (_buffer.numel()>0) _buffer.zero(); else if (_buffer.numel()>0) _buffer.zero();
} }
@ -135,27 +135,27 @@ class UCL_Vector {
/// Get the number of columns /// Get the number of columns
inline size_t cols() const { return host.cols(); } inline size_t cols() const { return host.cols(); }
/// Get the memory usage (bytes) of the s-object (including any buffers) /// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t host_mem_usage() inline size_t host_mem_usage()
{ return host.row_bytes()+_buffer.row_bytes(); } { return host.row_bytes()+_buffer.row_bytes(); }
/// Get the memory usage (bytes) of the s-object (including any buffers) /// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t device_mem_usage() inline size_t device_mem_usage()
{ return device.row_bytes(); } { return device.row_bytes(); }
/// Get element at index i /// Get element at index i
inline hosttype & operator[](const int i) { return host[i]; } inline hosttype & operator[](const int i) { return host[i]; }
/// Get element at index i /// Get element at index i
inline const hosttype & operator[](const int i) const { return host[i]; } inline const hosttype & operator[](const int i) const { return host[i]; }
/// 2D access (row should always be 0) /// 2D access (row should always be 0)
inline hosttype & operator()(const int row, const int col) inline hosttype & operator()(const int row, const int col)
{ return host[col]; } { return host[col]; }
/// 2D access (row should always be 0) /// 2D access (row should always be 0)
inline const hosttype & operator()(const int row, const int col) const inline const hosttype & operator()(const int row, const int col) const
{ return host[col]; } { return host[col]; }
/// Returns pointer to memory pointer for allocation on host /// Returns pointer to memory pointer for allocation on host
inline hosttype ** host_ptr() { return host.host_ptr(); } inline hosttype ** host_ptr() { return host.host_ptr(); }
/// Return the default command queue/stream associated with this data /// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); } inline command_queue & cq() { return host.cq(); }
/// Change the default command queue associated with this data /// Change the default command queue associated with this data
@ -172,7 +172,7 @@ class UCL_Vector {
/// Update the allocation on the host asynchronously /// Update the allocation on the host asynchronously
inline void update_host() inline void update_host()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,true); } copy(host,device,_buffer,true); }
/// Update the allocation on the host (true for asynchronous copy) /// Update the allocation on the host (true for asynchronous copy)
@ -202,7 +202,7 @@ class UCL_Vector {
/// Update the allocation on the device asynchronously /// Update the allocation on the device asynchronously
inline void update_device() inline void update_device()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >:: { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,true); } copy(device,host,_buffer,true); }
/// Update the allocation on the device (true for asynchronous copy) /// Update the allocation on the device (true for asynchronous copy)

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -24,7 +24,7 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int AnswerT::bytes_per_atom() const { int AnswerT::bytes_per_atom() const {
int bytes=11*sizeof(acctyp); int bytes=11*sizeof(acctyp);
if (_rot) if (_rot)
bytes+=4*sizeof(acctyp); bytes+=4*sizeof(acctyp);
@ -38,19 +38,19 @@ bool AnswerT::alloc(const int inum) {
_max_local=static_cast<int>(static_cast<double>(inum)*1.10); _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
bool success=true; bool success=true;
_ans_fields=4; _ans_fields=4;
if (_rot) if (_rot)
_ans_fields+=4; _ans_fields+=4;
// --------------------------- Device allocations // --------------------------- Device allocations
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
UCL_READ_WRITE)==UCL_SUCCESS); UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY, success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY,
UCL_READ_WRITE)==UCL_SUCCESS); UCL_READ_WRITE)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
_allocated=true; _allocated=true;
return success; return success;
} }
@ -69,21 +69,21 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot,
if (_charge) if (_charge)
_e_fields++; _e_fields++;
_ev_fields=6+_e_fields; _ev_fields=6+_e_fields;
// Initialize atom and nbor data // Initialize atom and nbor data
int ef_inum=inum; int ef_inum=inum;
if (ef_inum==0) if (ef_inum==0)
ef_inum=1000; ef_inum=1000;
// Initialize timers for the selected device // Initialize timers for the selected device
time_answer.init(*dev); time_answer.init(*dev);
time_answer.zero(); time_answer.zero();
_time_cast=0.0; _time_cast=0.0;
_time_cpu_idle=0.0; _time_cpu_idle=0.0;
return success && alloc(ef_inum); return success && alloc(ef_inum);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
bool AnswerT::add_fields(const bool charge, const bool rot) { bool AnswerT::add_fields(const bool charge, const bool rot) {
bool realloc=false; bool realloc=false;
@ -127,15 +127,15 @@ void AnswerT::clear() {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
double AnswerT::host_memory_usage() const { double AnswerT::host_memory_usage() const {
int atom_bytes=4; int atom_bytes=4;
if (_charge) if (_charge)
atom_bytes+=1; atom_bytes+=1;
if (_rot) if (_rot)
atom_bytes+=4; atom_bytes+=4;
int ans_bytes=atom_bytes+_ev_fields; int ans_bytes=atom_bytes+_ev_fields;
return ans_bytes*(_max_local)*sizeof(acctyp)+ return ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(Answer<numtyp,acctyp>); sizeof(Answer<numtyp,acctyp>);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AnswerT::copy_answers(const bool eflag, const bool vflag, void AnswerT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom) { const bool ef_atom, const bool vf_atom) {
@ -144,8 +144,8 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
_vflag=vflag; _vflag=vflag;
_ef_atom=ef_atom; _ef_atom=ef_atom;
_vf_atom=vf_atom; _vf_atom=vf_atom;
int csize=_ev_fields; int csize=_ev_fields;
if (!eflag) if (!eflag)
csize-=_e_fields; csize-=_e_fields;
if (!vflag) if (!vflag)
@ -180,7 +180,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=0; i<_inum; i++) for (int i=0; i<_inum; i++)
evdwl+=engv[i]; evdwl+=engv[i];
if (_ef_atom) if (_ef_atom)
if (_ilist==NULL) if (_ilist==NULL)
for (int i=0; i<_inum; i++) for (int i=0; i<_inum; i++)
eatom[i]+=engv[i]; eatom[i]+=engv[i];
else else
@ -196,18 +196,18 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
if (_vf_atom) if (_vf_atom)
if (_ilist==NULL) { if (_ilist==NULL) {
int ii=0; int ii=0;
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
vatom[ii++][j]+=engv[i]; vatom[ii++][j]+=engv[i];
} else { } else {
int ii=0; int ii=0;
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i]; vatom[_ilist[ii++]][j]+=engv[i];
} }
vstart+=_inum; vstart+=_inum;
iend+=_inum; iend+=_inum;
} }
} }
return evdwl; return evdwl;
} }
@ -242,8 +242,8 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
} }
vstart=iend; vstart=iend;
iend+=_inum; iend+=_inum;
} }
if (_vflag) { if (_vflag) {
for (int j=0; j<6; j++) { for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
virial[j]+=engv[i]; virial[j]+=engv[i];
@ -254,12 +254,12 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
} else { } else {
for (int i=vstart, ii=0; i<iend; i++) for (int i=vstart, ii=0; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i]; vatom[_ilist[ii++]][j]+=engv[i];
} }
vstart+=_inum; vstart+=_inum;
iend+=_inum; iend+=_inum;
} }
} }
return evdwl; return evdwl;
} }

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -30,7 +30,7 @@ AtomT::Atom() : _compiled(false),_allocated(false),
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int AtomT::bytes_per_atom() const { int AtomT::bytes_per_atom() const {
int id_space=0; int id_space=0;
if (_gpu_nbor==1) if (_gpu_nbor==1)
id_space=2; id_space=2;
@ -51,7 +51,7 @@ bool AtomT::alloc(const int nall) {
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10); _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
bool success=true; bool success=true;
// Ignore host/device transfers? // Ignore host/device transfers?
_host_view=false; _host_view=false;
if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) { if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) {
@ -60,11 +60,11 @@ bool AtomT::alloc(const int nall) {
assert(0==1); assert(0==1);
#endif #endif
} }
// Allocate storage for CUDPP sort // Allocate storage for CUDPP sort
#ifdef USE_CUDPP #ifdef USE_CUDPP
if (_gpu_nbor==1) { if (_gpu_nbor==1) {
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
if (CUDPP_SUCCESS != result) if (CUDPP_SUCCESS != result)
return false; return false;
} }
@ -110,7 +110,7 @@ bool AtomT::alloc(const int nall) {
} else { } else {
success=success && (host_particle_id.alloc(_max_atoms,*dev, success=success && (host_particle_id.alloc(_max_atoms,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS); UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
} }
if (_gpu_nbor==2 && _host_view) if (_gpu_nbor==2 && _host_view)
@ -124,8 +124,8 @@ bool AtomT::alloc(const int nall) {
gpu_bytes+=x.device.row_bytes(); gpu_bytes+=x.device.row_bytes();
if (gpu_bytes>_max_gpu_bytes) if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes; _max_gpu_bytes=gpu_bytes;
_allocated=true; _allocated=true;
return success; return success;
} }
@ -135,7 +135,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
bool success=true; bool success=true;
// Ignore host/device transfers? // Ignore host/device transfers?
int gpu_bytes=0; int gpu_bytes=0;
if (charge && _charge==false) { if (charge && _charge==false) {
_charge=true; _charge=true;
_other=true; _other=true;
@ -179,7 +179,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
_gpu_nbor=gpu_nbor; _gpu_nbor=gpu_nbor;
#ifdef USE_CUDPP #ifdef USE_CUDPP
if (_gpu_nbor==1) { if (_gpu_nbor==1) {
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
if (CUDPP_SUCCESS != result) if (CUDPP_SUCCESS != result)
return false; return false;
} }
@ -198,9 +198,9 @@ bool AtomT::add_fields(const bool charge, const bool rot,
} else { } else {
success=success && (host_particle_id.alloc(_max_atoms,*dev, success=success && (host_particle_id.alloc(_max_atoms,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS); UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
} }
} }
return success; return success;
@ -230,7 +230,7 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
int ef_nall=nall; int ef_nall=nall;
if (ef_nall==0) if (ef_nall==0)
ef_nall=2000; ef_nall=2000;
// Initialize timers for the selected device // Initialize timers for the selected device
time_pos.init(*dev); time_pos.init(*dev);
time_q.init(*dev); time_q.init(*dev);
@ -241,14 +241,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
time_quat.zero(); time_quat.zero();
time_vel.zero(); time_vel.zero();
_time_cast=0.0; _time_cast=0.0;
#ifdef GPU_CAST #ifdef GPU_CAST
compile_kernels(*dev); compile_kernels(*dev);
#endif #endif
return success && alloc(ef_nall); return success && alloc(ef_nall);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AtomT::clear_resize() { void AtomT::clear_resize() {
if (!_allocated) if (!_allocated)
@ -274,7 +274,7 @@ void AtomT::clear_resize() {
#ifdef USE_CUDPP #ifdef USE_CUDPP
if (_gpu_nbor==1) cudppDestroyPlan(sort_plan); if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
#endif #endif
if (_gpu_nbor==2) { if (_gpu_nbor==2) {
host_particle_id.clear(); host_particle_id.clear();
host_cell_id.clear(); host_cell_id.clear();
@ -305,21 +305,21 @@ void AtomT::clear() {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
double AtomT::host_memory_usage() const { double AtomT::host_memory_usage() const {
int atom_bytes=4; int atom_bytes=4;
if (_charge) if (_charge)
atom_bytes+=1; atom_bytes+=1;
if (_rot) if (_rot)
atom_bytes+=4; atom_bytes+=4;
if (_vel) if (_vel)
atom_bytes+=4; atom_bytes+=4;
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>); return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
} }
// Sort arrays for neighbor list calculation // Sort arrays for neighbor list calculation
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AtomT::sort_neighbor(const int num_atoms) { void AtomT::sort_neighbor(const int num_atoms) {
#ifdef USE_CUDPP #ifdef USE_CUDPP
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin(), (int *)dev_particle_id.begin(),
8*sizeof(unsigned), num_atoms); 8*sizeof(unsigned), num_atoms);
if (CUDPP_SUCCESS != result) { if (CUDPP_SUCCESS != result) {
printf("Error in cudppSort\n"); printf("Error in cudppSort\n");

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : brownw@ornl.gov // email : brownw@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -17,9 +17,9 @@
#include "lal_preprocessor.h" #include "lal_preprocessor.h"
#endif #endif
__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, __kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
const __global double *restrict x, const __global double *restrict x,
const __global int *restrict type, const __global int *restrict type,
const int nall) { const int nall) {
int ii=GLOBAL_ID_X; int ii=GLOBAL_ID_X;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -57,19 +57,19 @@ class Atom {
/// Set number of local+ghost atoms for future copy operations /// Set number of local+ghost atoms for future copy operations
inline void nall(const int n) { _nall=n; } inline void nall(const int n) { _nall=n; }
/// Memory usage per atom in this class /// Memory usage per atom in this class
int bytes_per_atom() const; int bytes_per_atom() const;
/// Clear any previous data and set up for a new LAMMPS run /// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions /** \param rot True if atom storage needs quaternions
* \param gpu_nbor 0 if neighboring will be performed on host * \param gpu_nbor 0 if neighboring will be performed on host
* gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/ * gpu_nbor 2 if binning on host and neighboring on device **/
bool init(const int nall, const bool charge, const bool rot, bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
const bool vel=false); const bool vel=false);
/// Check if we have enough device storage and realloc if not /// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/ /** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) { inline bool resize(const int nall, bool &success) {
@ -81,7 +81,7 @@ class Atom {
} }
return _resized; return _resized;
} }
/// If already initialized by another LAMMPS style, add fields as necessary /// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions /** \param rot True if atom storage needs quaternions
* \param gpu_nbor 0 if neighboring will be performed on host * \param gpu_nbor 0 if neighboring will be performed on host
@ -89,28 +89,28 @@ class Atom {
* gpu_nbor 2 if binning on host and neighboring on device **/ * gpu_nbor 2 if binning on host and neighboring on device **/
bool add_fields(const bool charge, const bool rot, const int gpu_nbor, bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
const bool bonds, const bool vel=false); const bool bonds, const bool vel=false);
/// Returns true if GPU is using charges /// Returns true if GPU is using charges
bool charge() { return _charge; } bool charge() { return _charge; }
/// Returns true if GPU is using quaternions /// Returns true if GPU is using quaternions
bool quaternion() { return _rot; } bool quaternion() { return _rot; }
/// Returns true if GPU is using velocities /// Returns true if GPU is using velocities
bool velocity() { return _vel; } bool velocity() { return _vel; }
/// Only free matrices of length inum or nall for resizing /// Only free matrices of length inum or nall for resizing
void clear_resize(); void clear_resize();
/// Free all memory on host and device /// Free all memory on host and device
void clear(); void clear();
/// Return the total amount of host memory used by class in bytes /// Return the total amount of host memory used by class in bytes
double host_memory_usage() const; double host_memory_usage() const;
/// Sort arrays for neighbor list calculation on device /// Sort arrays for neighbor list calculation on device
void sort_neighbor(const int num_atoms); void sort_neighbor(const int num_atoms);
/// Add copy times to timers /// Add copy times to timers
inline void acc_timers() { inline void acc_timers() {
time_pos.add_to_total(); time_pos.add_to_total();
@ -150,18 +150,18 @@ class Atom {
total+=time_vel.total_seconds(); total+=time_vel.total_seconds();
time_vel.zero_total(); time_vel.zero_total();
} }
return total+_time_transfer/1000.0; return total+_time_transfer/1000.0;
} }
/// Return the total time for data cast/pack /// Return the total time for data cast/pack
/** Zeros the time so that atom times are only included once **/ /** Zeros the time so that atom times are only included once **/
inline double cast_time() inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; } { double t=_time_cast; _time_cast=0.0; return t; }
/// Pack LAMMPS atom type constants into matrix and copy to device /// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1> template <class dev_typ, class t1>
inline void type_pack1(const int n, const int m_size, inline void type_pack1(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one) { t1 **one) {
int ii=0; int ii=0;
@ -215,7 +215,7 @@ class Atom {
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false); ucl_copy(dev_v,view,false);
} }
/// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3, class t4> template <class dev_typ, class t1, class t2, class t3, class t4>
inline void type_pack4(const int n, const int m_size, inline void type_pack4(const int n, const int m_size,
@ -239,7 +239,7 @@ class Atom {
/// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2> template <class dev_typ, class t1, class t2>
inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v, inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) { UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
for (int i=0; i<n; i++) { for (int i=0; i<n; i++) {
buffer[i*2]=static_cast<numtyp>(one[i][i]); buffer[i*2]=static_cast<numtyp>(one[i][i]);
@ -279,7 +279,7 @@ class Atom {
/// Copy positions and types to device asynchronously /// Copy positions and types to device asynchronously
/** Copies nall() elements **/ /** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) { inline void add_x_data(double **host_ptr, int *host_type) {
time_pos.start(); time_pos.start();
if (_x_avail==false) { if (_x_avail==false) {
#ifdef GPU_CAST #ifdef GPU_CAST
@ -376,7 +376,7 @@ class Atom {
/// Copy velocities and tags to device asynchronously /// Copy velocities and tags to device asynchronously
/** Copies nall() elements **/ /** Copies nall() elements **/
inline void add_v_data(double **host_ptr, tagint *host_tag) { inline void add_v_data(double **host_ptr, tagint *host_tag) {
time_vel.start(); time_vel.start();
if (_v_avail==false) { if (_v_avail==false) {
#ifdef GPU_CAST #ifdef GPU_CAST
@ -407,8 +407,8 @@ class Atom {
inline void add_transfer_time(double t) { _time_transfer+=t; } inline void add_transfer_time(double t) { _time_transfer+=t; }
/// Return number of bytes used on device /// Return number of bytes used on device
inline double max_gpu_bytes() inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
/// Returns true if the device is addressing memory on the host /// Returns true if the device is addressing memory on the host
inline bool host_view() { return _host_view; } inline bool host_view() { return _host_view; }
@ -422,7 +422,7 @@ class Atom {
/// Quaterions /// Quaterions
UCL_Vector<numtyp,numtyp> quat; UCL_Vector<numtyp,numtyp> quat;
/// Velocities /// Velocities
UCL_Vector<numtyp,numtyp> v; UCL_Vector<numtyp,numtyp> v;
#ifdef GPU_CAST #ifdef GPU_CAST
UCL_Vector<double,double> x_cast; UCL_Vector<double,double> x_cast;
@ -436,7 +436,7 @@ class Atom {
/// Atom tag information for device nbor builds /// Atom tag information for device nbor builds
UCL_D_Vec<tagint> dev_tag; UCL_D_Vec<tagint> dev_tag;
/// Cell list identifiers for hybrid nbor builds /// Cell list identifiers for hybrid nbor builds
UCL_H_Vec<int> host_cell_id; UCL_H_Vec<int> host_cell_id;
/// Cell list identifiers for hybrid nbor builds /// Cell list identifiers for hybrid nbor builds
@ -444,7 +444,7 @@ class Atom {
/// Device timers /// Device timers
UCL_Timer time_pos, time_q, time_quat, time_vel; UCL_Timer time_pos, time_q, time_quat, time_vel;
/// Geryon device /// Geryon device
UCL_Device *dev; UCL_Device *dev;
@ -456,19 +456,19 @@ class Atom {
#endif #endif
bool _compiled; bool _compiled;
// True if data has been copied to device already // True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
bool alloc(const int nall); bool alloc(const int nall);
bool _allocated, _rot, _charge, _bonds, _vel, _other; bool _allocated, _rot, _charge, _bonds, _vel, _other;
int _max_atoms, _nall, _gpu_nbor; int _max_atoms, _nall, _gpu_nbor;
bool _host_view; bool _host_view;
double _time_cast, _time_transfer; double _time_cast, _time_transfer;
double _max_gpu_bytes; double _max_gpu_bytes;
#ifdef USE_CUDPP #ifdef USE_CUDPP
CUDPPConfiguration sort_config; CUDPPConfiguration sort_config;
CUDPPHandle sort_plan; CUDPPHandle sort_plan;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -44,7 +44,7 @@ class Balance {
_init_done=false; _init_done=false;
} }
} }
/// Return the timestep since initialization /// Return the timestep since initialization
inline int timestep() { return _timestep; } inline int timestep() { return _timestep; }
@ -96,7 +96,7 @@ class Balance {
inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } } inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } }
/// Calculate the new host/device split based on the cpu and device times /// Calculate the new host/device split based on the cpu and device times
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps /** \note Only does calculation every _HD_BALANCE_EVERY timesteps
(and first 10) **/ (and first 10) **/
inline void balance(const double cpu_time); inline void balance(const double cpu_time);
@ -105,13 +105,13 @@ class Balance {
balance(cpu_time); balance(cpu_time);
return get_gpu_count(ago,inum_full); return get_gpu_count(ago,inum_full);
} }
private: private:
Device<numtyp,acctyp> *_device; Device<numtyp,acctyp> *_device;
UCL_Timer _device_time; UCL_Timer _device_time;
bool _init_done; bool _init_done;
int _gpu_nbor; int _gpu_nbor;
bool _load_balance; bool _load_balance;
double _actual_split, _avg_split, _desired_split, _max_split; double _actual_split, _avg_split, _desired_split, _max_split;
int _avg_count; int _avg_count;
@ -123,15 +123,15 @@ class Balance {
#define BalanceT Balance<numtyp,acctyp> #define BalanceT Balance<numtyp,acctyp>
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BalanceT::init(Device<numtyp, acctyp> *gpu, void BalanceT::init(Device<numtyp, acctyp> *gpu,
const int gpu_nbor, const double split) { const int gpu_nbor, const double split) {
clear(); clear();
_gpu_nbor=gpu_nbor; _gpu_nbor=gpu_nbor;
_init_done=true; _init_done=true;
_device=gpu; _device=gpu;
_device_time.init(*gpu->gpu); _device_time.init(*gpu->gpu);
if (split<0.0) { if (split<0.0) {
_load_balance=true; _load_balance=true;
_desired_split=0.90; _desired_split=0.90;
@ -163,7 +163,7 @@ int BalanceT::get_gpu_count(const int ago, const int inum_full) {
_timestep++; _timestep++;
return _inum; return _inum;
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BalanceT::balance(const double cpu_time) { void BalanceT::balance(const double cpu_time) {
if (_measure_this_step) { if (_measure_this_step) {

View File

@ -9,10 +9,10 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
#include "lal_base_atomic.h" #include "lal_base_atomic.h"
using namespace LAMMPS_AL; using namespace LAMMPS_AL;
#define BaseAtomicT BaseAtomic<numtyp, acctyp> #define BaseAtomicT BaseAtomic<numtyp, acctyp>
@ -63,13 +63,13 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_packed); _nbor_data=&(nbor->dev_packed);
} else } else
_nbor_data=&(nbor->dev_nbor); _nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false, maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom); _threads_per_atom);
if (success!=0) if (success!=0)
return success; return success;
ucl_device=device->gpu; ucl_device=device->gpu;
atom=&device->atom; atom=&device->atom;
@ -139,7 +139,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist,
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes) if (bytes>_max_an_bytes)
_max_an_bytes=bytes; _max_an_bytes=bytes;
return ilist; return ilist;
} }
@ -188,7 +188,7 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full,
zero_timers(); zero_timers();
return; return;
} }
int ago=hd_balancer.ago_first(f_ago); int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time); int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum); ans->inum(inum);
@ -217,7 +217,7 @@ template <class numtyp, class acctyp>
int ** BaseAtomicT::compute(const int ago, const int inum_full, int ** BaseAtomicT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag, int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const bool vatom, int &host_start,
int **ilist, int **jnum, int **ilist, int **jnum,
@ -230,12 +230,12 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full,
zero_timers(); zero_timers();
return NULL; return NULL;
} }
hd_balancer.balance(cpu_time); hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full); int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum); ans->inum(inum);
host_start=inum; host_start=inum;
// Build neighbor list on GPU if necessary // Build neighbor list on GPU if necessary
if (ago==0) { if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -255,7 +255,7 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full,
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start; return nbor->host_jlist.begin()-host_start;
} }

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -41,7 +41,7 @@ class BaseAtomic {
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation * \param k_name name for the kernel for force calculation
* *
* Returns: * Returns:
* - 0 if successful * - 0 if successful
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -49,8 +49,8 @@ class BaseAtomic {
* - -4 if the GPU library was not compiled for GPU * - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors, int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name); const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver /// Estimate the overhead for GPU context changes and CPU driver
@ -80,7 +80,7 @@ class BaseAtomic {
* \note host_inum is 0 if the host is performing neighboring * \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles * \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/ * \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum, inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) { const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success); nbor->resize(inum,host_inum,max_nbors,success);
} }
@ -119,7 +119,7 @@ class BaseAtomic {
/// Build neighbor list on device /// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum, void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, bool &success); tagint **special, bool &success);
/// Pair loop with host neighboring /// Pair loop with host neighboring
@ -133,19 +133,19 @@ class BaseAtomic {
int * compute(const int ago, const int inum_full, int * compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo, const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success); const double cpu_time, bool &success);
/// Pair loop with device neighboring /// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full, int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo, const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success); int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA ------------------------- // -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage /// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device; Device<numtyp,acctyp> *device;

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -64,7 +64,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_packed); _nbor_data=&(nbor->dev_packed);
} else } else
_nbor_data=&(nbor->dev_nbor); _nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false, maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom); _threads_per_atom);
@ -153,7 +153,7 @@ template <class numtyp, class acctyp>
inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, const int nall, double **host_x,
int *host_type, double *sublo, int *host_type, double *sublo,
double *subhi, tagint *tag, double *subhi, tagint *tag,
int **nspecial, tagint **special, int **nspecial, tagint **special,
bool &success) { bool &success) {
success=true; success=true;
@ -192,7 +192,7 @@ void BaseChargeT::compute(const int f_ago, const int inum_full,
zero_timers(); zero_timers();
return; return;
} }
int ago=hd_balancer.ago_first(f_ago); int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time); int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum); ans->inum(inum);
@ -226,7 +226,7 @@ template <class numtyp, class acctyp>
int** BaseChargeT::compute(const int ago, const int inum_full, int** BaseChargeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag, int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const bool vatom, int &host_start,
int **ilist, int **jnum, int **ilist, int **jnum,
@ -240,12 +240,12 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
zero_timers(); zero_timers();
return NULL; return NULL;
} }
hd_balancer.balance(cpu_time); hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full); int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum); ans->inum(inum);
host_start=inum; host_start=inum;
// Build neighbor list on GPU if necessary // Build neighbor list on GPU if necessary
if (ago==0) { if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -271,7 +271,7 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start; return nbor->host_jlist.begin()-host_start;
} }

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -42,7 +42,7 @@ class BaseCharge {
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation * \param k_name name for the kernel for force calculation
* *
* Returns: * Returns:
* - 0 if successful * - 0 if successful
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -83,7 +83,7 @@ class BaseCharge {
* \note host_inum is 0 if the host is performing neighboring * \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles * \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/ * \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum, inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) { const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success); nbor->resize(inum,host_inum,max_nbors,success);
} }
@ -137,12 +137,12 @@ class BaseCharge {
int** compute(const int ago, const int inum_full, const int nall, int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success, int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd); double *charge, double *boxlo, double *prd);
// -------------------------- DEVICE DATA ------------------------- // -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage /// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device; Device<numtyp,acctyp> *device;

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -65,7 +65,7 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_packed); _nbor_data=&(nbor->dev_packed);
} else } else
_nbor_data=&(nbor->dev_nbor); _nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor, int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false, maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom); _threads_per_atom);
@ -155,7 +155,7 @@ template <class numtyp, class acctyp>
inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum, inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, const int nall, double **host_x,
int *host_type, double *sublo, int *host_type, double *sublo,
double *subhi, tagint *tag, double *subhi, tagint *tag,
int **nspecial, tagint **special, int **nspecial, tagint **special,
bool &success) { bool &success) {
success=true; success=true;
@ -194,7 +194,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,
zero_timers(); zero_timers();
return; return;
} }
int ago=hd_balancer.ago_first(f_ago); int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time); int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum); ans->inum(inum);
@ -230,12 +230,12 @@ template <class numtyp, class acctyp>
int** BaseDipoleT::compute(const int ago, const int inum_full, int** BaseDipoleT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag, int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const bool vatom, int &host_start,
int **ilist, int **jnum, int **ilist, int **jnum,
const double cpu_time, bool &success, const double cpu_time, bool &success,
double *host_q, double **host_mu, double *host_q, double **host_mu,
double *boxlo, double *prd) { double *boxlo, double *prd) {
acc_timers(); acc_timers();
if (inum_full==0) { if (inum_full==0) {
@ -245,12 +245,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
zero_timers(); zero_timers();
return NULL; return NULL;
} }
hd_balancer.balance(cpu_time); hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full); int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum); ans->inum(inum);
host_start=inum; host_start=inum;
// Build neighbor list on GPU if necessary // Build neighbor list on GPU if necessary
if (ago==0) { if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -279,7 +279,7 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start; return nbor->host_jlist.begin()-host_start;
} }

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -40,7 +40,7 @@ class BaseDipole {
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation * \param k_name name for the kernel for force calculation
* *
* Returns: * Returns:
* - 0 if successful * - 0 if successful
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -82,7 +82,7 @@ class BaseDipole {
* \note host_inum is 0 if the host is performing neighboring * \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles * \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/ * \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum, inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) { const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success); nbor->resize(inum,host_inum,max_nbors,success);
} }
@ -136,12 +136,12 @@ class BaseDipole {
int** compute(const int ago, const int inum_full, const int nall, int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success, int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double **mu, double *boxlo, double *prd); double *charge, double **mu, double *boxlo, double *prd);
// -------------------------- DEVICE DATA ------------------------- // -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage /// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device; Device<numtyp,acctyp> *device;

View File

@ -64,7 +64,7 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_packed); _nbor_data=&(nbor->dev_packed);
} else } else
_nbor_data=&(nbor->dev_nbor); _nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false, maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom,true); _threads_per_atom,true);
@ -153,7 +153,7 @@ template <class numtyp, class acctyp>
inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, const int nall, double **host_x,
int *host_type, double *sublo, int *host_type, double *sublo,
double *subhi, tagint *tag, double *subhi, tagint *tag,
int **nspecial, tagint **special, int **nspecial, tagint **special,
bool &success) { bool &success) {
success=true; success=true;
@ -182,7 +182,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full,
const bool eflag, const bool vflag, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, int &host_start, const double cpu_time,
bool &success, tagint *tag, double **host_v, bool &success, tagint *tag, double **host_v,
const double dtinvsqrt, const int seed, const int timestep, const double dtinvsqrt, const int seed, const int timestep,
const int nlocal, double *boxlo, double *prd) { const int nlocal, double *boxlo, double *prd) {
acc_timers(); acc_timers();
@ -193,7 +193,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full,
zero_timers(); zero_timers();
return; return;
} }
int ago=hd_balancer.ago_first(f_ago); int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time); int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum); ans->inum(inum);
@ -228,12 +228,12 @@ template <class numtyp, class acctyp>
int** BaseDPDT::compute(const int ago, const int inum_full, int** BaseDPDT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag, int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const bool vatom, int &host_start,
int **ilist, int **jnum, int **ilist, int **jnum,
const double cpu_time, bool &success, const double cpu_time, bool &success,
double **host_v, const double dtinvsqrt, double **host_v, const double dtinvsqrt,
const int seed, const int timestep, const int seed, const int timestep,
double *boxlo, double *prd) { double *boxlo, double *prd) {
acc_timers(); acc_timers();
@ -244,12 +244,12 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
zero_timers(); zero_timers();
return NULL; return NULL;
} }
hd_balancer.balance(cpu_time); hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full); int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum); ans->inum(inum);
host_start=inum; host_start=inum;
// Build neighbor list on GPU if necessary // Build neighbor list on GPU if necessary
if (ago==0) { if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -276,7 +276,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start; return nbor->host_jlist.begin()-host_start;
} }

View File

@ -40,7 +40,7 @@ class BaseDPD {
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation * \param k_name name for the kernel for force calculation
* *
* Returns: * Returns:
* - 0 if successful * - 0 if successful
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -81,7 +81,7 @@ class BaseDPD {
* \note host_inum is 0 if the host is performing neighboring * \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles * \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/ * \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum, inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) { const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success); nbor->resize(inum,host_inum,max_nbors,success);
} }
@ -129,20 +129,20 @@ class BaseDPD {
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, tagint *tag, const double cpu_time, bool &success, tagint *tag,
double **v, const double dtinvsqrt, const int seed, double **v, const double dtinvsqrt, const int seed,
const int timestep, const int nlocal, double *boxlo, double *prd); const int timestep, const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring /// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall, int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success, int **ilist, int **numj, const double cpu_time, bool &success,
double **v, const double dtinvsqrt, const int seed, double **v, const double dtinvsqrt, const int seed,
const int timestep, double *boxlo, double *prd); const int timestep, double *boxlo, double *prd);
// -------------------------- DEVICE DATA ------------------------- // -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage /// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device; Device<numtyp,acctyp> *device;

View File

@ -70,7 +70,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
_gpu_host=1; _gpu_host=1;
_threads_per_atom=device->threads_per_atom(); _threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,true, maxspecial,_gpu_host,max_nbors,cell_size,true,
1); 1);
@ -113,7 +113,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
return -8; return -8;
if (_multiple_forms && gpu_nbor!=0) if (_multiple_forms && gpu_nbor!=0)
return -9; return -9;
if (_multiple_forms) if (_multiple_forms)
ans->force.zero(); ans->force.zero();
@ -142,7 +142,7 @@ void BaseEllipsoidT::clear_base() {
// Output any timing information // Output any timing information
output_times(); output_times();
host_olist.clear(); host_olist.clear();
if (_compiled) { if (_compiled) {
k_nbor_fast.clear(); k_nbor_fast.clear();
k_nbor.clear(); k_nbor.clear();
@ -156,7 +156,7 @@ void BaseEllipsoidT::clear_base() {
delete lj_program; delete lj_program;
_compiled=false; _compiled=false;
} }
time_nbor1.clear(); time_nbor1.clear();
time_ellipsoid.clear(); time_ellipsoid.clear();
time_nbor2.clear(); time_nbor2.clear();
@ -230,7 +230,7 @@ void BaseEllipsoidT::output_times() {
if (times[6]>0) if (times[6]>0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
@ -241,10 +241,10 @@ void BaseEllipsoidT::output_times() {
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Pack neighbors to limit thread divergence for lj-lj and ellipse // Pack neighbors to limit thread divergence for lj-lj and ellipse
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template<class numtyp, class acctyp> template<class numtyp, class acctyp>
void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
const int inum, const int form_low, const int inum, const int form_low,
const int form_high, const bool shared_types, const int form_high, const bool shared_types,
int ntypes) { int ntypes) {
@ -264,18 +264,18 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
// Copy neighbor list from host // Copy neighbor list from host
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseEllipsoidT::reset_nbors(const int nall, const int inum, void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
const int osize, int *ilist, const int osize, int *ilist,
int *numj, int *type, int **firstneigh, int *numj, int *type, int **firstneigh,
bool &success) { bool &success) {
success=true; success=true;
int mn=nbor->max_nbor_loop(osize,numj,ilist); int mn=nbor->max_nbor_loop(osize,numj,ilist);
resize_atom(nall,success); resize_atom(nall,success);
resize_local(inum,0,mn,osize,success); resize_local(inum,0,mn,osize,success);
if (!success) if (!success)
return; return;
if (_multiple_forms) { if (_multiple_forms) {
int p=0; int p=0;
for (int i=0; i<osize; i++) { for (int i=0; i<osize; i++) {
@ -315,7 +315,7 @@ template <class numtyp, class acctyp>
inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, const int nall, double **host_x,
int *host_type, double *sublo, int *host_type, double *sublo,
double *subhi, tagint *tag, double *subhi, tagint *tag,
int **nspecial, tagint **special, int **nspecial, tagint **special,
bool &success) { bool &success) {
success=true; success=true;
@ -354,7 +354,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
zero_timers(); zero_timers();
return NULL; return NULL;
} }
int ago=hd_balancer.ago_first(f_ago); int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time); int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum); ans->inum(inum);
@ -394,7 +394,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum, int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success, const double cpu_time, bool &success,
double **host_quat) { double **host_quat) {
@ -410,7 +410,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
ans->inum(inum); ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse); _last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum; host_start=inum;
// Build neighbor list on GPU if necessary // Build neighbor list on GPU if necessary
if (ago==0) { if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -419,7 +419,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
return NULL; return NULL;
atom->cast_quat_data(host_quat[0]); atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer(); hd_balancer.start_timer();
} else { } else {
atom->cast_x_data(host_x,host_type); atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]); atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer(); hd_balancer.start_timer();
@ -444,9 +444,9 @@ double BaseEllipsoidT::host_memory_usage_base() const {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseEllipsoidT::compile_kernels(UCL_Device &dev, void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
const void *ellipsoid_string, const void *ellipsoid_string,
const void *lj_string, const void *lj_string,
const char *kname, const bool e_s) { const char *kname, const bool e_s) {
if (_compiled) if (_compiled)
return; return;

View File

@ -42,7 +42,7 @@ class BaseEllipsoid {
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* \param ellipsoid_sphere true if ellipsoid-sphere case handled separately * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
* \param k_name name for the kernel for force calculation * \param k_name name for the kernel for force calculation
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -68,7 +68,7 @@ class BaseEllipsoid {
quat_tex.bind_float(atom->quat,4); quat_tex.bind_float(atom->quat,4);
lj_pos_tex.bind_float(atom->x,4); lj_pos_tex.bind_float(atom->x,4);
lj_quat_tex.bind_float(atom->quat,4); lj_quat_tex.bind_float(atom->quat,4);
} }
} }
/// Check if there is enough storage for neighbors and realloc if not /// Check if there is enough storage for neighbors and realloc if not
@ -78,7 +78,7 @@ class BaseEllipsoid {
* \param olist_size size of list of particles from CPU neighboring * \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring * \note host_inum is 0 if the host is performing neighboring
* \note if GPU is neighboring nlocal+host_inum=total number local particles * \note if GPU is neighboring nlocal+host_inum=total number local particles
* \note if CPU is neighboring olist_size=total number of local particles * \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/ * \note if GPU is neighboring olist_size=0 **/
inline void resize_local(const int nlocal, const int host_inum, inline void resize_local(const int nlocal, const int host_inum,
const int max_nbors, const int olist_size, const int max_nbors, const int olist_size,
@ -101,7 +101,7 @@ class BaseEllipsoid {
/// Clear all host and device data /// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/ /** \note This is called at the beginning of the init() routine **/
void clear_base(); void clear_base();
/// Output any timing information /// Output any timing information
void output_times(); void output_times();
@ -130,7 +130,7 @@ class BaseEllipsoid {
ans->acc_timers(); ans->acc_timers();
} }
} }
/// Zero timers /// Zero timers
inline void zero_timers() { inline void zero_timers() {
time_nbor1.zero(); time_nbor1.zero();
@ -148,9 +148,9 @@ class BaseEllipsoid {
ans->zero_timers(); ans->zero_timers();
} }
/// Pack neighbors to limit thread divergence for lj-lj and ellipse /// Pack neighbors to limit thread divergence for lj-lj and ellipse
void pack_nbors(const int GX, const int BX, const int start, const int inum, void pack_nbors(const int GX, const int BX, const int start, const int inum,
const int form_low, const int form_high, const int form_low, const int form_high,
const bool shared_types, int ntypes); const bool shared_types, int ntypes);
/// Copy neighbor list from host /// Copy neighbor list from host
@ -174,17 +174,17 @@ class BaseEllipsoid {
int** compute(const int ago, const int inum_full, const int nall, int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success, int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat); double **host_quat);
/// Build neighbor list on accelerator /// Build neighbor list on accelerator
void build_nbor_list(const int inum, const int host_inum, const int nall, void build_nbor_list(const int inum, const int host_inum, const int nall,
double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, bool &success); double *subhi, bool &success);
// -------------------------- DEVICE DATA ------------------------- // -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage /// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device; Device<numtyp,acctyp> *device;
@ -207,7 +207,7 @@ class BaseEllipsoid {
/// Atom Data /// Atom Data
Atom<numtyp,acctyp> *atom; Atom<numtyp,acctyp> *atom;
// --------------------------- TYPE DATA -------------------------- // --------------------------- TYPE DATA --------------------------
/// cut_form.x = cutsq, cut_form.y = form /// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec<numtyp2> cut_form; UCL_D_Vec<numtyp2> cut_form;
@ -240,7 +240,7 @@ class BaseEllipsoid {
double _gpu_overhead, _driver_overhead; double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data; UCL_D_Vec<int> *_nbor_data;
// True if we want to use fast GB-sphere or sphere-sphere calculations // True if we want to use fast GB-sphere or sphere-sphere calculations
bool _multiple_forms; bool _multiple_forms;
int **_host_form; int **_host_form;
int _last_ellipse, _max_last_ellipse; int _last_ellipse, _max_last_ellipse;

View File

@ -12,7 +12,7 @@
begin : Tue April 2, 2013 begin : Tue April 2, 2013
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
#include "lal_base_three.h" #include "lal_base_three.h"
using namespace LAMMPS_AL; using namespace LAMMPS_AL;
#define BaseThreeT BaseThree<numtyp, acctyp> #define BaseThreeT BaseThree<numtyp, acctyp>
@ -45,7 +45,7 @@ int BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const {
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
b+=ans2->bytes_per_atom(); b+=ans2->bytes_per_atom();
#endif #endif
return b; return b;
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
gpu_nbor=1; gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH) else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2; gpu_nbor=2;
_gpu_nbor=gpu_nbor;
int _gpu_host=0; int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
@ -76,7 +77,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_nbor); _nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size()) if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10; return -10;
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false, maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom); _threads_per_atom);
@ -93,7 +94,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
return -3; return -3;
ans2->cq(_end_command_queue); ans2->cq(_end_command_queue);
#endif #endif
_block_pair=device->pair_block_size(); _block_pair=device->pair_block_size();
_block_size=device->block_ellipse(); _block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end); compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
@ -111,7 +112,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
_max_an_bytes+=ans2->gpu_bytes(); _max_an_bytes+=ans2->gpu_bytes();
#endif #endif
return 0; return 0;
} }
@ -158,7 +159,7 @@ void BaseThreeT::clear_atomic() {
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist, int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
int *ilist, int *numj, int **firstneigh, int *ilist, int *numj, int **firstneigh,
bool &success) { bool &success) {
success=true; success=true;
@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
if (!success) if (!success)
return NULL; return NULL;
nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size()); // originally the requirement that nall == nlist was enforced
// to allow direct indexing neighbors of neighbors after re-arrangement
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
// now the requirement is removed, allowing to work within pair hybrid
nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
@ -176,7 +182,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
#endif #endif
if (bytes>_max_an_bytes) if (bytes>_max_an_bytes)
_max_an_bytes=bytes; _max_an_bytes=bytes;
return ilist; return ilist;
} }
@ -185,11 +191,11 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, const int nall, double **host_x,
int *host_type, double *sublo, int *host_type, double *sublo,
double *subhi, tagint *tag, double *subhi, tagint *tag,
int **nspecial, tagint **special, int **nspecial, tagint **special,
bool &success) { bool &success) {
success=true; success=true;
resize_atom(inum,nall,success); resize_atom(inum,nall,success);
resize_local(nall,host_inum,nbor->max_nbors(),success); resize_local(nall,host_inum,nbor->max_nbors(),success);
@ -214,11 +220,11 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
// Copy nbor list from host if necessary and then calculate forces, virials,.. // Copy nbor list from host if necessary and then calculate forces, virials,..
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall, void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type, const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom, const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const bool vatom, int &host_start,
const double cpu_time, bool &success) { const double cpu_time, bool &success) {
acc_timers(); acc_timers();
if (nlist==0) { if (nlist==0) {
@ -228,9 +234,9 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
zero_timers(); zero_timers();
return; return;
} }
int ago=hd_balancer.ago_first(f_ago); int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,nlocal,cpu_time); int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum); ans->inum(inum);
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
ans2->inum(inum); ans2->inum(inum);
@ -270,7 +276,7 @@ template <class numtyp, class acctyp>
int ** BaseThreeT::compute(const int ago, const int inum_full, int ** BaseThreeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag, int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const bool vatom, int &host_start,
int **ilist, int **jnum, int **ilist, int **jnum,
@ -283,7 +289,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
zero_timers(); zero_timers();
return NULL; return NULL;
} }
hd_balancer.balance(cpu_time); hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full); int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum); ans->inum(inum);
@ -291,7 +297,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
ans2->inum(inum); ans2->inum(inum);
#endif #endif
host_start=inum; host_start=inum;
// Build neighbor list on GPU if necessary // Build neighbor list on GPU if necessary
if (ago==0) { if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -321,7 +327,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
device->add_ans_object(ans2); device->add_ans_object(ans2);
#endif #endif
hd_balancer.stop_timer(); hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start; return nbor->host_jlist.begin()-host_start;
} }
@ -352,7 +358,7 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
k_three_end.cq(ucl_device->cq(_end_command_queue)); k_three_end.cq(ucl_device->cq(_end_command_queue));
k_three_end_vatom.cq(ucl_device->cq(_end_command_queue)); k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
#endif #endif
_compiled=true; _compiled=true;
} }

View File

@ -44,7 +44,7 @@ class BaseThree {
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* \param k_two name for the kernel for 2-body force calculation * \param k_two name for the kernel for 2-body force calculation
* \param k_three name for the kernel for 3-body force calculation * \param k_three name for the kernel for 3-body force calculation
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -53,8 +53,8 @@ class BaseThree {
* - -5 Double precision is not supported on card * - -5 Double precision is not supported on card
* - -10 if invalid thread_per_atom setting **/ * - -10 if invalid thread_per_atom setting **/
int init_three(const int nlocal, const int nall, const int max_nbors, int init_three(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two, const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end); const char *k_three_center, const char *k_three_end);
@ -88,7 +88,7 @@ class BaseThree {
* \note host_inum is 0 if the host is performing neighboring * \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles * \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/ * \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum, inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) { const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success); nbor->resize(inum,host_inum,max_nbors,success);
} }
@ -133,33 +133,33 @@ class BaseThree {
/// Build neighbor list on device /// Build neighbor list on device
int build_nbor_list(const int inum, const int host_inum, int build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, bool &success); tagint **special, bool &success);
/// Pair loop with host neighboring /// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall, void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type, const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag, int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom, const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success); int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring /// Pair loop with device neighboring
int * compute(const int ago, const int inum_full, const int nall, int * compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success); const double cpu_time, bool &success);
/// Pair loop with device neighboring /// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full, int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo, const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success); int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA ------------------------- // -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage /// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device; Device<numtyp,acctyp> *device;
@ -186,7 +186,7 @@ class BaseThree {
Answer<numtyp,acctyp> *ans; Answer<numtyp,acctyp> *ans;
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
Answer<numtyp,acctyp> *ans2; Answer<numtyp,acctyp> *ans2;
#endif #endif
// --------------------------- NBOR DATA ---------------------------- // --------------------------- NBOR DATA ----------------------------
@ -205,15 +205,16 @@ class BaseThree {
protected: protected:
bool _compiled; bool _compiled;
int _block_pair, _block_size, _threads_per_atom, _end_command_queue; int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
double _max_bytes, _max_an_bytes; double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead; double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data; UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k_two, const char *k_three_center, const char *k_two, const char *k_three_center,
const char *k_three_end); const char *k_three_end);
virtual void loop(const bool _eflag, const bool _vflag, virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0; const int evatom) = 0;
}; };

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -33,17 +33,17 @@ BeckT::Beck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BeckT::~Beck() { BeckT::~Beck() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BeckT::bytes_per_atom(const int max_nbors) const { int BeckT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BeckT::init(const int ntypes, int BeckT::init(const int ntypes,
double **host_cutsq, double **host_aa, double **host_cutsq, double **host_aa,
double **host_alpha, double **host_beta, double **host_alpha, double **host_beta,
double **host_AA, double **host_BB, double **host_AA, double **host_BB,
@ -126,7 +126,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : nguyentd@ornl.gov // email : nguyentd@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -24,7 +24,7 @@ texture<int4,1> pos_tex;
#define pos_tex x_ #define pos_tex x_
#endif #endif
__kernel void k_beck(const __global numtyp4 *restrict x_, __kernel void k_beck(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict beck1, const __global numtyp4 *restrict beck1,
const __global numtyp4 *restrict beck2, const __global numtyp4 *restrict beck2,
const int lj_types, const int lj_types,
@ -50,20 +50,20 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w; int itype=ix.w;
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -76,7 +76,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype; int mtype=itype*lj_types+jtype;
if (rsq<beck2[mtype].z) { if (rsq<beck2[mtype].z) {
numtyp r = ucl_sqrt(rsq); numtyp r = ucl_sqrt(rsq);
@ -103,7 +103,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
numtyp term1inv = ucl_recip(term1); numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e; energy+=factor_lj*e;
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
@ -133,7 +133,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
@ -143,7 +143,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
beck1[tid]=beck1_in[tid]; beck1[tid]=beck1_in[tid];
beck2[tid]=beck2_in[tid]; beck2[tid]=beck2_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -152,7 +152,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
@ -166,7 +166,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -179,7 +179,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<beck2[mtype].z) { if (rsq<beck2[mtype].z) {
numtyp r = ucl_sqrt(rsq); numtyp r = ucl_sqrt(rsq);
numtyp r5 = rsq*rsq*r; numtyp r5 = rsq*rsq*r;
@ -205,7 +205,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
numtyp term1inv = ucl_recip(term1); numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e; energy+=factor_lj*e;
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class Beck : public BaseAtomic<numtyp, acctyp> { class Beck : public BaseAtomic<numtyp, acctyp> {
public: public:
Beck(); Beck();
~Beck(); ~Beck();
/// Clear any previous data and set up for a new LAMMPS run /// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -41,8 +41,8 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
double **host_aa, double **host_alpha, double **host_aa, double **host_alpha,
double **host_beta, double **host_AA, double **host_beta, double **host_AA,
double **host_BB, double *host_special_lj, double **host_BB, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen); const double gpu_split, FILE *screen);
/// Clear all host and device data /// Clear all host and device data
@ -67,7 +67,7 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
private: private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -77,7 +77,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
cell_size, gpu_split, screen); cell_size, gpu_split, screen);
BLMF.device->gpu_barrier(); BLMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -102,8 +102,8 @@ int ** beck_gpu_compute_n(const int ago, const int inum_full,
return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success); vatom, host_start, ilist, jnum, cpu_time, success);
} }
void beck_gpu_compute(const int ago, const int inum_full, const int nall, void beck_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -33,10 +33,10 @@ BornT::Born() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BornT::~Born() { BornT::~Born() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BornT::bytes_per_atom(const int max_nbors) const { int BornT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
@ -44,12 +44,12 @@ int BornT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BornT::init(const int ntypes, double **host_cutsq, int BornT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_born1, double **host_born2, double **host_rhoinv, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c, double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_sigma, double **host_d, double **host_sigma,
double **host_offset, double *host_special_lj, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) { const double gpu_split, FILE *_screen) {
int success; int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@ -80,7 +80,7 @@ int BornT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset); host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
@ -102,18 +102,18 @@ void BornT::reinit(const int ntypes, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c, double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_offset) { double **host_d, double **host_offset) {
// Allocate a host write buffer for data initialization // Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY); UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++) for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0; host_write[i]=0.0;
this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
host_born1,host_born2,host_born3); host_born1,host_born2,host_born3);
this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset); host_d,host_offset);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -151,7 +151,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -169,7 +169,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, this->k_pair.run(&this->atom->x, &coeff1, &coeff2,
&cutsq_sigma, &_lj_types, &sp_lj, &cutsq_sigma, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force, &this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum, &this->ans->engv, &eflag, &vflag, &ainum,

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : nguyentd@ornl.gov // email : nguyentd@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -24,16 +24,16 @@ texture<int4,1> pos_tex;
#define pos_tex x_ #define pos_tex x_
#endif #endif
__kernel void k_born(const __global numtyp4 *restrict x_, __kernel void k_born(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff2,
const __global numtyp2 *restrict cutsq_sigma, const __global numtyp2 *restrict cutsq_sigma,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
@ -51,20 +51,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w; int itype=ix.w;
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -77,17 +77,17 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz; numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype; int mtype=itype*lj_types+jtype;
if (r2inv<cutsq_sigma[mtype].x) { if (r2inv<cutsq_sigma[mtype].x) {
numtyp r=ucl_sqrt(r2inv); numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x); numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
r2inv=ucl_recip(r2inv); r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv; numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv); - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
force*=factor_lj; force*=factor_lj;
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
@ -95,7 +95,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
if (eflag>0) { if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv; + coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
@ -113,20 +113,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
} // if ii } // if ii
} }
__kernel void k_born_fast(const __global numtyp4 *restrict x_, __kernel void k_born_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in, const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in, const __global numtyp4 *restrict coeff2_in,
const __global numtyp2 *restrict cutsq_sigma, const __global numtyp2 *restrict cutsq_sigma,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
@ -137,7 +137,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
if (eflag>0) if (eflag>0)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -146,7 +146,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
@ -160,7 +160,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -173,13 +173,13 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz; numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq_sigma[mtype].x) { if (r2inv<cutsq_sigma[mtype].x) {
numtyp r=ucl_sqrt(r2inv); numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x); numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
r2inv=ucl_recip(r2inv); r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv; numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv); - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
force*=factor_lj; force*=factor_lj;
@ -190,7 +190,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
if (eflag>0) { if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv; + coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class Born : public BaseAtomic<numtyp, acctyp> { class Born : public BaseAtomic<numtyp, acctyp> {
public: public:
Born(); Born();
~Born(); ~Born();
/// Clear any previous data and set up for a new LAMMPS run /// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -38,20 +38,20 @@ class Born : public BaseAtomic<numtyp, acctyp> {
* - -4 if the GPU library was not compiled for GPU * - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_born1, double **host_born2, double **host_rhoinv, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c, double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_sigma, double **host_d, double **host_sigma,
double **host_offset, double *host_special_lj, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen); const double gpu_split, FILE *screen);
/// Send updated coeffs from host to device (to be compatible with fix adapt) /// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_rhoinv, void reinit(const int ntypes, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c, double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_offset); double **host_d, double **host_offset);
/// Clear all host and device data /// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/ /** \note This is called at the beginning of the init() routine **/
void clear(); void clear();
@ -77,7 +77,7 @@ class Born : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
private: private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -37,17 +37,17 @@ template <class numtyp, class acctyp>
BornCoulLongT::~BornCoulLongT() { BornCoulLongT::~BornCoulLongT() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BornCoulLongT::bytes_per_atom(const int max_nbors) const { int BornCoulLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3, double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d, double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double **host_sigma, double **host_offset,
double *host_special_lj, const int nlocal, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
@ -84,12 +84,12 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset); host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_cut_ljsq,host_sigma); host_cut_ljsq,host_sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) { for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i]; host_write[i]=host_special_lj[i];
@ -142,7 +142,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -157,15 +157,15 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
&this->ans->force, &this->ans->force,
&this->ans->engv, &eflag, &vflag, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &ainum, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &_qqrd2e, &cutsq_sigma, &_cut_coulsq, &_qqrd2e,
&_g_ewald, &this->_threads_per_atom); &_g_ewald, &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &cutsq_sigma, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom); &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} }

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : nguyentd@ornl.gov // email : nguyentd@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -29,19 +29,19 @@ texture<int2> q_tex;
#define q_tex q_ #define q_tex q_
#endif #endif
__kernel void k_born_long(const __global numtyp4 *restrict x_, __kernel void k_born_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff2,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma, const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) { const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
@ -64,14 +64,14 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
@ -114,129 +114,129 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
numtyp r = ucl_sqrt(rsq); numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj; + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0; } else forceborn = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv; force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (eflag>0) {
if (rsq < cut_coulsq) if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) { if (rsq < coeff1[mtype].w) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv; + coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
virial[3] += delx*dely*force; virial[3] += delx*dely*force;
virial[4] += delx*delz*force; virial[4] += delx*delz*force;
virial[5] += dely*delz*force; virial[5] += dely*delz*force;
} }
} }
} // for nbor } // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii } // if ii
} }
__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in, const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in, const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma, const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) { const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
if (tid<8) if (tid<8)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
if (eflag>0) if (eflag>0)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0; acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w; int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w; int mtype=itype+jx.w;
// Compute r12 // Compute r12
numtyp delx = ix.x-jx.x; numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_sigma[mtype].x) { if (rsq<cutsq_sigma[mtype].x) {
numtyp r2inv=ucl_recip(rsq); numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc; numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0; numtyp rexp = (numtyp)0.0;
if (rsq < cut_coulsq) { if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv); numtyp r=ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r; numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij); numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex); fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r; prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else forcecoul = (numtyp)0.0; } else forcecoul = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) { if (rsq < cutsq_sigma[mtype].y) {
numtyp r = ucl_sqrt(rsq); numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj; + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0; } else forceborn = (numtyp)0.0;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -30,19 +30,19 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successful * - 0 if successful
* - -1 if fix gpu not found * - -1 if fix gpu not found
* - -3 if there is an out of memory error * - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU * - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, double **host_rhoinv, int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3, double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d, double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double *host_special_lj, double **host_sigma, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq, const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul, const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald); const double qqrd2e, const double g_ewald);
@ -59,12 +59,12 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
// --------------------------- TYPE DATA -------------------------- // --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3 /// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1; UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2; UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
/// cutsq_sigma.z = sigma /// cutsq_sigma.z = sigma
UCL_D_Vec<numtyp4> cutsq_sigma; UCL_D_Vec<numtyp4> cutsq_sigma;
/// Special LJ values [0-3] and Special Coul values [4-7] /// Special LJ values [0-3] and Special Coul values [4-7]
@ -73,7 +73,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald; numtyp _cut_coulsq, _qqrd2e, _g_ewald;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -30,9 +30,9 @@ static BornCoulLong<PRECISION,ACC_PRECISION> BORNCLMF;
int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3, double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d, double **host_a, double **host_c, double **host_d,
double **sigma, double **offset, double *special_lj, double **sigma, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors, const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode, const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e, double *host_special_coul, const double qqrd2e,
const double g_ewald) { const double g_ewald) {
@ -58,10 +58,10 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset, host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size, special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq, gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald); host_special_coul, qqrd2e, g_ewald);
BORNCLMF.device->world_barrier(); BORNCLMF.device->world_barrier();
@ -78,14 +78,14 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen); fflush(screen);
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset, host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size, special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq, gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald); host_special_coul, qqrd2e, g_ewald);
BORNCLMF.device->gpu_barrier(); BORNCLMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -102,7 +102,7 @@ void borncl_gpu_clear() {
int** borncl_gpu_compute_n(const int ago, const int inum_full, int** borncl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time, int **ilist, int **jnum, const double cpu_time,
@ -112,8 +112,8 @@ int** borncl_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success, vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd); host_q, boxlo, prd);
} }
void borncl_gpu_compute(const int ago, const int inum_full, const int nall, void borncl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -37,17 +37,17 @@ template <class numtyp, class acctyp>
BornCoulWolfT::~BornCoulWolfT() { BornCoulWolfT::~BornCoulWolfT() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BornCoulWolfT::bytes_per_atom(const int max_nbors) const { int BornCoulWolfT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3, double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d, double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double **host_sigma, double **host_offset,
double *host_special_lj, const int nlocal, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
@ -84,12 +84,12 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset); host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_cut_ljsq,host_sigma); host_cut_ljsq,host_sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) { for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i]; host_write[i]=host_special_lj[i];
@ -144,7 +144,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -157,17 +157,17 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &ainum, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &_qqrd2e, &cutsq_sigma, &_cut_coulsq, &_qqrd2e,
&_alf, &_e_shift, &_f_shift, &_alf, &_e_shift, &_f_shift,
&this->_threads_per_atom); &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &cutsq_sigma, &_cut_coulsq,
&_qqrd2e, &_alf, &_e_shift, &_f_shift, &_qqrd2e, &_alf, &_e_shift, &_f_shift,
&this->_threads_per_atom); &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : nguyentd@ornl.gov // email : nguyentd@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -31,21 +31,21 @@ texture<int2> q_tex;
#define MY_PIS (acctyp)1.77245385090551602729 #define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_born_wolf(const __global numtyp4 *restrict x_, __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff2,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma, const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift, const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) { const numtyp f_shift, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
@ -67,20 +67,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
if (eflag>0) { if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self; e_coul += (acctyp)2.0*e_self;
} }
@ -108,12 +108,12 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp forcecoul, forceborn, force, r6inv, prefactor;
numtyp v_sh = (numtyp)0.0; numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0; numtyp rexp = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
numtyp r = ucl_sqrt(rsq); numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj; + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0; } else forceborn = (numtyp)0.0;
@ -147,7 +147,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv; + coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
@ -165,20 +165,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
} // if ii } // if ii
} }
__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in, const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in, const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma, const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift, const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) { const numtyp f_shift, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
@ -193,7 +193,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
if (eflag>0) if (eflag>0)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0; acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
@ -201,23 +201,23 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w; int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
if (eflag>0) { if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self; e_coul += (acctyp)2.0*e_self;
} }
@ -244,12 +244,12 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp forcecoul, forceborn, force, r6inv, prefactor;
numtyp v_sh = (numtyp)0.0; numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0; numtyp rexp = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) { if (rsq < cutsq_sigma[mtype].y) {
numtyp r = ucl_sqrt(rsq); numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj; + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0; } else forceborn = (numtyp)0.0;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -30,19 +30,19 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successful * - 0 if successful
* - -1 if fix gpu not found * - -1 if fix gpu not found
* - -3 if there is an out of memory error * - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU * - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, double **host_rhoinv, int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3, double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d, double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double *host_special_lj, double **host_sigma, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq, const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul, const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double alf, const double e_shift, const double qqrd2e, const double alf, const double e_shift,
@ -60,12 +60,12 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
// --------------------------- TYPE DATA -------------------------- // --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3 /// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1; UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2; UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
/// cutsq_sigma.z = sigma /// cutsq_sigma.z = sigma
UCL_D_Vec<numtyp4> cutsq_sigma; UCL_D_Vec<numtyp4> cutsq_sigma;
/// Special LJ values [0-3] and Special Coul values [4-7] /// Special LJ values [0-3] and Special Coul values [4-7]
@ -74,7 +74,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift; numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -28,7 +28,7 @@ static BornCoulWolf<PRECISION,ACC_PRECISION> BORNCWMF;
// Allocate memory on host and device and copy constants to device // Allocate memory on host and device and copy constants to device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3, double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d, double **host_a, double **host_c, double **host_d,
double **sigma, double **offset, double *special_lj, const int inum, double **sigma, double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial, const int nall, const int max_nbors, const int maxspecial,
@ -60,9 +60,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0) if (world_me==0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift); alf, e_shift, f_shift);
BORNCWMF.device->world_barrier(); BORNCWMF.device->world_barrier();
@ -79,15 +79,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen); fflush(screen);
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift); alf, e_shift, f_shift);
BORNCWMF.device->gpu_barrier(); BORNCWMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -104,7 +104,7 @@ void borncw_gpu_clear() {
int** borncw_gpu_compute_n(const int ago, const int inum_full, int** borncw_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time, int **ilist, int **jnum, const double cpu_time,
@ -114,8 +114,8 @@ int** borncw_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success, vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd); host_q, boxlo, prd);
} }
void borncw_gpu_compute(const int ago, const int inum_full, const int nall, void borncw_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -28,9 +28,9 @@ static Born<PRECISION,ACC_PRECISION> BORNMF;
// Allocate memory on host and device and copy constants to device // Allocate memory on host and device and copy constants to device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c, double **host_born3, double **host_a, double **host_c,
double **host_d, double **sigma, double **host_d, double **sigma,
double **offset, double *special_lj, const int inum, double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial, const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) { const double cell_size, int &gpu_mode, FILE *screen) {
@ -56,7 +56,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
@ -75,13 +75,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen); fflush(screen);
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
BORNMF.device->gpu_barrier(); BORNMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -102,24 +102,24 @@ void born_gpu_reinit(const int ntypes, double **host_rhoinv,
int world_me=BORNMF.device->world_me(); int world_me=BORNMF.device->world_me();
int gpu_rank=BORNMF.device->gpu_rank(); int gpu_rank=BORNMF.device->gpu_rank();
int procs_per_gpu=BORNMF.device->procs_per_gpu(); int procs_per_gpu=BORNMF.device->procs_per_gpu();
if (world_me==0) if (world_me==0)
BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2, BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, offset); host_born3, host_a, host_c, host_d, offset);
BORNMF.device->world_barrier(); BORNMF.device->world_barrier();
for (int i=0; i<procs_per_gpu; i++) { for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2, BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, offset); host_born3, host_a, host_c, host_d, offset);
BORNMF.device->gpu_barrier(); BORNMF.device->gpu_barrier();
} }
} }
void born_gpu_clear() { void born_gpu_clear() {
BORNMF.clear(); BORNMF.clear();
} }
int ** born_gpu_compute_n(const int ago, const int inum_full, int ** born_gpu_compute_n(const int ago, const int inum_full,
@ -132,8 +132,8 @@ int ** born_gpu_compute_n(const int ago, const int inum_full,
return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo, return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success); vatom, host_start, ilist, jnum, cpu_time, success);
} }
void born_gpu_compute(const int ago, const int inum_full, const int nall, void born_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -33,10 +33,10 @@ BuckT::Buck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BuckT::~Buck() { BuckT::~Buck() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BuckT::bytes_per_atom(const int max_nbors) const { int BuckT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
@ -44,11 +44,11 @@ int BuckT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BuckT::init(const int ntypes, double **host_cutsq, int BuckT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_a, double **host_c,
double **host_offset, double *host_special_lj, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) { const double gpu_split, FILE *_screen) {
int success; int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@ -79,7 +79,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_offset); host_offset);
UCL_H_Vec<double> dview; UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
@ -95,14 +95,14 @@ template <class numtyp, class acctyp>
void BuckT::reinit(const int ntypes, double **host_cutsq, void BuckT::reinit(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_offset) { double **host_a, double **host_c, double **host_offset) {
// Allocate a host write buffer for data initialization // Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY); UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++) for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0; host_write[i]=0.0;
this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
host_buck1,host_buck2,host_cutsq); host_buck1,host_buck2,host_cutsq);
this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
@ -143,7 +143,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -154,13 +154,13 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom); &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom); &ainum, &nbor_pitch, &this->_threads_per_atom);
} }

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : nguyentd@ornl.gov // email : nguyentd@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -24,15 +24,15 @@ texture<int4,1> pos_tex;
#define pos_tex x_ #define pos_tex x_
#endif #endif
__kernel void k_buck(const __global numtyp4 *restrict x_, __kernel void k_buck(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff2,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
@ -50,20 +50,20 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w; int itype=ix.w;
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -76,24 +76,24 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz; numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype; int mtype=itype*lj_types+jtype;
if (r2inv<coeff1[mtype].w) { if (r2inv<coeff1[mtype].w) {
numtyp r=ucl_sqrt(r2inv); numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp(-r*coeff1[mtype].x); numtyp rexp = ucl_exp(-r*coeff1[mtype].x);
r2inv=ucl_recip(r2inv); r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv; numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv); - coeff1[mtype].z*r6inv);
force*=factor_lj; force*=factor_lj;
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
@ -111,19 +111,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
} // if ii } // if ii
} }
__kernel void k_buck_fast(const __global numtyp4 *restrict x_, __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in, const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in, const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
@ -134,7 +134,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
if (eflag>0) if (eflag>0)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -143,7 +143,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
@ -157,7 +157,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -170,13 +170,13 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz; numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<coeff1[mtype].w) { if (r2inv<coeff1[mtype].w) {
numtyp r=ucl_sqrt(r2inv); numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp(-r*coeff1[mtype].x); numtyp rexp = ucl_exp(-r*coeff1[mtype].x);
r2inv=ucl_recip(r2inv); r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv; numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv); - coeff1[mtype].z*r6inv);
force*=factor_lj; force*=factor_lj;
@ -186,7 +186,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
if (eflag>0) { if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class Buck : public BaseAtomic<numtyp, acctyp> { class Buck : public BaseAtomic<numtyp, acctyp> {
public: public:
Buck(); Buck();
~Buck(); ~Buck();
/// Clear any previous data and set up for a new LAMMPS run /// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -38,18 +38,18 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
* - -4 if the GPU library was not compiled for GPU * - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_a, double **host_c,
double **host_offset, double *host_special_lj, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen); const double gpu_split, FILE *screen);
/// Send updated coeffs from host to device (to be compatible with fix adapt) /// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_cutsq, void reinit(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_offset); double **host_a, double **host_c, double **host_offset);
/// Clear all host and device data /// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/ /** \note This is called at the beginning of the init() routine **/
void clear(); void clear();
@ -72,7 +72,7 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
private: private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -33,10 +33,10 @@ BuckCoulT::BuckCoul() : BaseCharge<numtyp,acctyp>(), _allocated(false) {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BuckCoulT::~BuckCoul() { BuckCoulT::~BuckCoul() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BuckCoulT::bytes_per_atom(const int max_nbors) const { int BuckCoulT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
@ -44,11 +44,11 @@ int BuckCoulT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BuckCoulT::init(const int ntypes, double **host_cutsq, int BuckCoulT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_a, double **host_c,
double **host_offset, double *host_special_lj, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen, double **host_cut_ljsq, const double gpu_split, FILE *_screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul, double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e) { const double qqrd2e) {
@ -81,21 +81,21 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_offset); host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq, this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq,
host_cut_ljsq, host_cut_coulsq); host_cut_ljsq, host_cut_coulsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) { for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i]; host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i]; host_write[i+4]=host_special_coul[i];
} }
ucl_copy(sp_lj,host_write,8,false); ucl_copy(sp_lj,host_write,8,false);
_qqrd2e = qqrd2e; _qqrd2e = qqrd2e;
_allocated=true; _allocated=true;
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes(); this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes();
return 0; return 0;
@ -135,7 +135,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -147,12 +147,12 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom); &cutsq, &_qqrd2e, &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom); &cutsq, &_qqrd2e, &this->_threads_per_atom);

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : nguyentd@ornl.gov // email : nguyentd@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -29,19 +29,19 @@ texture<int2> q_tex;
#define q_tex q_ #define q_tex q_
#endif #endif
__kernel void k_buck_coul(const __global numtyp4 *restrict x_, __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff2,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_ , const __global numtyp *restrict q_ ,
const __global numtyp4 *restrict cutsq, const __global numtyp4 *restrict cutsq,
const numtyp qqrd2e, const int t_per_atom) { const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
@ -63,21 +63,21 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4]; factor_coul = sp_lj[sbmask(j)+4];
@ -91,30 +91,30 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype; int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype].x) { if (rsq<cutsq[mtype].x) {
numtyp r2inv=ucl_recip(rsq); numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forcebuck, force, r6inv; numtyp forcecoul, forcebuck, force, r6inv;
numtyp rexp = (numtyp)0.0; numtyp rexp = (numtyp)0.0;
if (rsq < cutsq[mtype].y) { // buckingham if (rsq < cutsq[mtype].y) { // buckingham
numtyp r=ucl_sqrt(rsq); numtyp r=ucl_sqrt(rsq);
rexp = ucl_exp(-r*coeff1[mtype].x); rexp = ucl_exp(-r*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
forcebuck = (coeff1[mtype].y*r*rexp forcebuck = (coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv)*factor_lj; - coeff1[mtype].z*r6inv)*factor_lj;
} else } else
forcebuck = (numtyp)0.0; forcebuck = (numtyp)0.0;
if (rsq < coeff2[mtype].z) { if (rsq < coeff2[mtype].z) {
fetch(forcecoul,j,q_tex); fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul; forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else } else
forcecoul = (numtyp)0.0; forcecoul = (numtyp)0.0;
force = (forcebuck + forcecoul) * r2inv; force = (forcebuck + forcecoul) * r2inv;
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
@ -142,22 +142,22 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
} // if ii } // if ii
} }
__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in, const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in, const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global numtyp4 *restrict _cutsq, const __global numtyp4 *restrict _cutsq,
const numtyp qqrd2e, const int t_per_atom) { const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@ -170,7 +170,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
if (eflag>0) if (eflag>0)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0; acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
@ -180,7 +180,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
@ -195,7 +195,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4]; factor_coul = sp_lj[sbmask(j)+4];
@ -209,27 +209,27 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype].x) { if (rsq<cutsq[mtype].x) {
numtyp r2inv=ucl_recip(rsq); numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forcebuck, force, r6inv; numtyp forcecoul, forcebuck, force, r6inv;
numtyp rexp = (numtyp)0.0; numtyp rexp = (numtyp)0.0;
if (rsq < cutsq[mtype].y) { // buckingham if (rsq < cutsq[mtype].y) { // buckingham
numtyp r=ucl_sqrt(rsq); numtyp r=ucl_sqrt(rsq);
rexp = ucl_exp(-r*coeff1[mtype].x); rexp = ucl_exp(-r*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
forcebuck = (coeff1[mtype].y*r*rexp forcebuck = (coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv)*factor_lj; - coeff1[mtype].z*r6inv)*factor_lj;
} else } else
forcebuck = (numtyp)0.0; forcebuck = (numtyp)0.0;
if (rsq < cutsq[mtype].z) { if (rsq < cutsq[mtype].z) {
fetch(forcecoul,j,q_tex); fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul; forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else } else
forcecoul = (numtyp)0.0; forcecoul = (numtyp)0.0;
force = (forcebuck + forcecoul) * r2inv; force = (forcebuck + forcecoul) * r2inv;
f.x+=delx*force; f.x+=delx*force;
@ -241,7 +241,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
if (rsq < cutsq[mtype].y) { if (rsq < cutsq[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class BuckCoul : public BaseCharge<numtyp, acctyp> { class BuckCoul : public BaseCharge<numtyp, acctyp> {
public: public:
BuckCoul(); BuckCoul();
~BuckCoul(); ~BuckCoul();
/// Clear any previous data and set up for a new LAMMPS run /// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -38,11 +38,11 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
* - -4 if the GPU library was not compiled for GPU * - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_a, double **host_c,
double **host_offset, double *host_special_lj, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq, const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul, double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e); const double qqrd2e);
@ -71,11 +71,11 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
numtyp _qqrd2e; numtyp _qqrd2e;
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); void loop(const bool _eflag, const bool _vflag);

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -28,8 +28,8 @@ static BuckCoul<PRECISION,ACC_PRECISION> BUCKCMF;
// Allocate memory on host and device and copy constants to device // Allocate memory on host and device and copy constants to device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_buck1, double **host_buck2, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_a, double **host_c,
double **offset, double *special_lj, const int inum, double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial, const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen, const double cell_size, int &gpu_mode, FILE *screen,
@ -57,9 +57,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e); host_special_coul, qqrd2e);
@ -77,14 +77,14 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen); fflush(screen);
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e); host_special_coul, qqrd2e);
BUCKCMF.device->gpu_barrier(); BUCKCMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -96,12 +96,12 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
} }
void buckc_gpu_clear() { void buckc_gpu_clear() {
BUCKCMF.clear(); BUCKCMF.clear();
} }
int ** buckc_gpu_compute_n(const int ago, const int inum_full, int ** buckc_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time, int **ilist, int **jnum, const double cpu_time,
@ -111,8 +111,8 @@ int ** buckc_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success, vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd); host_q, boxlo, prd);
} }
void buckc_gpu_compute(const int ago, const int inum_full, const int nall, void buckc_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
BuckCoulLongT::~BuckCoulLongT() { BuckCoulLongT::~BuckCoulLongT() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BuckCoulLongT::bytes_per_atom(const int max_nbors) const { int BuckCoulLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
@ -45,8 +45,8 @@ int BuckCoulLongT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BuckCoulLongT::init(const int ntypes, double **host_cutsq, int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_offset, double **host_a, double **host_c, double **host_offset,
double *host_special_lj, const int nlocal, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
@ -83,11 +83,11 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_offset); host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) { for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i]; host_write[i]=host_special_lj[i];
@ -139,7 +139,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -150,16 +150,16 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_cut_coulsq, &_qqrd2e, &cutsq, &_cut_coulsq, &_qqrd2e,
&_g_ewald, &this->_threads_per_atom); &_g_ewald, &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq, &ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : nguyentd@ornl.gov // email : nguyentd@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -29,19 +29,19 @@ texture<int2> q_tex;
#define q_tex q_ #define q_tex q_
#endif #endif
__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff2,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global numtyp *restrict cutsq, const __global numtyp *restrict cutsq,
const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) { const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
@ -64,14 +64,14 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
@ -98,136 +98,136 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
numtyp r2inv=ucl_recip(rsq); numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc; numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0; numtyp rexp = (numtyp)0.0;
if (rsq < coeff1[mtype].w) { // cut_ljsq if (rsq < coeff1[mtype].w) { // cut_ljsq
numtyp r=ucl_sqrt(rsq); numtyp r=ucl_sqrt(rsq);
rexp = ucl_exp(-r*coeff1[mtype].x); rexp = ucl_exp(-r*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
force_lj = (coeff1[mtype].y*r*rexp force_lj = (coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv)*factor_lj; - coeff1[mtype].z*r6inv)*factor_lj;
} else } else
force_lj = (numtyp)0.0; force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) { if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv); numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r; numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij); numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex); fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r; prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else } else
forcecoul = (numtyp)0.0; forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv; force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (eflag>0) {
if (rsq < cut_coulsq) if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) { if (rsq < coeff1[mtype].w) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
virial[3] += delx*dely*force; virial[3] += delx*dely*force;
virial[4] += delx*delz*force; virial[4] += delx*delz*force;
virial[5] += dely*delz*force; virial[5] += dely*delz*force;
} }
} }
} // for nbor } // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii } // if ii
} }
__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in, const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in, const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global numtyp *restrict cutsq, const __global numtyp *restrict cutsq,
const numtyp cut_coulsq, const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald, const numtyp qqrd2e, const numtyp g_ewald,
const int t_per_atom) { const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
if (tid<8) if (tid<8)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
if (eflag>0) if (eflag>0)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0; acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w; int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
numtyp factor_lj, factor_coul; numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w; int mtype=itype+jx.w;
// Compute r12 // Compute r12
numtyp delx = ix.x-jx.x; numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) { if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq); numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc; numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0; numtyp rexp = (numtyp)0.0;
if (rsq < coeff1[mtype].w) { if (rsq < coeff1[mtype].w) {
numtyp r=ucl_sqrt(rsq); numtyp r=ucl_sqrt(rsq);
rexp = ucl_exp(-r*coeff1[mtype].x); rexp = ucl_exp(-r*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
force_lj = (coeff1[mtype].y*r*rexp force_lj = (coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv)*factor_lj; - coeff1[mtype].z*r6inv)*factor_lj;
} else } else
force_lj = (numtyp)0.0; force_lj = (numtyp)0.0;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -30,7 +30,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -38,11 +38,11 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
* - -4 if the GPU library was not compiled for GPU * - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_a, double **host_c,
double **host_offset, double *host_special_lj, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq, const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul, const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald); const double qqrd2e, const double g_ewald);
@ -71,7 +71,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald; numtyp _cut_coulsq, _qqrd2e, _g_ewald;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -28,7 +28,7 @@ static BuckCoulLong<PRECISION,ACC_PRECISION> BUCKCLMF;
// Allocate memory on host and device and copy constants to device // Allocate memory on host and device and copy constants to device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_buck1, double **host_buck2, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_a, double **host_c,
double **offset, double *special_lj, const int inum, double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial, const int nall, const int max_nbors, const int maxspecial,
@ -58,8 +58,8 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
@ -77,13 +77,13 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen); fflush(screen);
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
BUCKCLMF.device->gpu_barrier(); BUCKCLMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -100,7 +100,7 @@ void buckcl_gpu_clear() {
int** buckcl_gpu_compute_n(const int ago, const int inum_full, int** buckcl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time, int **ilist, int **jnum, const double cpu_time,
@ -110,8 +110,8 @@ int** buckcl_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success, vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd); host_q, boxlo, prd);
} }
void buckcl_gpu_compute(const int ago, const int inum_full, const int nall, void buckcl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -28,8 +28,8 @@ static Buck<PRECISION,ACC_PRECISION> BUCKMF;
// Allocate memory on host and device and copy constants to device // Allocate memory on host and device and copy constants to device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_buck1, double **host_buck2, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_a, double **host_c,
double **offset, double *special_lj, const int inum, double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial, const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) { const double cell_size, int &gpu_mode, FILE *screen) {
@ -55,7 +55,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
@ -73,12 +73,12 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen); fflush(screen);
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
BUCKMF.device->gpu_barrier(); BUCKMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -98,24 +98,24 @@ void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv,
int world_me=BUCKMF.device->world_me(); int world_me=BUCKMF.device->world_me();
int gpu_rank=BUCKMF.device->gpu_rank(); int gpu_rank=BUCKMF.device->gpu_rank();
int procs_per_gpu=BUCKMF.device->procs_per_gpu(); int procs_per_gpu=BUCKMF.device->procs_per_gpu();
if (world_me==0) if (world_me==0)
BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset); host_a, host_c, offset);
BUCKMF.device->world_barrier(); BUCKMF.device->world_barrier();
for (int i=0; i<procs_per_gpu; i++) { for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset); host_a, host_c, offset);
BUCKMF.device->gpu_barrier(); BUCKMF.device->gpu_barrier();
} }
} }
void buck_gpu_clear() { void buck_gpu_clear() {
BUCKMF.clear(); BUCKMF.clear();
} }
int ** buck_gpu_compute_n(const int ago, const int inum_full, int ** buck_gpu_compute_n(const int ago, const int inum_full,
@ -128,8 +128,8 @@ int ** buck_gpu_compute_n(const int ago, const int inum_full,
return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success); vatom, host_start, ilist, jnum, cpu_time, success);
} }
void buck_gpu_compute(const int ago, const int inum_full, const int nall, void buck_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -33,23 +33,23 @@ CGCMMT::CGCMM() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
CGCMMT::~CGCMM() { CGCMMT::~CGCMM() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CGCMMT::bytes_per_atom(const int max_nbors) const { int CGCMMT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CGCMMT::init(const int ntypes, double **host_cutsq, int CGCMMT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1, int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) { const double gpu_split, FILE *_screen) {
int success; int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@ -75,12 +75,12 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
host_write[i]=0.0; host_write[i]=0.0;
lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
host_cg_type,host_lj1,host_lj2); host_cg_type,host_lj1,host_lj2);
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4, this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
host_offset); host_offset);
UCL_H_Vec<double> dview; UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
@ -126,7 +126,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -138,7 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom); &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : brownw@ornl.gov // email : brownw@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -24,15 +24,15 @@ texture<int4,1> pos_tex;
#define pos_tex x_ #define pos_tex x_
#endif #endif
__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3, const __global numtyp4 *restrict lj3,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
@ -50,20 +50,20 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w; int itype=ix.w;
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -76,12 +76,12 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz; numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype; int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].x) { if (r2inv<lj1[mtype].x) {
r2inv=ucl_recip(r2inv); r2inv=ucl_recip(r2inv);
numtyp inv1,inv2; numtyp inv1,inv2;
if (lj1[mtype].y == 2) { if (lj1[mtype].y == 2) {
inv1=r2inv*r2inv; inv1=r2inv*r2inv;
inv2=inv1*inv1; inv2=inv1*inv1;
@ -93,7 +93,7 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
inv2=inv1; inv2=inv1;
} }
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w); numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
@ -116,9 +116,9 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
} // if ii } // if ii
} }
__kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_, __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in, const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
@ -139,30 +139,30 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
if (eflag>0) if (eflag>0)
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w; int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -175,11 +175,11 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz; numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].x) { if (r2inv<lj1[mtype].x) {
r2inv=ucl_recip(r2inv); r2inv=ucl_recip(r2inv);
numtyp inv1,inv2; numtyp inv1,inv2;
if (lj1[mtype].y == (numtyp)2) { if (lj1[mtype].y == (numtyp)2) {
inv1=r2inv*r2inv; inv1=r2inv*r2inv;
inv2=inv1*inv1; inv2=inv1*inv1;
@ -191,7 +191,7 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
inv2=inv1; inv2=inv1;
} }
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w); numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class CGCMM : public BaseAtomic<numtyp, acctyp> { class CGCMM : public BaseAtomic<numtyp, acctyp> {
public: public:
CGCMM(); CGCMM();
~CGCMM(); ~CGCMM();
/// Clear any previous data and set up for a new LAMMPS run /// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -40,7 +40,7 @@ class CGCMM : public BaseAtomic<numtyp, acctyp> {
int init(const int ntypes, double **host_cutsq, int **host_cg_type, int init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj, double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen); const double gpu_split, FILE *screen);
@ -66,7 +66,7 @@ class CGCMM : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _cmm_types; int _cmm_types;
private: private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -28,9 +28,9 @@ static CGCMM<PRECISION,ACC_PRECISION> CMMMF;
// Allocate memory on host and device and copy constants to device // Allocate memory on host and device and copy constants to device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj, double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors, const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode, const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) { FILE *screen) {
CMMMF.clear(); CMMMF.clear();
@ -55,7 +55,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300, host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
@ -78,7 +78,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
CMMMF.device->gpu_barrier(); CMMMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -103,8 +103,8 @@ int** cmm_gpu_compute_n(const int ago, const int inum_full,
return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success); vatom, host_start, ilist, jnum, cpu_time, success);
} }
void cmm_gpu_compute(const int ago, const int inum_full, const int nall, void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -37,22 +37,22 @@ template <class numtyp, class acctyp>
CGCMMLongT::~CGCMMLong() { CGCMMLongT::~CGCMMLong() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CGCMMLongT::bytes_per_atom(const int max_nbors) const { int CGCMMLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CGCMMLongT::init(const int ntypes, double **host_cutsq, int CGCMMLongT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1, int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen, const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_ljsq,
const double host_cut_coulsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e, double *host_special_coul, const double qqrd2e,
const double g_ewald) { const double g_ewald) {
@ -137,7 +137,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -149,13 +149,13 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom); &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom); &_qqrd2e, &_g_ewald, &this->_threads_per_atom);

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : brownw@ornl.gov // email : brownw@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -29,12 +29,12 @@ texture<int2> q_tex;
#define q_tex q_ #define q_tex q_
#endif #endif
__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3, const __global numtyp4 *restrict lj3,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
@ -70,7 +70,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
@ -136,7 +136,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
if (rsq < lj1[mtype].y) { if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w; lj3[mtype].w;
} }
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
@ -154,17 +154,17 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
} // if ii } // if ii
} }
__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in, const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) { const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
@ -179,7 +179,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
lj1[tid]=lj1_in[tid]; lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0; acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
@ -187,16 +187,16 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w; int iw=ix.w;
@ -262,7 +262,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
if (rsq < lj1[mtype].y) { if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w; lj3[mtype].w;
} }
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -30,7 +30,7 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -40,8 +40,8 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
int init(const int ntypes, double **host_cutsq, int ** cg_type, int init(const int ntypes, double **host_cutsq, int ** cg_type,
double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj, double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq, const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul, const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald); const double qqrd2e, const double g_ewald);
@ -58,7 +58,7 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
// --------------------------- TYPE DATA -------------------------- // --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2,
UCL_D_Vec<numtyp4> lj1; UCL_D_Vec<numtyp4> lj1;
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset /// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
UCL_D_Vec<numtyp4> lj3; UCL_D_Vec<numtyp4> lj3;
@ -68,7 +68,7 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald; numtyp _cut_coulsq, _qqrd2e, _g_ewald;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -28,9 +28,9 @@ static CGCMMLong<PRECISION,ACC_PRECISION> CMMLMF;
// Allocate memory on host and device and copy constants to device // Allocate memory on host and device and copy constants to device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj, double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors, const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode, const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e, double *host_special_coul, const double qqrd2e,
@ -58,7 +58,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300, host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
@ -82,7 +82,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
host_cut_ljsq, host_cut_coulsq, host_special_coul, host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald); qqrd2e, g_ewald);
CMMLMF.device->gpu_barrier(); CMMLMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -99,7 +99,7 @@ void cmml_gpu_clear() {
int** cmml_gpu_compute_n(const int ago, const int inum_full, int** cmml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time, int **ilist, int **jnum, const double cpu_time,
@ -109,8 +109,8 @@ int** cmml_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success, vatom, host_start, ilist, jnum, cpu_time, success,
host_q,boxlo,prd); host_q,boxlo,prd);
} }
void cmml_gpu_compute(const int ago, const int inum_full, const int nall, void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
CHARMMLongT::~CHARMMLong() { CHARMMLongT::~CHARMMLong() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CHARMMLongT::bytes_per_atom(const int max_nbors) const { int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
@ -45,9 +45,9 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CHARMMLongT::init(const int ntypes, int CHARMMLongT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1, double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
@ -144,7 +144,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -153,17 +153,17 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom); &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : brownw@ornl.gov // email : brownw@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -31,14 +31,14 @@ texture<int2> q_tex;
__kernel void k_charmm_long(const __global numtyp4 *restrict x_, __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj1,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj, const __global numtyp *restrict sp_lj,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj, const numtyp g_ewald, const numtyp denom_lj,
@ -61,7 +61,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
@ -93,7 +93,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
if (rsq > cut_lj_innersq) { if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq); switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj; denom_lj;
switch1 *= switch1; switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
@ -130,7 +130,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
if (rsq > cut_lj_innersq) if (rsq > cut_lj_innersq)
e *= switch1; e *= switch1;
energy+=factor_lj*e; energy+=factor_lj*e;
} }
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
@ -148,19 +148,19 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
} // if ii } // if ii
} }
__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict ljd_in, const __global numtyp2 *restrict ljd_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj, const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq, const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq, const numtyp cut_lj_innersq,
const int t_per_atom) { const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
@ -174,7 +174,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
ljd[tid]=ljd_in[tid]; ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES) if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR]; ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0; acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
@ -182,16 +182,16 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
@ -229,7 +229,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4); force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
if (rsq > cut_lj_innersq) { if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq); switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj; denom_lj;
switch1 *= switch1; switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -30,7 +30,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successfull * - 0 if successfull
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -40,12 +40,12 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
int init(const int ntypes, double host_cut_bothsq, int init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj, double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq, const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul, const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald, const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj, const double cut_lj_innersq, const double denom_lj,
double **epsilon, double **sigma, const bool mix_arithmetic); double **epsilon, double **sigma, const bool mix_arithmetic);
/// Clear all host and device data /// Clear all host and device data
@ -70,7 +70,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
numtyp _qqrd2e, _g_ewald, _denom_lj; numtyp _qqrd2e, _g_ewald, _denom_lj;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : brownw@ornl.gov email : brownw@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -87,7 +87,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
sigma, mix_arithmetic); sigma, mix_arithmetic);
CRMLMF.device->gpu_barrier(); CRMLMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -104,7 +104,7 @@ void crml_gpu_clear() {
int** crml_gpu_compute_n(const int ago, const int inum_full, int** crml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag, tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time, int **ilist, int **jnum, const double cpu_time,
@ -114,14 +114,14 @@ int** crml_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success, vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd); host_q, boxlo, prd);
} }
void crml_gpu_compute(const int ago, const int inum_full, void crml_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom, const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time, const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal, bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd) { double *boxlo, double *prd) {
CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -33,23 +33,23 @@ ColloidT::Colloid() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
ColloidT::~Colloid() { ColloidT::~Colloid() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int ColloidT::bytes_per_atom(const int max_nbors) const { int ColloidT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int ColloidT::init(const int ntypes, int ColloidT::init(const int ntypes,
double **host_cutsq, double **host_lj1, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double **host_lj4, double **host_offset,
double *host_special_lj, double **host_a12, double *host_special_lj, double **host_a12,
double **host_a1, double **host_a2, double **host_a1, double **host_a2,
double **host_d1, double **host_d2, double **host_d1, double **host_d2,
double **host_sigma3, double **host_sigma6, double **host_sigma3, double **host_sigma6,
int **host_form, const int nlocal, int **host_form, const int nlocal,
const int nall, const int max_nbors, const int nall, const int max_nbors,
@ -97,7 +97,7 @@ int ColloidT::init(const int ntypes,
UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device), UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_ONLY); UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0; for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) for (int i=0; i<ntypes; i++)
for (int j=0; j<ntypes; j++) { for (int j=0; j<ntypes; j++) {
@ -153,7 +153,7 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -170,9 +170,9 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&colloid1, &colloid2, &form, &colloid1, &colloid2, &form,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom); &ainum, &nbor_pitch, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : nguyentd@ornl.gov // email : nguyentd@ornl.gov
// ***************************************************************************/ // ***************************************************************************/
@ -24,18 +24,18 @@ texture<int4,1> pos_tex;
#define pos_tex x_ #define pos_tex x_
#endif #endif
__kernel void k_colloid(const __global numtyp4 *restrict x_, __kernel void k_colloid(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1, const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3, const __global numtyp4 *restrict lj3,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global numtyp4 *restrict colloid1, const __global numtyp4 *restrict colloid1,
const __global numtyp4 *restrict colloid2, const __global numtyp4 *restrict colloid2,
const __global int *form, const __global int *form,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
@ -53,20 +53,20 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w; int itype=ix.w;
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -79,21 +79,21 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype; int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) { if (rsq<lj1[mtype].z) {
numtyp r,r2inv,r6inv; numtyp r,r2inv,r6inv;
numtyp c1,c2,fR,evdwl; numtyp c1,c2,fR,evdwl;
numtyp K[9],h[4],g[4]; numtyp K[9],h[4],g[4];
numtyp force = (numtyp)0; numtyp force = (numtyp)0;
if (form[mtype]==0) { // SMALL_SMALL if (form[mtype]==0) { // SMALL_SMALL
r2inv=ucl_recip(rsq); r2inv=ucl_recip(rsq);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj; force*=factor_lj;
} else if (form[mtype]==1) { // SMALL_LARGE } else if (form[mtype]==1) { // SMALL_LARGE
c2 = colloid1[mtype].z; c2 = colloid1[mtype].z;
K[1] = c2*c2; K[1] = c2*c2;
K[2] = rsq; K[2] = rsq;
K[0] = K[1] - rsq; K[0] = K[1] - rsq;
@ -102,15 +102,15 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
K[3] *= K[3]*K[3]; K[3] *= K[3]*K[3];
K[6] = K[3]*K[3]; K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3]; fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = (numtyp)4.0/(numtyp)15.0*fR * force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) * ((numtyp)2.0*(K[1]+K[2]) *
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0]; colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj; force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE } else if (form[mtype]==2) { // LARGE_LARGE
r = ucl_sqrt(rsq); r = ucl_sqrt(rsq);
c1 = colloid1[mtype].y; c1 = colloid1[mtype].y;
c2 = colloid1[mtype].z; c2 = colloid1[mtype].z;
K[0] = c1*c2; K[0] = c1*c2;
K[1] = c1+c2; K[1] = c1+c2;
K[2] = c1-c2; K[2] = c1-c2;
@ -132,16 +132,16 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4]; g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5]; g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6]; g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0; fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]); evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]); numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r* numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + (((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]); ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r; force = factor_lj * (dUR+dUA)/r;
} }
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
@ -151,14 +151,14 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
if (form[mtype]==0) { if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
} else if (form[mtype]==1) { } else if (form[mtype]==1) {
e=(numtyp)2.0/(numtyp)9.0*fR * e=(numtyp)2.0/(numtyp)9.0*fR *
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) + ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +
(numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]); (numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) { } else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 * e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7])); ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
} }
energy+=factor_lj*(e-lj3[mtype].z); energy+=factor_lj*(e-lj3[mtype].z);
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
@ -176,22 +176,22 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
} // if ii } // if ii
} }
__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in, const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in, const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in, const __global numtyp *restrict sp_lj_in,
const __global numtyp4 *restrict colloid1_in, const __global numtyp4 *restrict colloid1_in,
const __global numtyp4 *restrict colloid2_in, const __global numtyp4 *restrict colloid2_in,
const __global int *form_in, const __global int *form_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) { const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@ -208,7 +208,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
if (eflag>0) if (eflag>0)
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -217,7 +217,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
@ -231,7 +231,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj; numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)]; factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -244,20 +244,20 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) { if (rsq<lj1[mtype].z) {
numtyp r,r2inv,r6inv; numtyp r,r2inv,r6inv;
numtyp c1,c2,fR,evdwl; numtyp c1,c2,fR,evdwl;
numtyp K[9],h[4],g[4]; numtyp K[9],h[4],g[4];
numtyp force = (numtyp)0; numtyp force = (numtyp)0;
if (form[mtype]==0) { // SMALL_SMALL if (form[mtype]==0) { // SMALL_SMALL
r2inv=ucl_recip(rsq); r2inv=ucl_recip(rsq);
r6inv = r2inv*r2inv*r2inv; r6inv = r2inv*r2inv*r2inv;
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj; force*=factor_lj;
} else if (form[mtype]==1) { // SMALL_LARGE } else if (form[mtype]==1) { // SMALL_LARGE
c2 = colloid1[mtype].z; c2 = colloid1[mtype].z;
K[1] = c2*c2; K[1] = c2*c2;
K[2] = rsq; K[2] = rsq;
K[0] = K[1] - rsq; K[0] = K[1] - rsq;
@ -266,15 +266,15 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
K[3] *= K[3]*K[3]; K[3] *= K[3]*K[3];
K[6] = K[3]*K[3]; K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3]; fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = (numtyp)4.0/(numtyp)15.0*fR * force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) * ((numtyp)2.0*(K[1]+K[2]) *
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0]; colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj; force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE } else if (form[mtype]==2) { // LARGE_LARGE
r = ucl_sqrt(rsq); r = ucl_sqrt(rsq);
c1 = colloid1[mtype].y; c1 = colloid1[mtype].y;
c2 = colloid1[mtype].z; c2 = colloid1[mtype].z;
K[0] = c1*c2; K[0] = c1*c2;
K[1] = c1+c2; K[1] = c1+c2;
K[2] = c1-c2; K[2] = c1-c2;
@ -296,16 +296,16 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4]; g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5]; g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6]; g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0; fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]); evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]); numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r* numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + (((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]); ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r; force = factor_lj * (dUR+dUA)/r;
} else force = (numtyp)0.0; } else force = (numtyp)0.0;
f.x+=delx*force; f.x+=delx*force;
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
@ -315,15 +315,15 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
if (form[mtype]==0) { if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
} else if (form[mtype]==1) { } else if (form[mtype]==1) {
e=(numtyp)2.0/(numtyp)9.0*fR * e=(numtyp)2.0/(numtyp)9.0*fR *
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+ ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+
(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])* (numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
colloid2[mtype].w/K[6]); colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) { } else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 * e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7])); ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
} }
energy+=factor_lj*(e-lj3[mtype].z); energy+=factor_lj*(e-lj3[mtype].z);
} }
if (vflag>0) { if (vflag>0) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class Colloid : public BaseAtomic<numtyp, acctyp> { class Colloid : public BaseAtomic<numtyp, acctyp> {
public: public:
Colloid(); Colloid();
~Colloid(); ~Colloid();
/// Clear any previous data and set up for a new LAMMPS run /// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successful * - 0 if successful
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -40,11 +40,11 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
int init(const int ntypes, double **host_cutsq, int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj, double **host_lj4, double **host_offset, double *host_special_lj,
double **host_a12, double **host_a1, double **host_a2, double **host_a12, double **host_a1, double **host_a2,
double **host_d1, double **host_d2, double **host_sigma3, double **host_d1, double **host_d2, double **host_sigma3,
double **host_sigma6, int **host_form, double **host_sigma6, int **host_form,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen); const double gpu_split, FILE *screen);
/// Clear all host and device data /// Clear all host and device data
@ -65,7 +65,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
UCL_D_Vec<numtyp4> lj3; UCL_D_Vec<numtyp4> lj3;
/// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2 /// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2
UCL_D_Vec<numtyp4> colloid1; UCL_D_Vec<numtyp4> colloid1;
/// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3,
/// colloid2.w = sigma6 /// colloid2.w = sigma6
UCL_D_Vec<numtyp4> colloid2; UCL_D_Vec<numtyp4> colloid2;
/// form /// form
@ -76,7 +76,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
private: private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : nguyentd@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
@ -29,9 +29,9 @@ static Colloid<PRECISION,ACC_PRECISION> COLLMF;
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4, double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, double **offset, double *special_lj,
double **host_a12, double **host_a1, double **host_a2, double **host_a12, double **host_a1, double **host_a2,
double **host_d1, double **host_d2, double **host_sigma3, double **host_d1, double **host_d2, double **host_sigma3,
double **host_sigma6, int **host_form, const int inum, double **host_sigma6, int **host_form, const int inum,
const int nall, const int max_nbors, const int maxspecial, const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) { const double cell_size, int &gpu_mode, FILE *screen) {
@ -57,9 +57,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, host_a12, host_a1, host_lj4, offset, special_lj, host_a12, host_a1,
host_a2, host_d1, host_d2, host_sigma3, host_a2, host_d1, host_d2, host_sigma3,
host_sigma6, host_form, inum, nall, 300, host_sigma6, host_form, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
@ -78,13 +78,13 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, host_a12, host_a1, host_a2, offset, special_lj, host_a12, host_a1, host_a2,
host_d1, host_d2, host_sigma3, host_sigma6, host_form, host_d1, host_d2, host_sigma3, host_sigma6, host_form,
inum, nall, 300, maxspecial, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen); cell_size, gpu_split, screen);
COLLMF.device->gpu_barrier(); COLLMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -109,8 +109,8 @@ int ** colloid_gpu_compute_n(const int ago, const int inum_full,
return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success); vatom, host_start, ilist, jnum, cpu_time, success);
} }
void colloid_gpu_compute(const int ago, const int inum_full, const int nall, void colloid_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : ndtrung@umich.edu email : ndtrung@umich.edu
***************************************************************************/ ***************************************************************************/
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
CoulT::~Coul() { CoulT::~Coul() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CoulT::bytes_per_atom(const int max_nbors) const { int CoulT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
@ -75,7 +75,7 @@ int CoulT::init(const int ntypes, double **host_scale, double **host_cutsq,
scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
@ -97,10 +97,10 @@ void CoulT::reinit(const int ntypes, double **host_scale) {
// Allocate a host write buffer for data initialization // Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY); UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++) for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0; host_write[i]=0.0;
this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale);
} }
@ -138,7 +138,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -149,14 +149,14 @@ void CoulT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom); &cutsq, &_qqrd2e, &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom); &cutsq, &_qqrd2e, &this->_threads_per_atom);
} }

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : ndtrung@umich.edu // email : ndtrung@umich.edu
// ***************************************************************************/ // ***************************************************************************/
@ -33,14 +33,14 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale, const __global numtyp *restrict scale,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global numtyp *restrict cutsq, const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const int t_per_atom) { const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
@ -50,7 +50,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
sp_cl[1]=sp_cl_in[1]; sp_cl[1]=sp_cl_in[1];
sp_cl[2]=sp_cl_in[2]; sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3]; sp_cl[3]=sp_cl_in[3];
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0; acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
@ -58,13 +58,13 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int i, numj, nbor, nbor_end; int i, numj, nbor, nbor_end;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
@ -120,14 +120,14 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
__kernel void k_coul_fast(const __global numtyp4 *restrict x_, __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale, const __global numtyp *restrict scale,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_, const __global numtyp *restrict q_,
const __global numtyp *restrict _cutsq, const __global numtyp *restrict _cutsq,
const numtyp qqrd2e, const int t_per_atom) { const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
@ -139,7 +139,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[tid]=_cutsq[tid]; cutsq[tid]=_cutsq[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0; acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
@ -147,15 +147,15 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int i, numj, nbor, nbor_end; int i, numj, nbor, nbor_end;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w; int iw=ix.w;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : ndtrung@umich.edu email : ndtrung@umich.edu
***************************************************************************/ ***************************************************************************/
@ -30,7 +30,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successful * - 0 if successful
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -39,13 +39,13 @@ class Coul : public BaseCharge<numtyp, acctyp> {
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_scale, int init(const int ntypes, double **host_scale,
double **host_cutsq, double *host_special_coul, double **host_cutsq, double *host_special_coul,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double qqrd2e); const double gpu_split, FILE *screen, const double qqrd2e);
/// Send updated coeffs from host to device (to be compatible with fix adapt) /// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_scale); void reinit(const int ntypes, double **host_scale);
/// Clear all host and device data /// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/ /** \note This is called at the beginning of the init() routine **/
void clear(); void clear();
@ -68,7 +68,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
numtyp _qqrd2e; numtyp _qqrd2e;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : ndtrung@umich.edu email : ndtrung@umich.edu
***************************************************************************/ ***************************************************************************/
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
CoulDebyeT::~CoulDebye() { CoulDebyeT::~CoulDebye() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CoulDebyeT::bytes_per_atom(const int max_nbors) const { int CoulDebyeT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
@ -87,7 +87,7 @@ int CoulDebyeT::init(const int ntypes, double **host_scale,
_qqrd2e=qqrd2e; _qqrd2e=qqrd2e;
_kappa=kappa; _kappa=kappa;
_allocated=true; _allocated=true;
this->_max_bytes=cutsq.row_bytes()+scale.row_bytes()+sp_cl.row_bytes(); this->_max_bytes=cutsq.row_bytes()+scale.row_bytes()+sp_cl.row_bytes();
return 0; return 0;
@ -98,10 +98,10 @@ void CoulDebyeT::reinit(const int ntypes, double **host_scale) {
// Allocate a host write buffer for data initialization // Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY); UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++) for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0; host_write[i]=0.0;
this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale);
} }
@ -139,7 +139,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -156,9 +156,9 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) {
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq, &ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_qqrd2e, &_kappa, &this->_threads_per_atom); &_qqrd2e, &_kappa, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________ // __________________________________________________________________________
// //
// begin : // begin :
// email : ndtrung@umich.edu // email : ndtrung@umich.edu
// ***************************************************************************/ // ***************************************************************************/
@ -31,16 +31,16 @@ texture<int2> q_tex;
__kernel void k_coul_debye(const __global numtyp4 *restrict x_, __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale, const __global numtyp *restrict scale,
const int lj_types, const int lj_types,
const __global numtyp *restrict sp_cl_in, const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int nbor_pitch,
const __global numtyp *restrict q_ , const __global numtyp *restrict q_ ,
const __global numtyp *restrict cutsq, const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const numtyp kappa, const numtyp qqrd2e, const numtyp kappa,
const int t_per_atom) { const int t_per_atom) {
int tid, ii, offset; int tid, ii, offset;
@ -59,27 +59,27 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
if (ii<inum) { if (ii<inum) {
int i, numj, nbor, nbor_end; int i, numj, nbor, nbor_end;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
numtyp factor_coul; numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)]; factor_coul = sp_cl[sbmask(j)];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w; int jtype=jx.w;
// Compute r12 // Compute r12
numtyp delx = ix.x-jx.x; numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y; numtyp dely = ix.y-jx.y;
@ -146,7 +146,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
scale[tid]=scale_in[tid]; scale[tid]=scale_in[tid];
cutsq[tid]=_cutsq[tid]; cutsq[tid]=_cutsq[tid];
} }
acctyp energy=(acctyp)0; acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0; acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
@ -154,15 +154,15 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
acctyp virial[6]; acctyp virial[6];
for (int i=0; i<6; i++) for (int i=0; i<6; i++)
virial[i]=(acctyp)0; virial[i]=(acctyp)0;
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int i, numj, nbor, nbor_end; int i, numj, nbor, nbor_end;
__local int n_stride; __local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w; int iw=ix.w;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : ndtrung@umich.edu email : ndtrung@umich.edu
***************************************************************************/ ***************************************************************************/
@ -30,7 +30,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix /** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin * \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device * \param gpu_split fraction of particles handled by device
* *
* Returns: * Returns:
* - 0 if successful * - 0 if successful
* - -1 if fix gpu not found * - -1 if fix gpu not found
@ -39,14 +39,14 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_scale, int init(const int ntypes, double **host_scale,
double **host_cutsq, double *host_special_coul, double **host_cutsq, double *host_special_coul,
const int nlocal, const int nall, const int max_nbors, const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double gpu_split, FILE *screen,
const double qqrd2e, const double kappa); const double qqrd2e, const double kappa);
/// Send updated coeffs from host to device (to be compatible with fix adapt) /// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_scale); void reinit(const int ntypes, double **host_scale);
/// Clear all host and device data /// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/ /** \note This is called at the beginning of the init() routine **/
void clear(); void clear();
@ -69,7 +69,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types /// Number of atom types
int _lj_types; int _lj_types;
numtyp _qqrd2e,_kappa; numtyp _qqrd2e,_kappa;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________ __________________________________________________________________________
begin : begin :
email : ndtrung@umich.edu email : ndtrung@umich.edu
***************************************************************************/ ***************************************************************************/
@ -75,7 +75,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq,
maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa);
CDEMF.device->gpu_barrier(); CDEMF.device->gpu_barrier();
if (message) if (message)
fprintf(screen,"Done.\n"); fprintf(screen,"Done.\n");
} }
if (message) if (message)
@ -93,16 +93,16 @@ void cdebye_gpu_reinit(const int ntypes, double **host_scale) {
int world_me=CDEMF.device->world_me(); int world_me=CDEMF.device->world_me();
int gpu_rank=CDEMF.device->gpu_rank(); int gpu_rank=CDEMF.device->gpu_rank();
int procs_per_gpu=CDEMF.device->procs_per_gpu(); int procs_per_gpu=CDEMF.device->procs_per_gpu();
if (world_me==0) if (world_me==0)
CDEMF.reinit(ntypes, host_scale); CDEMF.reinit(ntypes, host_scale);
CDEMF.device->world_barrier(); CDEMF.device->world_barrier();
for (int i=0; i<procs_per_gpu; i++) { for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
CDEMF.reinit(ntypes, host_scale); CDEMF.reinit(ntypes, host_scale);
CDEMF.device->gpu_barrier(); CDEMF.device->gpu_barrier();
} }
} }
@ -123,8 +123,8 @@ int** cdebye_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom, subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success, vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd); host_q, boxlo, prd);
} }
void cdebye_gpu_compute(const int ago, const int inum_full, const int nall, void cdebye_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj, double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag, int **firstneigh, const bool eflag, const bool vflag,

View File

@ -37,18 +37,18 @@ template <class numtyp, class acctyp>
CoulDSFT::~CoulDSF() { CoulDSFT::~CoulDSF() {
clear(); clear();
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CoulDSFT::bytes_per_atom(const int max_nbors) const { int CoulDSFT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors); return this->bytes_per_atom_atomic(max_nbors);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *_screen, const double cell_size, const double gpu_split, FILE *_screen,
const double host_cut_coulsq, double *host_special_coul, const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double e_shift, const double f_shift, const double qqrd2e, const double e_shift, const double f_shift,
const double alpha) { const double alpha) {
int success; int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@ -123,7 +123,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
vflag=1; vflag=1;
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -134,15 +134,15 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &sp_lj, this->k_pair_fast.run(&this->atom->x, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom); &this->_threads_per_atom);
} else { } else {
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom); &this->_threads_per_atom);

Some files were not shown because too many files have changed in this diff Show More