Merge remote-tracking branch 'origin/master'

This commit is contained in:
Vsevak
2020-06-15 18:38:40 +03:00
5526 changed files with 479177 additions and 2216677 deletions

1
lib/gpu/.gitignore vendored

@ -2,6 +2,7 @@
/obj_ocl
/ocl_get_devices
/nvc_get_devices
/hip_get_devices
/*.cubin
/*_cubin.h
/*_cl.h

148
lib/gpu/Makefile.hip Normal file

@ -0,0 +1,148 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for HIP
# - export HIP_PLATFORM=hcc (or nvcc) before execution
# - change HIP_ARCH for your GPU
# ------------------------------------------------------------------------- */
# this setting should match the LAMMPS Makefile
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
LMP_INC = -DLAMMPS_SMALLBIG
# precision for GPU calculations
# -D_SINGLE_SINGLE # Single precision for all calculations
# -D_DOUBLE_DOUBLE # Double precision for all calculations
# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
HIP_PRECISION = -D_SINGLE_DOUBLE
HIP_OPTS = -O3
HIP_HOST_OPTS = -Wno-deprecated-declarations
HIP_HOST_INCLUDE =
# use device sort
# requires linking with hipcc and hipCUB, plus rocPRIM (for AMD) or CUB (for Nvidia)
HIP_HOST_OPTS += -DUSE_HIP_DEVICE_SORT
# path to cub
HIP_HOST_INCLUDE += -I./
# path to hipcub
HIP_HOST_INCLUDE += -I$(HIP_PATH)/../include
# use mpi
HIP_HOST_OPTS += -DMPI_GERYON -DUCL_NO_EXIT
# these settings should match the LAMMPS Makefile
MPI_COMP_OPTS = $(shell mpicxx --showme:compile)
MPI_LINK_OPTS = $(shell mpicxx --showme:link)
#MPI_COMP_OPTS += -I/usr/include/mpi -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1
HIP_PATH ?= $(wildcard /opt/rocm/hip)
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
ifeq (hcc,$(HIP_PLATFORM))
HIP_OPTS += -ffast-math
# possible values: gfx803,gfx900,gfx906
HIP_ARCH = gfx906
else ifeq (nvcc,$(HIP_PLATFORM))
HIP_OPTS += --use_fast_math
HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \
-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] -gencode arch=compute_53,code=[sm_53,compute_53]\
-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] -gencode arch=compute_62,code=[sm_62,compute_62]\
-gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_72,code=[sm_72,compute_72] -gencode arch=compute_75,code=[sm_75,compute_75]
else
$(error Specify HIP platform using 'export HIP_PLATFORM=(hcc,nvcc)')
endif
BIN_DIR = .
OBJ_DIR = ./obj
LIB_DIR = .
AR = ar
BSH = /bin/sh
# /* ----------------------------------------------------------------------
# don't change the section below unless necessary
# ------------------------------------------------------------------------- */
HIP_OPTS += -DUSE_HIP $(HIP_PRECISION)
HIP_GPU_OPTS += $(HIP_OPTS) -I./
ifeq (hcc,$(HIP_PLATFORM))
HIP_HOST_OPTS += -fPIC
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --genco
HIP_GPU_OPTS_S = -t="$(HIP_ARCH)" -f=\"
HIP_GPU_OPTS_E = \"
HIP_KERNEL_SUFFIX = .cpp
HIP_LIBS_TARGET = export HCC_AMDGPU_TARGET := $(HIP_ARCH)
export HCC_AMDGPU_TARGET := $(HIP_ARCH)
else ifeq (nvcc,$(HIP_PLATFORM))
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --fatbin
HIP_GPU_OPTS += $(HIP_ARCH)
HIP_GPU_SORT_ARCH = $(HIP_ARCH)
# workaround: nvcc cannot handle the -pthread flag
MPI_COMP_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_COMP_OPTS))
MPI_LINK_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_LINK_OPTS))
endif
# hipcc is required for device sort, because hipcub is a header-only library and ROCm GPU code generation is deferred to the linking stage
HIP_HOST_CC = $(HIP_PATH)/bin/hipcc
HIP_HOST_OPTS += $(HIP_OPTS) $(MPI_COMP_OPTS) $(LMP_INC)
HIP_HOST_CC_CMD = $(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE)
# sources
ALL_H = $(wildcard ./geryon/ucl*.h) $(wildcard ./geryon/hip*.h) $(wildcard ./lal_*.h)
SRCS := $(wildcard ./lal_*.cpp)
OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
CUS := $(wildcard lal_*.cu)
CUHS := $(filter-out pppm_cubin.h, $(CUS:lal_%.cu=%_cubin.h)) pppm_f_cubin.h pppm_d_cubin.h
CUHS := $(addprefix $(OBJ_DIR)/, $(CUHS))
all: $(OBJ_DIR) $(CUHS) $(LIB_DIR)/libgpu.a $(BIN_DIR)/hip_get_devices
$(OBJ_DIR):
mkdir -p $@
# GPU kernels compilation
$(OBJ_DIR)/pppm_f_cubin.h: lal_pppm.cu $(ALL_H)
@cp $< $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=float -Dgrdtyp4=float4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
@xxd -i $(OBJ_DIR)/pppm_f.cubin $@
@sed -i "s/[a-zA-Z0-9_]*pppm_f_cubin/pppm_f/g" $@
@rm $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_f.cubin
$(OBJ_DIR)/pppm_d_cubin.h: lal_pppm.cu $(ALL_H)
@cp $< $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=double -Dgrdtyp4=double4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
@xxd -i $(OBJ_DIR)/pppm_d.cubin $@
@sed -i "s/[a-zA-Z0-9_]*pppm_d_cubin/pppm_d/g" $@
@rm $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_d.cubin
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
@cp $< $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
@xxd -i $(OBJ_DIR)/$*.cubin $@
@sed -i "s/[a-zA-Z0-9_]*$*_cubin/$*/g" $@
@rm $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/$*.cubin
# host sources compilation
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp $(CUHS) $(ALL_H)
$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR) $(HIP_GPU_SORT_ARCH)
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR)
# libgpu building
$(LIB_DIR)/libgpu.a: $(OBJS)
$(AR) -crs $@ $(OBJS)
printf "export HIP_PLATFORM := %s\n%s\n" "$(HIP_PLATFORM)" "$(HIP_LIBS_TARGET)" > Makefile.lammps
# test app building
$(BIN_DIR)/hip_get_devices: ./geryon/ucl_get_devices.cpp $(ALL_H)
$(HIP_HOST_CC_CMD) -o $@ $< -DUCL_HIP $(MPI_LINK_OPTS)
clean:
-rm -f $(BIN_DIR)/hip_get_devices $(LIB_DIR)/libgpu.a $(OBJS) $(OBJ_DIR)/temp_* $(CUHS)


@ -52,7 +52,7 @@ user-gpu_SYSLIB = CUDA libraries needed by this package
user-gpu_SYSPATH = path(s) to where those libraries are
Because you have the CUDA compilers on your system, you should have
the needed libraries. If the CUDA developement tools were installed
the needed libraries. If the CUDA development tools were installed
in the standard manner, the settings in the Makefile.lammps.standard
file should work.
@ -67,8 +67,8 @@ library requires installing the CUDA GPU driver and CUDA toolkit for
your operating system. Installation of the CUDA SDK is not necessary.
In addition to the LAMMPS library, the binary nvc_get_devices will also
be built. This can be used to query the names and properties of GPU
devices on your system. A Makefile for OpenCL compilation is provided,
but support for OpenCL use is not currently provided by the developers.
devices on your system. A Makefile for OpenCL and ROCm HIP compilation
is provided, but support for it is not currently provided by the developers.
Details of the implementation are provided in:
----
@ -169,6 +169,25 @@ NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
src/MAKE/Makefile.foo) should be consistent with that specified
when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).
BUILDING FOR HIP FRAMEWORK
--------------------------------
1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm).
2. GPU sorting requires installing hipcub
(https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA backend
additionally requires cub (https://nvlabs.github.io/cub). Download and
extract the cub directory to lammps/lib/gpu/ or specify an appropriate
path in lammps/lib/gpu/Makefile.hip.
3. In Makefile.hip it is possible to specify the target platform via
export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
architecture (gfx803, gfx900, gfx906 etc.)
4. If your MPI implementation does not support the `mpicxx --showme` command,
specify the corresponding MPI compiler and linker flags
in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
5. Building the GPU library (libgpu.a):
cd lammps/lib/gpu; make -f Makefile.hip -j
6. Building the LAMMPS executable (lmp_hip):
cd ../../src; make hip -j
EXAMPLE CONVENTIONAL BUILD PROCESS
--------------------------------


@ -27,7 +27,7 @@
* defined in cudpp.h. Public interface functions call functions in the
* \link cudpp_app Application-Level\endlink interface. The public
* interface functions include Plan Interface functions and Algorithm
* Interface functions. Plan Inteface functions are used for creating
* Interface functions. Plan Interface functions are used for creating
* CUDPP Plan objects which contain configuration details, intermediate
* storage space, and in the case of cudppSparseMatrix(), data. The
* Algorithm Interface is the set of functions that do the real work


@ -8,16 +8,7 @@
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_maximal_launch.h"
inline size_t min(size_t x, size_t y)
{
return (x <= y) ? x : y;
}
inline size_t max(size_t x, size_t y)
{
return (x >= y) ? x : y;
}
#include <algorithm>
// computes next highest multiple of f from x
inline size_t multiple(size_t x, size_t f)
@ -65,7 +56,7 @@ size_t maxBlocks(cudaFuncAttributes &attribs,
size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;
return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
return devprop.multiProcessorCount * std::min(ctaLimitRegs, std::min(ctaLimitSMem, std::min(ctaLimitThreads, (size_t)maxBlocksPerSM)));
}
extern "C"
@ -80,15 +71,15 @@ size_t maxBlocksFromPointer(void* kernel,
{
err = cudaGetDeviceProperties(&devprop, deviceID);
if (err != cudaSuccess)
return -1;
return (size_t)-1;
cudaFuncAttributes attr;
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
if (err != cudaSuccess)
return -1;
return (size_t)-1;
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
}
return -1;
return (size_t)-1;
}


@ -367,7 +367,7 @@ extern "C" {
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized withing Cutil then cutFree() has to be used to
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
@ -382,7 +382,7 @@ extern "C" {
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized withing Cutil then cutFree() has to be used to
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
@ -466,7 +466,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are seperated by '='; example --samples=50
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////


@ -46,7 +46,7 @@
* the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke
* with a thread block grid with height greater than 1.
*
* This function peforms one level of a recursive, multiblock scan. At the
* This function performs one level of a recursive, multiblock scan. At the
* app level, this function is called by cudppScan and cudppMultiScan and used
* in combination with vectorAddUniform4() to produce a complete scan.
*

519
lib/gpu/geryon/hip_device.h Normal file

@ -0,0 +1,519 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_DEVICE
#define HIP_DEVICE
#include <hip/hip_runtime.h>
#include <unordered_map>
#include <string>
#include <vector>
#include <iostream>
#include "hip_macros.h"
#include "ucl_types.h"
namespace ucl_hip {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef hipStream_t command_queue;
inline void ucl_sync(hipStream_t &stream) {
CU_SAFE_CALL(hipStreamSynchronize(stream));
}
struct NVDProperties {
int device_id;
std::string name;
int major;
int minor;
CUDA_INT_TYPE totalGlobalMem;
int multiProcessorCount;
int maxThreadsPerBlock;
int maxThreadsDim[3];
int maxGridSize[3];
int sharedMemPerBlock;
int totalConstantMemory;
int SIMDWidth;
int memPitch;
int regsPerBlock;
int clockRate;
int textureAlign;
int kernelExecTimeoutEnabled;
int integrated;
int canMapHostMemory;
int concurrentKernels;
int ECCEnabled;
int computeMode;
};
/// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect
* behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
inline UCL_Device();
inline ~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
inline std::string platform_name()
{ return "HIP platform"; }
/// Delete any contexts/data and set the platform number to be used
inline int set_platform(const int pid);
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
/// Set the CUDA device to the specified device number
/** A context and default command queue will be created for the device
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
* be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/
inline int set(int num);
/// Delete any context and associated data stored from a call to set()
inline void clear();
/// Get the current device number
inline int device_num() { return _device; }
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ return _cq.size(); }
/// Add a stream for device computations
inline void push_command_queue() {
_cq.push_back(hipStream_t());
CU_SAFE_CALL(hipStreamCreateWithFlags(&_cq.back(),0));
}
/// Remove a stream for device computations
/** \note You cannot delete the default stream **/
inline void pop_command_queue() {
if (_cq.size()<2) return;
CU_SAFE_CALL_NS(hipStreamDestroy(_cq.back()));
_cq.pop_back();
}
/// Set the default command queue (by default this is the null stream)
/** \param i index of the command queue (as added by push_command_queue())
If i is 0, the default command queue is set to the null stream **/
inline void set_command_queue(const int i) {
if (i==0) _cq[0]=0;
else _cq[0]=_cq[i];
}
/// Get the current CUDA device name
inline std::string name() { return name(_device); }
/// Get the CUDA device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i) { return "GPU"; }
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
inline bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); }
/// Get the number of compute units
inline unsigned cus(const int i)
{ return _properties[i].multiProcessorCount; }
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
else return _properties[i].multiProcessorCount*192; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
// Get the gigabytes of free memory in the current device
inline double free_gigabytes() { return free_gigabytes(_device); }
// Get the gigabytes of free memory
inline double free_gigabytes(const int i)
{ return static_cast<double>(free_bytes(i))/1073741824; }
// Get the bytes of free memory in the current device
inline size_t free_bytes() { return free_bytes(_device); }
// Get the bytes of free memory
inline size_t free_bytes(const int i) {
CUDA_INT_TYPE dfree, dtotal;
CU_SAFE_CALL_NS(hipMemGetInfo(&dfree, &dtotal));
return static_cast<size_t>(dfree);
}
/// Return the GPGPU compute capability for current device
inline double arch() { return arch(_device); }
/// Return the GPGPU compute capability
inline double arch(const int i)
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i)
{ return _properties[i].clockRate*1e-6;}
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].maxThreadsPerBlock; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return _properties[i].memPitch; }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported() { return sharing_supported(_device); }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported(const int i)
{ return (_properties[i].computeMode == hipComputeModeDefault); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal()
{ return fission_equal(_device); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal(const int i)
{ return false; }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts()
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
{ return false; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity(const int i)
{ return false; }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices()
{ return max_sub_devices(_device); }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices(const int i)
{ return 0; }
/// List all devices along with all properties
inline void print_all(std::ostream &out);
/// Select the platform that has accelerators (for compatibility with OpenCL)
inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
inline int load_module(const void* program, hipModule_t& module, std::string *log=NULL){
auto it = _loaded_modules.emplace(program, hipModule_t());
if(!it.second){
module = it.first->second;
return UCL_SUCCESS;
}
const unsigned int num_opts=2;
hipJitOption options[num_opts];
void *values[num_opts];
// set up size of compilation log buffer
options[0] = hipJitOptionInfoLogBufferSizeBytes;
values[0] = (void *)(int)10240;
// set up pointer to the compilation log buffer
options[1] = hipJitOptionInfoLogBuffer;
char clog[10240] = { 0 };
values[1] = clog;
hipError_t err=hipModuleLoadDataEx(&module,program,num_opts, options,(void **)values);
if (log!=NULL)
*log=std::string(clog);
if (err != hipSuccess) {
#ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling PTX Program...\n"
<< "----------------------------------------------------------\n";
std::cerr << clog << std::endl;
#endif
_loaded_modules.erase(it.first);
return UCL_COMPILE_ERROR;
}
it.first->second = module;
return UCL_SUCCESS;
}
private:
std::unordered_map<const void*, hipModule_t> _loaded_modules;
int _device, _num_devices;
std::vector<NVDProperties> _properties;
std::vector<hipStream_t> _cq;
hipDevice_t _cu_device;
};
// Grabs the properties for all devices
UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(hipInit(0));
CU_SAFE_CALL_NS(hipGetDeviceCount(&_num_devices));
for (int i=0; i<_num_devices; ++i) {
hipDevice_t dev;
CU_SAFE_CALL_NS(hipDeviceGet(&dev,i));
int major, minor;
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, dev));
if (major==9999)
continue;
NVDProperties prop;
prop.device_id = i;
prop.major=major;
prop.minor=minor;
char namecstr[1024];
CU_SAFE_CALL_NS(hipDeviceGetName(namecstr,1024,dev));
prop.name=namecstr;
CU_SAFE_CALL_NS(hipDeviceTotalMem(&prop.totalGlobalMem,dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.multiProcessorCount, hipDeviceAttributeMultiprocessorCount, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsPerBlock, hipDeviceAttributeMaxThreadsPerBlock, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[0], hipDeviceAttributeMaxBlockDimX, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[1], hipDeviceAttributeMaxBlockDimY, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[2], hipDeviceAttributeMaxBlockDimZ, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[0], hipDeviceAttributeMaxGridDimX, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[1], hipDeviceAttributeMaxGridDimY, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[2], hipDeviceAttributeMaxGridDimZ, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.sharedMemPerBlock, hipDeviceAttributeMaxSharedMemoryPerBlock, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.totalConstantMemory, hipDeviceAttributeTotalConstantMemory, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.SIMDWidth, hipDeviceAttributeWarpSize, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.regsPerBlock, hipDeviceAttributeMaxRegistersPerBlock, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.clockRate, hipDeviceAttributeClockRate, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
//#if CUDA_VERSION >= 2020
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.computeMode, hipDeviceAttributeComputeMode,dev));
//#endif
//#if CUDA_VERSION >= 3010
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.concurrentKernels, hipDeviceAttributeConcurrentKernels, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
//#endif
_properties.push_back(prop);
}
_device=-1;
_cq.push_back(hipStream_t());
_cq.back()=0;
}
UCL_Device::~UCL_Device() {
clear();
}
int UCL_Device::set_platform(const int pid) {
clear();
#ifdef UCL_DEBUG
assert(pid<num_platforms());
#endif
return UCL_SUCCESS;
}
// Set the CUDA device to the specified device number
int UCL_Device::set(int num) {
clear();
_device=_properties[num].device_id;
hipError_t err=hipDeviceGet(&_cu_device,_device);
if (err!=hipSuccess) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not access accelerator number " << num
<< " for use.\n";
UCL_GERYON_EXIT;
#endif
return UCL_ERROR;
}
//hipError_t err=hipCtxCreate(&_context,0,_cu_device); deprecated and unnecessary
err=hipSetDevice(_device);
if (err!=hipSuccess) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not set accelerator number " << num
<< " for use.\n";
UCL_GERYON_EXIT;
#endif
return UCL_ERROR;
}
return UCL_SUCCESS;
}
void UCL_Device::clear() {
if (_device>-1) {
for (int i=1; i<num_queues(); i++) pop_command_queue();
CU_SAFE_CALL_NS(hipDeviceReset());
}
_device=-1;
}
// List all devices along with all properties
void UCL_Device::print_all(std::ostream &out) {
//#if CUDA_VERSION >= 2020
int driver_version;
hipDriverGetVersion(&driver_version);
out << "Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
//#endif
if (num_devices() == 0)
out << "There is no device supporting HIP\n";
for (int i=0; i<num_devices(); ++i) {
out << "\nDevice " << i << ": \"" << name(i) << "\"\n";
out << " Type of device: "
<< device_type_name(i).c_str() << std::endl;
out << " Compute capability: "
<< arch(i) << std::endl;
out << " Double precision support: ";
if (double_precision(i))
out << "Yes\n";
else
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
//#if CUDA_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
//#endif
out << " Total amount of constant memory: "
<< _properties[i].totalConstantMemory << " bytes\n";
out << " Total amount of local/shared memory per block: "
<< _properties[i].sharedMemPerBlock << " bytes\n";
out << " Total number of registers available per block: "
<< _properties[i].regsPerBlock << std::endl;
out << " Warp size: "
<< _properties[i].SIMDWidth << std::endl;
out << " Maximum number of threads per block: "
<< _properties[i].maxThreadsPerBlock << std::endl;
out << " Maximum group size (# of threads per block) "
<< _properties[i].maxThreadsDim[0] << " x "
<< _properties[i].maxThreadsDim[1] << " x "
<< _properties[i].maxThreadsDim[2] << std::endl;
out << " Maximum item sizes (# threads for each dim) "
<< _properties[i].maxGridSize[0] << " x "
<< _properties[i].maxGridSize[1] << " x "
<< _properties[i].maxGridSize[2] << std::endl;
//out << " Maximum memory pitch: "
// << max_pitch(i) << " bytes\n";
//out << " Texture alignment: "
// << _properties[i].textureAlign << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
//#if CUDA_VERSION >= 2020
//out << " Run time limit on kernels: ";
//if (_properties[i].kernelExecTimeoutEnabled)
// out << "Yes\n";
//else
// out << "No\n";
out << " Integrated: ";
if (_properties[i].integrated)
out << "Yes\n";
else
out << "No\n";
//out << " Support host page-locked memory mapping: ";
//if (_properties[i].canMapHostMemory)
// out << "Yes\n";
//else
// out << "No\n";
out << " Compute mode: ";
if (_properties[i].computeMode == hipComputeModeDefault)
out << "Default\n"; // multiple threads can use device
//#if CUDA_VERSION >= 8000
// else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
//#else
else if (_properties[i].computeMode == hipComputeModeExclusive)
//#endif
out << "Exclusive\n"; // only thread can use device
else if (_properties[i].computeMode == hipComputeModeProhibited)
out << "Prohibited\n"; // no thread can use device
//#if CUDART_VERSION >= 4000
else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
out << "Exclusive Process\n"; // multiple threads 1 process
//#endif
else
out << "Unknown\n";
//#endif
//#if CUDA_VERSION >= 3010
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
else
out << "No\n";
//out << " Device has ECC support enabled: ";
//if (_properties[i].ECCEnabled)
// out << "Yes\n";
//else
// out << "No\n";
//#endif
}
}
}
#endif
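
The UCL_Device class above hides HIP device enumeration behind the same interface the OpenCL and CUDA geryon back ends expose. Below is a minimal usage sketch, mirroring what geryon/ucl_get_devices.cpp does for the hip_get_devices binary built by Makefile.hip; the snippet itself is illustrative and not part of this commit.

#include "hip_device.h"
#include <iostream>

int main() {
  ucl_hip::UCL_Device dev;                 // collects properties of all HIP devices
  std::cout << "Found " << dev.num_devices() << " device(s).\n";
  for (int i = 0; i < dev.num_devices(); ++i)
    std::cout << i << ": " << dev.name(i) << "  cc " << dev.arch(i)
              << "  " << dev.gigabytes(i) << " GB\n";
  if (dev.num_devices() > 0) {
    dev.set(0);                            // make device 0 current; creates the default stream
    dev.print_all(std::cout);              // full property dump
  }
  return 0;
}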

298
lib/gpu/geryon/hip_kernel.h Normal file

@ -0,0 +1,298 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_KERNEL
#define HIP_KERNEL
#include <hip/hip_runtime.h>
#include "hip_device.h"
#include <fstream>
#include <string>
#include <iostream>
namespace ucl_hip {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
UCL_Device* _device_ptr;
public:
inline UCL_Program(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
inline UCL_Program(UCL_Device &device, const void *program,
const char *flags="", std::string *log=NULL) {
_device_ptr = &device; _cq=device.cq();
init(device);
load_string(program,flags,log);
}
inline ~UCL_Program() {}
/// Initialize the program with a device
inline void init(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
/// Clear any data associated with program
/** \note Must call init() after each clear **/
inline void clear() { }
/// Load a program from a file and compile with flags
inline int load(const char *filename, const char *flags="", std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
return UCL_FILE_NOT_FOUND;
}
std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
in.close();
return load_string(program.c_str(),flags,log);
}
/// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="", std::string *log=NULL) {
return _device_ptr->load_module(program, _module, log);
}
friend class UCL_Kernel;
private:
hipModule_t _module;
hipStream_t _cq;
friend class UCL_Texture;
};
/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _num_args(0) {
_num_blocks[0]=0;
}
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0) {
_num_blocks[0]=0;
set_function(program,function);
_cq=program._cq;
}
~UCL_Kernel() {}
/// Clear any function associated with the kernel
inline void clear() { }
/// Get the kernel function from a program
/** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
inline int set_function(UCL_Program &program, const char *function) {
hipError_t err=hipModuleGetFunction(&_kernel,program._module,function);
if (err!=hipSuccess) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not find function: " << function
<< " in program.\n";
UCL_GERYON_EXIT;
#endif
return UCL_FUNCTION_NOT_FOUND;
}
_cq=program._cq;
return UCL_SUCCESS;
}
/// Set the kernel argument.
/** If not a device pointer, this must be repeated each time the argument
* changes
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
template <class dtype>
inline void set_arg(const unsigned index, const dtype * const arg) {
if (index==_num_args)
add_arg(arg);
else if (index<_num_args){
assert(0==1); // not implemented
}
else
assert(0==1); // Must add kernel parameters in sequential order
}
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
inline void add_arg(const hipDeviceptr_t* const arg) {
add_arg<void*>((void**)arg);
}
/// Add a kernel argument.
template <class dtype>
inline void add_arg(const dtype* const arg) {
const auto old_size = _hip_kernel_args.size();
const auto aligned_size = (old_size+alignof(dtype)-1) & ~(alignof(dtype)-1);
const auto arg_size = sizeof(dtype);
_hip_kernel_args.resize(aligned_size + arg_size);
*((dtype*)(&_hip_kernel_args[aligned_size])) = *arg;
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks;
_num_blocks[1]=1;
_num_blocks[2]=1;
_block_size[0]=block_size;
_block_size[1]=1;
_block_size[2]=1;
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks, const size_t block_size,
command_queue &cq)
{ _cq=cq; set_size(num_blocks,block_size); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=1;
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=block_size_z;
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
/// Run the kernel in the default command queue
inline void run() {
size_t args_size = _hip_kernel_args.size();
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)_hip_kernel_args.data(),
HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
HIP_LAUNCH_PARAM_END
};
const auto res = hipModuleLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
_num_blocks[2],_block_size[0],_block_size[1],
_block_size[2],0,_cq, NULL, config);
CU_SAFE_CALL(res);
//#endif
}
/// Clear any arguments associated with the kernel
inline void clear_args() {
_num_args=0;
_hip_kernel_args.clear();
}
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
inline void cq(command_queue &cq_in) { _cq=cq_in; }
#include "ucl_arg_kludge.h"
private:
hipFunction_t _kernel;
hipStream_t _cq;
unsigned _dimensions;
unsigned _num_blocks[3];
unsigned _num_args;
friend class UCL_Texture;
unsigned _block_size[3];
std::vector<char> _hip_kernel_args;
};
} // namespace
#endif
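
UCL_Program loads a precompiled module (the *_cubin.h byte arrays generated by Makefile.hip) and UCL_Kernel packs arguments into a byte buffer handed to hipModuleLaunchKernel via HIP_LAUNCH_PARAM_BUFFER_POINTER. A minimal sketch of that flow follows; my_kernel_cubin and "k_example" are hypothetical placeholders for a generated array and a kernel name compiled into it.

#include "hip_kernel.h"
#include <iostream>
#include <string>

using namespace ucl_hip;

// Placeholder for a *_cubin.h array produced by the xxd step in Makefile.hip.
extern const unsigned char my_kernel_cubin[];

int run_example() {
  UCL_Device dev;
  if (dev.num_devices() == 0) return UCL_ERROR;
  dev.set(0);

  UCL_Program prog(dev);
  std::string log;
  if (prog.load_string(my_kernel_cubin, "", &log) != UCL_SUCCESS) {
    std::cerr << log << std::endl;         // compile/load log from the module loader
    return UCL_COMPILE_ERROR;
  }

  UCL_Kernel k(prog, "k_example");         // look up the kernel by name
  int n = 1024;
  k.set_size(n / 256, 256);                // 4 blocks of 256 threads
  k.add_arg(&n);                           // arguments are copied by value into the buffer
  k.run();                                 // launched on the default stream
  dev.sync();
  return UCL_SUCCESS;
}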


@ -0,0 +1,83 @@
#ifndef HIP_MACROS_H
#define HIP_MACROS_H
#include <cstdio>
#include <cassert>
#include <hip/hip_runtime.h>
//#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
//#else
//#define CUDA_INT_TYPE unsigned
//#endif
#ifdef MPI_GERYON
#include "mpi.h"
#define NVD_GERYON_EXIT do { \
int is_final; \
MPI_Finalized(&is_final); \
if (!is_final) \
MPI_Abort(MPI_COMM_WORLD,-1); \
} while(0)
#else
#define NVD_GERYON_EXIT assert(0==1)
#endif
#ifndef UCL_GERYON_EXIT
#define UCL_GERYON_EXIT NVD_GERYON_EXIT
#endif
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#endif
#ifndef UCL_NO_API_CHECK
#define CU_SAFE_CALL_NS( call ) do { \
hipError_t err = call; \
if( hipSuccess != err) { \
fprintf(stderr, "HIP runtime error %d in call at file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
NVD_GERYON_EXIT; \
} } while (0)
#ifdef UCL_SYNC_DEBUG
#define CU_SAFE_CALL( call ) do { \
CU_SAFE_CALL_NS( call ); \
hipError_t err=hipCtxSynchronize(); \
if( hipSuccess != err) { \
fprintf(stderr, "HIP runtime error %d in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
NVD_GERYON_EXIT; \
} } while (0)
#else
#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )
#endif
#else // not DEBUG
// void macros for performance reasons
#define CU_SAFE_CALL_NS( call ) call
#define CU_SAFE_CALL( call) call
#endif
#ifdef UCL_DESTRUCT_CHECK
#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)
#else
#define CU_DESTRUCT_CALL( call) call
#define CU_DESTRUCT_CALL_NS( call) call
#endif
#endif
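
CU_SAFE_CALL_NS and CU_SAFE_CALL wrap any call returning hipError_t; on failure they print the error code, file and line, then abort through NVD_GERYON_EXIT (MPI_Abort when MPI_GERYON is defined, assert otherwise), and they reduce to the bare call when UCL_NO_API_CHECK is set. A small sketch of how the headers below use them:

#include "hip_macros.h"
#include <hip/hip_runtime.h>

int main() {
  void *buf = nullptr;
  // Checked allocation: prints the HIP error, file and line, then aborts if
  // the call fails (plain unchecked call when UCL_NO_API_CHECK is defined).
  CU_SAFE_CALL_NS(hipMalloc(&buf, 1024));
  CU_SAFE_CALL_NS(hipMemset(buf, 0, 1024));
  CU_DESTRUCT_CALL_NS(hipFree(buf));   // only checked when UCL_DESTRUCT_CHECK is defined
  return 0;
}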

43
lib/gpu/geryon/hip_mat.h Normal file

@ -0,0 +1,43 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef HIP_MAT_H
#define HIP_MAT_H
#include <hip/hip_runtime.h>
#include "hip_memory.h"
/// Namespace for HIP routines
namespace ucl_hip {
#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#include "ucl_s_obj_help.h"
#include "ucl_vector.h"
#include "ucl_matrix.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW
#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW
#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_hip
#endif

279
lib/gpu/geryon/hip_memory.h Normal file

@ -0,0 +1,279 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_MEMORY_H
#define HIP_MEMORY_H
#include <hip/hip_runtime.h>
#include <iostream>
#include <cassert>
#include <cstring>
#include "hip_macros.h"
#include "hip_device.h"
#include "ucl_types.h"
namespace ucl_hip {
// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
//typedef dim3 ucl_kernel_dim;
#ifdef __HIP_PLATFORM_NVCC__
typedef enum hipArray_Format {
HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
HIP_AD_FORMAT_HALF = 0x10,
HIP_AD_FORMAT_FLOAT = 0x20
}hipArray_Format;
#endif
// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
typedef hipDeviceptr_t device_ptr;
// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
hipError_t err=hipSuccess;
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
else
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
hipError_t err=hipSuccess;
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
else
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat) {
if (mat.kind()==UCL_VIEW)
return;
else if (mat.kind()!=UCL_NOT_PINNED)
CU_DESTRUCT_CALL(hipHostFree(mat.begin()));
else
free(mat.begin());
}
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
_host_free(mat);
hipError_t err=hipSuccess;
if (mat.kind()==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (mat.kind()==UCL_WRITE_ONLY)
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
else
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=d.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline void _device_free(mat_type &mat) {
if (mat.kind()!=UCL_VIEW){
CU_DESTRUCT_CALL(hipFree((void*)mat.cbegin()));
}
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
_device_free(mat);
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
_device_free(mat);
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in) {
*ptr=in;
}
template <class numtyp>
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in) {
*ptr=0;
}
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in,
const size_t offset, const size_t numsize) {
*ptr=(hipDeviceptr_t)(((char*)in)+offset*numsize);
}
template <class numtyp>
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in,
const size_t offset, const size_t numsize) {
*ptr=0;
}
// --------------------------------------------------------------------------
// - DEVICE IMAGE ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type>
inline void _device_image_free(mat_type &mat) {
assert(0==1);
}
// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
inline void _host_zero(void *ptr, const size_t n) {
memset(ptr,0,n);
}
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
CU_SAFE_CALL(hipMemsetAsync((void*)mat.cbegin(),0,n,cq));
}
// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------
template<class mat1, class mat2>
hipMemcpyKind _memcpy_kind(mat1 &dst, const mat2 &src){
assert(mat1::MEM_TYPE < 2 && mat2::MEM_TYPE < 2);
return (hipMemcpyKind)((1 - mat2::MEM_TYPE)*2 + (1 - mat1::MEM_TYPE));
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
CU_SAFE_CALL(hipMemcpy((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src)));
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n, hipStream_t &cq) {
CU_SAFE_CALL(hipMemcpyAsync((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src), cq));
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CU_SAFE_CALL(hipMemcpy2D((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src)));
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows,hipStream_t &cq) {
CU_SAFE_CALL(hipMemcpy2DAsync((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src), cq));
}
} // namespace ucl_hip
#endif
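
_memcpy_kind() derives the hipMemcpyKind for a copy purely from the MEM_TYPE constants of the destination (mat1) and source (mat2) containers. A worked example of that arithmetic, assuming, as the formula implies, that host containers report MEM_TYPE 1 and device containers MEM_TYPE 0:

#include <hip/hip_runtime.h>
#include <cstdio>

// Same expression as _memcpy_kind(), with dst/src MEM_TYPE passed explicitly.
static hipMemcpyKind kind_for(int dst_mem_type, int src_mem_type) {
  return (hipMemcpyKind)((1 - src_mem_type)*2 + (1 - dst_mem_type));
}

int main() {
  // host(1)->host(1)     = hipMemcpyHostToHost     (0)
  // host(1)->device(0)   = hipMemcpyHostToDevice   (1)
  // device(0)->host(1)   = hipMemcpyDeviceToHost   (2)
  // device(0)->device(0) = hipMemcpyDeviceToDevice (3)
  std::printf("%d %d %d %d\n", kind_for(1, 1), kind_for(0, 1),
              kind_for(1, 0), kind_for(0, 0));
  return 0;
}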


@ -0,0 +1,113 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_TEXTURE
#define HIP_TEXTURE
#include <hip/hip_runtime.h>
#include "hip_kernel.h"
#include "hip_mat.h"
namespace ucl_hip {
#ifdef __HIP_PLATFORM_NVCC__
inline hipError_t hipModuleGetTexRef(CUtexref* texRef, hipModule_t hmod, const char* name){
return hipCUResultTohipError(cuModuleGetTexRef(texRef, hmod, name));
}
inline hipError_t hipTexRefSetFormat(CUtexref tex, hipArray_Format fmt, int NumPackedComponents) {
return hipCUResultTohipError(cuTexRefSetFormat(tex, (CUarray_format)fmt, NumPackedComponents ));
}
inline hipError_t hipTexRefSetAddress(size_t* offset, CUtexref tex, hipDeviceptr_t devPtr, size_t size) {
return hipCUResultTohipError(cuTexRefSetAddress(offset, tex, devPtr, size));
}
#endif
/// Class storing a texture reference
class UCL_Texture {
public:
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(UCL_Program &prog, const char *texture_name)
{ get_texture(prog,texture_name); }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name)
{
#ifdef __HIP_PLATFORM_NVCC__
CU_SAFE_CALL(hipModuleGetTexRef(&_tex, prog._module, texture_name));
#else
size_t _global_var_size;
CU_SAFE_CALL(hipModuleGetGlobal(&_device_ptr_to_global_var, &_global_var_size, prog._module, texture_name));
#endif
}
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp>
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp>
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) {
//#if CUDA_VERSION < 4000
//CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
//#endif
}
private:
#ifdef __HIP_PLATFORM_NVCC__
CUtexref _tex;
#else
void* _device_ptr_to_global_var;
#endif
friend class UCL_Kernel;
template<class mat_typ>
inline void _bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
#ifdef __HIP_PLATFORM_NVCC__
if (vec.element_size()==sizeof(float))
CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_FLOAT, numel));
else {
if (numel>2)
CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_SIGNED_INT32, numel));
else
CU_SAFE_CALL(hipTexRefSetFormat(_tex,HIP_AD_FORMAT_SIGNED_INT32,numel*2));
}
CU_SAFE_CALL(hipTexRefSetAddress(NULL, _tex, vec.cbegin(), vec.numel()*vec.element_size()));
#else
void* data_ptr = (void*)vec.cbegin();
CU_SAFE_CALL(hipMemcpyHtoD(hipDeviceptr_t(_device_ptr_to_global_var), &data_ptr, sizeof(void*)));
#endif
}
};
} // namespace
#endif

107
lib/gpu/geryon/hip_timer.h Normal file

@ -0,0 +1,107 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_TIMER_H
#define HIP_TIMER_H
#include <hip/hip_runtime.h>
#include "hip_macros.h"
#include "hip_device.h"
namespace ucl_hip {
/// Class for timing CUDA Driver events
class UCL_Timer {
public:
inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
{ init(dev); }
inline ~UCL_Timer() { clear(); }
/// Clear any data associated with timer
/** \note init() must be called to reuse timer after a clear() **/
inline void clear() {
if (_initialized) {
CU_DESTRUCT_CALL(hipEventDestroy(start_event));
CU_DESTRUCT_CALL(hipEventDestroy(stop_event));
_initialized=false;
_total_time=0.0;
}
}
/// Initialize default command queue for timing
inline void init(UCL_Device &dev) { init(dev, dev.cq()); }
/// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) {
clear();
_cq=cq;
_initialized=true;
CU_SAFE_CALL( hipEventCreateWithFlags(&start_event,0) );
CU_SAFE_CALL( hipEventCreateWithFlags(&stop_event,0) );
}
/// Start timing on command queue
inline void start() { CU_SAFE_CALL(hipEventRecord(start_event,_cq)); }
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(hipEventRecord(stop_event,_cq)); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CU_SAFE_CALL(hipEventSynchronize(start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CU_SAFE_CALL(hipEventSynchronize(stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero() {
CU_SAFE_CALL(hipEventRecord(start_event,_cq));
CU_SAFE_CALL(hipEventRecord(stop_event,_cq));
}
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
/// Add a user specified time to the total (ms)
inline void add_time_to_total(const double t) { _total_time+=t; }
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
float timer;
CU_SAFE_CALL(hipEventSynchronize(stop_event));
CU_SAFE_CALL( hipEventElapsedTime(&timer,start_event,stop_event) );
return timer;
}
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
/// Return the total time in ms
inline double total_time() { return _total_time; }
/// Return the total time in seconds
inline double total_seconds() { return _total_time/1000.0; }
private:
hipEvent_t start_event, stop_event;
hipStream_t _cq;
double _total_time;
bool _initialized;
};
} // namespace
#endif
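The timer follows the same event-pair pattern as the other Geryon back ends; a minimal usage sketch (the surrounding device setup and kernel launches are placeholders, not taken from this diff):

  UCL_Device dev;               // default HIP device
  UCL_Timer k_timer(dev);       // events recorded on dev.cq()
  k_timer.start();
  // ... enqueue kernels on the same command queue ...
  k_timer.stop();
  double ms = k_timer.time();   // synchronizes on stop_event, returns milliseconds
  k_timer.add_time_to_total(ms);
  double total_s = k_timer.total_seconds();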

View File

@ -244,7 +244,7 @@ class UCL_Kernel {
template <class dtype>
inline void add_arg(const dtype* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=const_cast<dtype * const>(arg);
_kernel_args[_num_args]=const_cast<dtype *>(arg);
#else
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));

View File

@ -36,6 +36,11 @@ using namespace ucl_cudadr;
using namespace ucl_cudart;
#endif
#ifdef UCL_HIP
#include "hip_device.h"
using namespace ucl_hip;
#endif
int main(int argc, char** argv) {
UCL_Device cop;
std::cout << "Found " << cop.num_platforms() << " platform(s).\n";

View File

@ -179,13 +179,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
if (_eflag) {
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
if (_ef_atom)
if (_ilist==NULL)
if (_ef_atom) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
else
} else {
for (int i=0; i<_inum; i++)
eatom[_ilist[i]]+=engv[i];
}
}
vstart=_inum;
}
if (_vflag) {
@ -193,7 +195,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_vf_atom){
if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++)
@ -203,6 +205,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
}
vstart+=_inum;
iend+=_inum;
}
@ -228,7 +231,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
evdwl+=engv[i];
for (int i=_inum; i<iend; i++)
ecoul+=engv[i];
if (_ef_atom)
if (_ef_atom) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
@ -240,6 +243,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=_inum, ii=0; i<iend; i++)
eatom[_ilist[ii++]]+=engv[i];
}
}
vstart=iend;
iend+=_inum;
}
@ -247,7 +251,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_vf_atom) {
if (_ilist==NULL) {
for (int i=vstart, ii=0; i<iend; i++)
vatom[ii++][j]+=engv[i];
@ -255,6 +259,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart, ii=0; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
}
vstart+=_inum;
iend+=_inum;
}

View File

@ -27,6 +27,10 @@ using namespace ucl_opencl;
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"

View File

@ -15,6 +15,11 @@
#include "lal_atom.h"
#ifdef USE_HIP_DEVICE_SORT
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#endif
namespace LAMMPS_AL {
#define AtomT Atom<numtyp,acctyp>
@ -70,6 +75,26 @@ bool AtomT::alloc(const int nall) {
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
size_t temp_storage_bytes = 0;
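      // first call with a null workspace pointer only queries temp_storage_bytes
      // (standard CUB/hipCUB two-phase idiom); the actual sort runs later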
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
return false;
if(sort_out_size < _max_atoms){
if (sort_out_keys ) hipFree(sort_out_keys);
if (sort_out_values) hipFree(sort_out_values);
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
sort_out_size = _max_atoms;
}
if(temp_storage_bytes > sort_temp_storage_size){
if(sort_temp_storage) hipFree(sort_temp_storage);
hipMalloc(&sort_temp_storage, temp_storage_bytes);
sort_temp_storage_size = temp_storage_bytes;
}
}
#endif
// --------------------------- Device allocations
int gpu_bytes=0;
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
@ -184,6 +209,27 @@ bool AtomT::add_fields(const bool charge, const bool rot,
return false;
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
size_t temp_storage_bytes = 0;
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
return false;
if(sort_out_size < _max_atoms){
if (sort_out_keys ) hipFree(sort_out_keys);
if (sort_out_values) hipFree(sort_out_values);
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
sort_out_size = _max_atoms;
}
if(temp_storage_bytes > sort_temp_storage_size){
if(sort_temp_storage) hipFree(sort_temp_storage);
hipMalloc(&sort_temp_storage, temp_storage_bytes);
sort_temp_storage_size = temp_storage_bytes;
}
}
#endif
success=success && (dev_particle_id.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
@ -275,6 +321,19 @@ void AtomT::clear_resize() {
if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
if(sort_out_keys) hipFree(sort_out_keys);
if(sort_out_values) hipFree(sort_out_values);
if(sort_temp_storage) hipFree(sort_temp_storage);
sort_out_keys = nullptr;
sort_out_values = nullptr;
sort_temp_storage = nullptr;
sort_temp_storage_size = 0;
sort_out_size = 0;
}
#endif
if (_gpu_nbor==2) {
host_particle_id.clear();
host_cell_id.clear();
@ -326,6 +385,22 @@ void AtomT::sort_neighbor(const int num_atoms) {
UCL_GERYON_EXIT;
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if(sort_out_size < num_atoms){
printf("AtomT::sort_neighbor: invalid temp buffer size\n");
UCL_GERYON_EXIT;
}
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(sort_temp_storage, sort_temp_storage_size, (unsigned *)dev_cell_id.begin(), sort_out_keys, (int *)dev_particle_id.begin(), sort_out_values, num_atoms)){
printf("AtomT::sort_neighbor: DeviceRadixSort error\n");
UCL_GERYON_EXIT;
}
if(hipSuccess != hipMemcpy((unsigned *)dev_cell_id.begin(), sort_out_keys , num_atoms*sizeof(unsigned), hipMemcpyDeviceToDevice) ||
hipSuccess != hipMemcpy((int *) dev_particle_id.begin(), sort_out_values, num_atoms*sizeof(int ), hipMemcpyDeviceToDevice)){
printf("AtomT::sort_neighbor: copy output error\n");
UCL_GERYON_EXIT;
}
#endif
}
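The allocation and sort paths above use hipCUB's two-phase DeviceRadixSort; a self-contained sketch of the same pattern (the buffer names and wrapper function are illustrative, not part of the library):

  #include <hip/hip_runtime.h>
  #include <hipcub/hipcub.hpp>

  // keys_in/vals_in: device buffers of length n (e.g. cell ids / particle ids)
  void sort_pairs(unsigned *keys_in, unsigned *keys_out,
                  int *vals_in, int *vals_out, int n) {
    void *temp = nullptr;
    size_t temp_bytes = 0;
    // pass 1: a null workspace pointer only reports the required size
    hipcub::DeviceRadixSort::SortPairs(temp, temp_bytes, keys_in, keys_out,
                                       vals_in, vals_out, n);
    hipMalloc(&temp, temp_bytes);
    // pass 2: perform the key/value radix sort with the allocated workspace
    hipcub::DeviceRadixSort::SortPairs(temp, temp_bytes, keys_in, keys_out,
                                       vals_in, vals_out, n);
    hipFree(temp);
  }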
#ifdef GPU_CAST

View File

@ -11,9 +11,9 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -29,6 +29,11 @@ using namespace ucl_opencl;
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
#include "geryon/hip_kernel.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
@ -477,6 +482,14 @@ class Atom {
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
#ifdef USE_HIP_DEVICE_SORT
unsigned* sort_out_keys = nullptr;
int* sort_out_values = nullptr;
void* sort_temp_storage = nullptr;
size_t sort_temp_storage_size = 0;
size_t sort_out_size = 0;
#endif
};
}

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -43,7 +45,7 @@ class BaseAtomic {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -25,6 +25,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -44,7 +46,7 @@ class BaseCharge {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -23,6 +23,8 @@
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -42,7 +44,7 @@ class BaseDipole {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -23,6 +23,8 @@
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -42,7 +44,7 @@ class BaseDPD {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -44,7 +46,7 @@ class BaseEllipsoid {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -46,7 +48,7 @@ class BaseThree {
* \param k_three name for the kernel for 3-body force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Born : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,18 @@
//
// begin : June 2018
// email : ndactrung@gmail.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else
@ -108,7 +109,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
@ -249,7 +250,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;

View File

@ -32,7 +32,7 @@ class BornCoulLongCS : public BornCoulLong<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : ndactrung@gmail.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BornCoulWolfCS : public BornCoulWolf<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : ndtrung@umich.edu
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : ndtrung@umich.edu
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : 8/15/2012
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CoulDSF : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : July 2011
// email : a.kohlmeyer@temple.edu
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : June 2018
// email : ndactrung@gmail.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CoulLongCS : public CoulLong<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -268,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
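  // without CUDPP or the hipCUB device sort, full GPU neighbor builds are not
  // available, so fall back to the hybrid neighbor mode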
if (gpu_nbor==1)
gpu_nbor=2;
#endif
@ -341,7 +341,7 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
if (gpu_nbor==1)
gpu_nbor=2;
#endif
@ -712,7 +712,7 @@ int DeviceT::compile_kernels() {
gpu_lib_data.update_host(false);
_ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
#if !(defined(USE_OPENCL) || defined(USE_HIP))
if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch()))
return -4;
#endif

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -41,7 +41,7 @@ class Device {
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successfull
* - 0 if successful
* - -2 if GPU not found
* - -4 if GPU library not compiled for GPU
* - -6 if GPU could not be initialized for use
@ -62,7 +62,7 @@ class Device {
* \param vel True if velocities need to be stored
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
@ -76,7 +76,7 @@ class Device {
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
@ -100,7 +100,7 @@ class Device {
* \param threads_per_atom value to be used by the neighbor list only
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,18 +11,18 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -32,7 +32,7 @@ class DipoleLJ : public BaseDipole<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,19 +11,19 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -32,7 +32,7 @@ class DipoleLJSF : public BaseDipole<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,18 +11,18 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -32,7 +32,7 @@ class DipoleLongLJ : public BaseDipole<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,16 +11,16 @@
//
// begin : Jan 15, 2014
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> vel_tex;
_texture( pos_tex,float4);
_texture( vel_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4,1> vel_tex;
_texture_2d( pos_tex,int4);
_texture_2d( vel_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class DPD : public BaseDPD<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,29 +11,29 @@
//
// begin :
// email : brownw@ornl.gov nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> fp_tex;
texture<float4> rhor_sp1_tex;
texture<float4> rhor_sp2_tex;
texture<float4> frho_sp1_tex;
texture<float4> frho_sp2_tex;
texture<float4> z2r_sp1_tex;
texture<float4> z2r_sp2_tex;
_texture( pos_tex,float4);
_texture( fp_tex,float);
_texture( rhor_sp1_tex,float4);
_texture( rhor_sp2_tex,float4);
_texture( frho_sp1_tex,float4);
_texture( frho_sp2_tex,float4);
_texture( z2r_sp1_tex,float4);
_texture( z2r_sp2_tex,float4);
#else
texture<int4> pos_tex;
texture<int2> fp_tex;
texture<int4> rhor_sp1_tex;
texture<int4> rhor_sp2_tex;
texture<int4> frho_sp1_tex;
texture<int4> frho_sp2_tex;
texture<int4> z2r_sp1_tex;
texture<int4> z2r_sp2_tex;
_texture( pos_tex,int4);
_texture( fp_tex,int2);
_texture( rhor_sp1_tex,int4);
_texture( rhor_sp2_tex,int4);
_texture( frho_sp1_tex,int4);
_texture( frho_sp2_tex,int4);
_texture( z2r_sp1_tex,int4);
_texture( z2r_sp2_tex,int4);
#endif
#else

View File

@ -33,7 +33,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -18,12 +18,14 @@
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex, quat_tex;
_texture( pos_tex, float4);
_texture( quat_tex,float4);
#else
texture<int4,1> pos_tex, quat_tex;
_texture_2d( pos_tex,int4);
_texture_2d( quat_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -9,16 +9,15 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_
@ -53,8 +52,8 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
int itype=fast_mul(iw,ntypes);
int newj=0;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=dev_ij[nbor];
j &= NEIGHMASK;
int sj=dev_ij[nbor];
int j = sj & NEIGHMASK;
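    // sj keeps the special-bond bits for storage in dev_nbor below; the masked j
    // is used only for the position fetch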
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
@ -69,7 +68,7 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
rsq+=t*t;
if (rsq<cf.x) {
dev_nbor[packed]=j;
dev_nbor[packed]=sj;
packed+=nbor_pitch;
newj++;
}
@ -117,8 +116,8 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
int newj=0;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=dev_ij[nbor];
j &= NEIGHMASK;
int sj=dev_ij[nbor];
int j = sj & NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
@ -133,7 +132,7 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
rsq+=t*t;
if (rsq<cutsq[mtype]) {
dev_nbor[packed]=j;
dev_nbor[packed]=sj;
packed+=nbor_pitch;
newj++;
}

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Gauss : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,9 +11,9 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -34,7 +34,7 @@ class GayBerne : public BaseEllipsoid<numtyp, acctyp> {
* \return false if there is not sufficient memory or device init prob
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,9 +11,9 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -11,14 +11,14 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class LJ : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class LJ96 : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : Mon May 16 2011
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJClass2Long : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJCoul : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJCoulDebye : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJCoulLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,21 +11,21 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float> gcons_tex;
texture<float> dgcons_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( gcons_tex,float);
_texture( dgcons_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int2> gcons_tex;
texture<int2> dgcons_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture( gcons_tex,int2);
_texture( dgcons_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJCoulMSM : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : ndactrung@gmail.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class LJCubic : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : 7/12/2012
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJDSF : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,15 +11,15 @@
//
// begin :
// email : ibains@nvidia.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJExpand : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

Some files were not shown because too many files have changed in this diff.