Merge remote-tracking branch 'origin/master'
This commit is contained in:
1
lib/gpu/.gitignore
vendored
1
lib/gpu/.gitignore
vendored
@ -2,6 +2,7 @@
|
||||
/obj_ocl
|
||||
/ocl_get_devices
|
||||
/nvc_get_devices
|
||||
/hip_get_devices
|
||||
/*.cubin
|
||||
/*_cubin.h
|
||||
/*_cl.h
|
||||
|
||||
148
lib/gpu/Makefile.hip
Normal file
148
lib/gpu/Makefile.hip
Normal file
@ -0,0 +1,148 @@
|
||||
# /* ----------------------------------------------------------------------
|
||||
# Generic Linux Makefile for HIP
|
||||
# - export HIP_PLATFORM=hcc (or nvcc) before execution
|
||||
# - change HIP_ARCH for your GPU
|
||||
# ------------------------------------------------------------------------- */
|
||||
|
||||
# this setting should match LAMMPS Makefile
|
||||
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
|
||||
|
||||
LMP_INC = -DLAMMPS_SMALLBIG
|
||||
|
||||
# precision for GPU calculations
|
||||
# -D_SINGLE_SINGLE # Single precision for all calculations
|
||||
# -D_DOUBLE_DOUBLE # Double precision for all calculations
|
||||
# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
|
||||
|
||||
HIP_PRECISION = -D_SINGLE_DOUBLE
|
||||
|
||||
HIP_OPTS = -O3
|
||||
HIP_HOST_OPTS = -Wno-deprecated-declarations
|
||||
HIP_HOST_INCLUDE =
|
||||
|
||||
# use device sort
|
||||
# requires linking with hipcc and hipCUB + (rocPRIM or CUB for AMD or Nvidia respectively)
|
||||
HIP_HOST_OPTS += -DUSE_HIP_DEVICE_SORT
|
||||
# path to cub
|
||||
HIP_HOST_INCLUDE += -I./
|
||||
# path to hipcub
|
||||
HIP_HOST_INCLUDE += -I$(HIP_PATH)/../include
|
||||
|
||||
# use mpi
|
||||
HIP_HOST_OPTS += -DMPI_GERYON -DUCL_NO_EXIT
|
||||
# this settings should match LAMMPS Makefile
|
||||
MPI_COMP_OPTS = $(shell mpicxx --showme:compile)
|
||||
MPI_LINK_OPTS = $(shell mpicxx --showme:link)
|
||||
#MPI_COMP_OPTS += -I/usr/include/mpi -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1
|
||||
|
||||
HIP_PATH ?= $(wildcard /opt/rocm/hip)
|
||||
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
|
||||
|
||||
ifeq (hcc,$(HIP_PLATFORM))
|
||||
HIP_OPTS += -ffast-math
|
||||
# possible values: gfx803,gfx900,gfx906
|
||||
HIP_ARCH = gfx906
|
||||
else ifeq (nvcc,$(HIP_PLATFORM))
|
||||
HIP_OPTS += --use_fast_math
|
||||
HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \
|
||||
-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] -gencode arch=compute_53,code=[sm_53,compute_53]\
|
||||
-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] -gencode arch=compute_62,code=[sm_62,compute_62]\
|
||||
-gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_72,code=[sm_72,compute_72] -gencode arch=compute_75,code=[sm_75,compute_75]
|
||||
else
|
||||
$(error Specify HIP platform using 'export HIP_PLATFORM=(hcc,nvcc)')
|
||||
endif
|
||||
|
||||
BIN_DIR = .
|
||||
OBJ_DIR = ./obj
|
||||
LIB_DIR = .
|
||||
AR = ar
|
||||
BSH = /bin/sh
|
||||
|
||||
|
||||
# /* ----------------------------------------------------------------------
|
||||
# don't change section below without need
|
||||
# ------------------------------------------------------------------------- */
|
||||
|
||||
HIP_OPTS += -DUSE_HIP $(HIP_PRECISION)
|
||||
HIP_GPU_OPTS += $(HIP_OPTS) -I./
|
||||
|
||||
ifeq (hcc,$(HIP_PLATFORM))
|
||||
HIP_HOST_OPTS += -fPIC
|
||||
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --genco
|
||||
HIP_GPU_OPTS_S = -t="$(HIP_ARCH)" -f=\"
|
||||
HIP_GPU_OPTS_E = \"
|
||||
HIP_KERNEL_SUFFIX = .cpp
|
||||
HIP_LIBS_TARGET = export HCC_AMDGPU_TARGET := $(HIP_ARCH)
|
||||
export HCC_AMDGPU_TARGET := $(HIP_ARCH)
|
||||
else ifeq (nvcc,$(HIP_PLATFORM))
|
||||
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --fatbin
|
||||
HIP_GPU_OPTS += $(HIP_ARCH)
|
||||
HIP_GPU_SORT_ARCH = $(HIP_ARCH)
|
||||
# fix nvcc can't handle -pthread flag
|
||||
MPI_COMP_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_COMP_OPTS))
|
||||
MPI_LINK_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_LINK_OPTS))
|
||||
endif
|
||||
|
||||
# hipcc is essential for device sort, because of hipcub is header only library and ROCm gpu code generation is deferred to the linking stage
|
||||
HIP_HOST_CC = $(HIP_PATH)/bin/hipcc
|
||||
HIP_HOST_OPTS += $(HIP_OPTS) $(MPI_COMP_OPTS) $(LMP_INC)
|
||||
HIP_HOST_CC_CMD = $(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE)
|
||||
|
||||
# sources
|
||||
|
||||
ALL_H = $(wildcard ./geryon/ucl*.h) $(wildcard ./geryon/hip*.h) $(wildcard ./lal_*.h)
|
||||
SRCS := $(wildcard ./lal_*.cpp)
|
||||
OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
|
||||
CUS := $(wildcard lal_*.cu)
|
||||
CUHS := $(filter-out pppm_cubin.h, $(CUS:lal_%.cu=%_cubin.h)) pppm_f_cubin.h pppm_d_cubin.h
|
||||
CUHS := $(addprefix $(OBJ_DIR)/, $(CUHS))
|
||||
|
||||
all: $(OBJ_DIR) $(CUHS) $(LIB_DIR)/libgpu.a $(BIN_DIR)/hip_get_devices
|
||||
|
||||
$(OBJ_DIR):
|
||||
mkdir -p $@
|
||||
|
||||
# GPU kernels compilation
|
||||
|
||||
$(OBJ_DIR)/pppm_f_cubin.h: lal_pppm.cu $(ALL_H)
|
||||
@cp $< $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
|
||||
$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=float -Dgrdtyp4=float4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
|
||||
@xxd -i $(OBJ_DIR)/pppm_f.cubin $@
|
||||
@sed -i "s/[a-zA-Z0-9_]*pppm_f_cubin/pppm_f/g" $@
|
||||
@rm $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_f.cubin
|
||||
|
||||
$(OBJ_DIR)/pppm_d_cubin.h: lal_pppm.cu $(ALL_H)
|
||||
@cp $< $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
|
||||
$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=double -Dgrdtyp4=double4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
|
||||
@xxd -i $(OBJ_DIR)/pppm_d.cubin $@
|
||||
@sed -i "s/[a-zA-Z0-9_]*pppm_d_cubin/pppm_d/g" $@
|
||||
@rm $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_d.cubin
|
||||
|
||||
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
|
||||
@cp $< $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
|
||||
$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
|
||||
@xxd -i $(OBJ_DIR)/$*.cubin $@
|
||||
@sed -i "s/[a-zA-Z0-9_]*$*_cubin/$*/g" $@
|
||||
@rm $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/$*.cubin
|
||||
|
||||
# host sources compilation
|
||||
|
||||
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp $(CUHS) $(ALL_H)
|
||||
$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR) $(HIP_GPU_SORT_ARCH)
|
||||
|
||||
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
|
||||
$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
# libgpu building
|
||||
|
||||
$(LIB_DIR)/libgpu.a: $(OBJS)
|
||||
$(AR) -crs $@ $(OBJS)
|
||||
printf "export HIP_PLATFORM := %s\n%s\n" "$(HIP_PLATFORM)" "$(HIP_LIBS_TARGET)" > Makefile.lammps
|
||||
|
||||
# test app building
|
||||
|
||||
$(BIN_DIR)/hip_get_devices: ./geryon/ucl_get_devices.cpp $(ALL_H)
|
||||
$(HIP_HOST_CC_CMD) -o $@ $< -DUCL_HIP $(MPI_LINK_OPTS)
|
||||
|
||||
clean:
|
||||
-rm -f $(BIN_DIR)/hip_get_devices $(LIB_DIR)/libgpu.a $(OBJS) $(OBJ_DIR)/temp_* $(CUHS)
|
||||
@ -52,7 +52,7 @@ user-gpu_SYSLIB = CUDA libraries needed by this package
|
||||
user-gpu_SYSPATH = path(s) to where those libraries are
|
||||
|
||||
Because you have the CUDA compilers on your system, you should have
|
||||
the needed libraries. If the CUDA developement tools were installed
|
||||
the needed libraries. If the CUDA development tools were installed
|
||||
in the standard manner, the settings in the Makefile.lammps.standard
|
||||
file should work.
|
||||
|
||||
@ -67,8 +67,8 @@ library requires installing the CUDA GPU driver and CUDA toolkit for
|
||||
your operating system. Installation of the CUDA SDK is not necessary.
|
||||
In addition to the LAMMPS library, the binary nvc_get_devices will also
|
||||
be built. This can be used to query the names and properties of GPU
|
||||
devices on your system. A Makefile for OpenCL compilation is provided,
|
||||
but support for OpenCL use is not currently provided by the developers.
|
||||
devices on your system. A Makefile for OpenCL and ROCm HIP compilation
|
||||
is provided, but support for it is not currently provided by the developers.
|
||||
Details of the implementation are provided in:
|
||||
|
||||
----
|
||||
@ -169,6 +169,25 @@ NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
|
||||
src/MAKE/Makefile.foo) should be consistent with that specified
|
||||
when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).
|
||||
|
||||
BUILDING FOR HIP FRAMEWORK
|
||||
--------------------------------
|
||||
1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm).
|
||||
2. GPU sorting requires installing hipcub
|
||||
(https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend
|
||||
additionally requires cub (https://nvlabs.github.io/cub). Download and
|
||||
extract the cub directory to lammps/lib/gpu/ or specify an appropriate
|
||||
path in lammps/lib/gpu/Makefile.hip.
|
||||
3. In Makefile.hip it is possible to specify the target platform via
|
||||
export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
|
||||
architecture (gfx803, gfx900, gfx906 etc.)
|
||||
4. If your MPI implementation does not support `mpicxx --showme` command,
|
||||
it is required to specify the corresponding MPI compiler and linker flags
|
||||
in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
|
||||
5. Building the GPU library (libgpu.a):
|
||||
cd lammps/lib/gpu; make -f Makefile.hip -j
|
||||
6. Building the LAMMPS executable (lmp_hip):
|
||||
cd ../../src; make hip -j
|
||||
|
||||
EXAMPLE CONVENTIONAL BUILD PROCESS
|
||||
--------------------------------
|
||||
|
||||
|
||||
@ -27,7 +27,7 @@
|
||||
* defined in cudpp.h. Public interface functions call functions in the
|
||||
* \link cudpp_app Application-Level\endlink interface. The public
|
||||
* interface functions include Plan Interface functions and Algorithm
|
||||
* Interface functions. Plan Inteface functions are used for creating
|
||||
* Interface functions. Plan Interface functions are used for creating
|
||||
* CUDPP Plan objects which contain configuration details, intermediate
|
||||
* storage space, and in the case of cudppSparseMatrix(), data. The
|
||||
* Algorithm Interface is the set of functions that do the real work
|
||||
|
||||
@ -8,16 +8,7 @@
|
||||
// in the root directory of this source distribution.
|
||||
// -------------------------------------------------------------
|
||||
#include "cudpp_maximal_launch.h"
|
||||
|
||||
inline size_t min(size_t x, size_t y)
|
||||
{
|
||||
return (x <= y) ? x : y;
|
||||
}
|
||||
|
||||
inline size_t max(size_t x, size_t y)
|
||||
{
|
||||
return (x >= y) ? x : y;
|
||||
}
|
||||
#include <algorithm>
|
||||
|
||||
// computes next highest multiple of f from x
|
||||
inline size_t multiple(size_t x, size_t f)
|
||||
@ -65,7 +56,7 @@ size_t maxBlocks(cudaFuncAttributes &attribs,
|
||||
size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
|
||||
size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;
|
||||
|
||||
return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
|
||||
return devprop.multiProcessorCount * std::min(ctaLimitRegs, std::min(ctaLimitSMem, std::min(ctaLimitThreads, (size_t)maxBlocksPerSM)));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
@ -80,15 +71,15 @@ size_t maxBlocksFromPointer(void* kernel,
|
||||
{
|
||||
err = cudaGetDeviceProperties(&devprop, deviceID);
|
||||
if (err != cudaSuccess)
|
||||
return -1;
|
||||
return (size_t)-1;
|
||||
|
||||
cudaFuncAttributes attr;
|
||||
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
|
||||
if (err != cudaSuccess)
|
||||
return -1;
|
||||
return (size_t)-1;
|
||||
|
||||
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
|
||||
}
|
||||
|
||||
return -1;
|
||||
return (size_t)-1;
|
||||
}
|
||||
|
||||
@ -367,7 +367,7 @@ extern "C" {
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized withing Cutil then cutFree() has to be used to
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
@ -382,7 +382,7 @@ extern "C" {
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized withing Cutil then cutFree() has to be used to
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
@ -466,7 +466,7 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are seperated by '='; example --samples=50
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -46,7 +46,7 @@
|
||||
* the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke
|
||||
* with a thread block grid with height greater than 1.
|
||||
*
|
||||
* This function peforms one level of a recursive, multiblock scan. At the
|
||||
* This function performs one level of a recursive, multiblock scan. At the
|
||||
* app level, this function is called by cudppScan and cudppMultiScan and used
|
||||
* in combination with vectorAddUniform4() to produce a complete scan.
|
||||
*
|
||||
|
||||
519
lib/gpu/geryon/hip_device.h
Normal file
519
lib/gpu/geryon/hip_device.h
Normal file
@ -0,0 +1,519 @@
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef HIP_DEVICE
|
||||
#define HIP_DEVICE
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include "hip_macros.h"
|
||||
#include "ucl_types.h"
|
||||
|
||||
namespace ucl_hip {
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - COMMAND QUEUE STUFF
|
||||
// --------------------------------------------------------------------------
|
||||
typedef hipStream_t command_queue;
|
||||
|
||||
inline void ucl_sync(hipStream_t &stream) {
|
||||
CU_SAFE_CALL(hipStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
struct NVDProperties {
|
||||
int device_id;
|
||||
std::string name;
|
||||
int major;
|
||||
int minor;
|
||||
CUDA_INT_TYPE totalGlobalMem;
|
||||
int multiProcessorCount;
|
||||
|
||||
int maxThreadsPerBlock;
|
||||
int maxThreadsDim[3];
|
||||
int maxGridSize[3];
|
||||
int sharedMemPerBlock;
|
||||
int totalConstantMemory;
|
||||
int SIMDWidth;
|
||||
int memPitch;
|
||||
int regsPerBlock;
|
||||
int clockRate;
|
||||
int textureAlign;
|
||||
|
||||
int kernelExecTimeoutEnabled;
|
||||
int integrated;
|
||||
int canMapHostMemory;
|
||||
int concurrentKernels;
|
||||
int ECCEnabled;
|
||||
int computeMode;
|
||||
};
|
||||
|
||||
/// Class for looking at device properties
|
||||
/** \note Calls to change the device outside of the class results in incorrect
|
||||
* behavior
|
||||
* \note There is no error checking for indexing past the number of devices **/
|
||||
class UCL_Device {
|
||||
public:
|
||||
/// Collect properties for every GPU on the node
|
||||
/** \note You must set the active GPU with set() before using the device **/
|
||||
inline UCL_Device();
|
||||
|
||||
inline ~UCL_Device();
|
||||
|
||||
/// Returns 1 (For compatibility with OpenCL)
|
||||
inline int num_platforms() { return 1; }
|
||||
|
||||
/// Return a string with name and info of the current platform
|
||||
inline std::string platform_name()
|
||||
{ return "HIP platform"; }
|
||||
|
||||
/// Delete any contexts/data and set the platform number to be used
|
||||
inline int set_platform(const int pid);
|
||||
|
||||
/// Return the number of devices that support CUDA
|
||||
inline int num_devices() { return _properties.size(); }
|
||||
|
||||
/// Set the CUDA device to the specified device number
|
||||
/** A context and default command queue will be created for the device
|
||||
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
|
||||
* be allocated for use. clear() is called to delete any contexts and
|
||||
* associated data from previous calls to set(). **/
|
||||
inline int set(int num);
|
||||
|
||||
/// Delete any context and associated data stored from a call to set()
|
||||
inline void clear();
|
||||
|
||||
/// Get the current device number
|
||||
inline int device_num() { return _device; }
|
||||
|
||||
/// Returns the default stream for the current device
|
||||
inline command_queue & cq() { return cq(0); }
|
||||
|
||||
/// Returns the stream indexed by i
|
||||
inline command_queue & cq(const int i) { return _cq[i]; }
|
||||
|
||||
/// Block until all commands in the default stream have completed
|
||||
inline void sync() { sync(0); }
|
||||
|
||||
/// Block until all commands in the specified stream have completed
|
||||
inline void sync(const int i) { ucl_sync(cq(i)); }
|
||||
|
||||
/// Get the number of command queues currently available on device
|
||||
inline int num_queues()
|
||||
{ return _cq.size(); }
|
||||
|
||||
/// Add a stream for device computations
|
||||
inline void push_command_queue() {
|
||||
_cq.push_back(hipStream_t());
|
||||
CU_SAFE_CALL(hipStreamCreateWithFlags(&_cq.back(),0));
|
||||
}
|
||||
|
||||
/// Remove a stream for device computations
|
||||
/** \note You cannot delete the default stream **/
|
||||
inline void pop_command_queue() {
|
||||
if (_cq.size()<2) return;
|
||||
CU_SAFE_CALL_NS(hipStreamDestroy(_cq.back()));
|
||||
_cq.pop_back();
|
||||
}
|
||||
|
||||
/// Set the default command queue (by default this is the null stream)
|
||||
/** \param i index of the command queue (as added by push_command_queue())
|
||||
If i is 0, the default command queue is set to the null stream **/
|
||||
inline void set_command_queue(const int i) {
|
||||
if (i==0) _cq[0]=0;
|
||||
else _cq[0]=_cq[i];
|
||||
}
|
||||
|
||||
/// Get the current CUDA device name
|
||||
inline std::string name() { return name(_device); }
|
||||
/// Get the CUDA device name
|
||||
inline std::string name(const int i)
|
||||
{ return std::string(_properties[i].name); }
|
||||
|
||||
/// Get a string telling the type of the current device
|
||||
inline std::string device_type_name() { return device_type_name(_device); }
|
||||
/// Get a string telling the type of the device
|
||||
inline std::string device_type_name(const int i) { return "GPU"; }
|
||||
|
||||
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
|
||||
inline int device_type() { return device_type(_device); }
|
||||
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
|
||||
inline int device_type(const int i) { return UCL_GPU; }
|
||||
|
||||
/// Returns true if host memory is efficiently addressable from device
|
||||
inline bool shared_memory() { return shared_memory(_device); }
|
||||
/// Returns true if host memory is efficiently addressable from device
|
||||
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
|
||||
|
||||
/// Returns true if double precision is support for the current device
|
||||
inline bool double_precision() { return double_precision(_device); }
|
||||
/// Returns true if double precision is support for the device
|
||||
inline bool double_precision(const int i) {return arch(i)>=1.3;}
|
||||
|
||||
/// Get the number of compute units on the current device
|
||||
inline unsigned cus() { return cus(_device); }
|
||||
/// Get the number of compute units
|
||||
inline unsigned cus(const int i)
|
||||
{ return _properties[i].multiProcessorCount; }
|
||||
|
||||
/// Get the number of cores in the current device
|
||||
inline unsigned cores() { return cores(_device); }
|
||||
/// Get the number of cores
|
||||
inline unsigned cores(const int i)
|
||||
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
|
||||
else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
|
||||
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
|
||||
else return _properties[i].multiProcessorCount*192; }
|
||||
|
||||
/// Get the gigabytes of global memory in the current device
|
||||
inline double gigabytes() { return gigabytes(_device); }
|
||||
/// Get the gigabytes of global memory
|
||||
inline double gigabytes(const int i)
|
||||
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
|
||||
|
||||
/// Get the bytes of global memory in the current device
|
||||
inline size_t bytes() { return bytes(_device); }
|
||||
/// Get the bytes of global memory
|
||||
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
|
||||
|
||||
// Get the gigabytes of free memory in the current device
|
||||
inline double free_gigabytes() { return free_gigabytes(_device); }
|
||||
// Get the gigabytes of free memory
|
||||
inline double free_gigabytes(const int i)
|
||||
{ return static_cast<double>(free_bytes(i))/1073741824; }
|
||||
|
||||
// Get the bytes of free memory in the current device
|
||||
inline size_t free_bytes() { return free_bytes(_device); }
|
||||
// Get the bytes of free memory
|
||||
inline size_t free_bytes(const int i) {
|
||||
CUDA_INT_TYPE dfree, dtotal;
|
||||
CU_SAFE_CALL_NS(hipMemGetInfo(&dfree, &dtotal));
|
||||
return static_cast<size_t>(dfree);
|
||||
}
|
||||
|
||||
/// Return the GPGPU compute capability for current device
|
||||
inline double arch() { return arch(_device); }
|
||||
/// Return the GPGPU compute capability
|
||||
inline double arch(const int i)
|
||||
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
|
||||
|
||||
/// Clock rate in GHz for current device
|
||||
inline double clock_rate() { return clock_rate(_device); }
|
||||
/// Clock rate in GHz
|
||||
inline double clock_rate(const int i)
|
||||
{ return _properties[i].clockRate*1e-6;}
|
||||
|
||||
/// Get the maximum number of threads per block
|
||||
inline size_t group_size() { return group_size(_device); }
|
||||
/// Get the maximum number of threads per block
|
||||
inline size_t group_size(const int i)
|
||||
{ return _properties[i].maxThreadsPerBlock; }
|
||||
|
||||
/// Return the maximum memory pitch in bytes for current device
|
||||
inline size_t max_pitch() { return max_pitch(_device); }
|
||||
/// Return the maximum memory pitch in bytes
|
||||
inline size_t max_pitch(const int i) { return _properties[i].memPitch; }
|
||||
|
||||
/// Returns false if accelerator cannot be shared by multiple processes
|
||||
/** If it cannot be determined, true is returned **/
|
||||
inline bool sharing_supported() { return sharing_supported(_device); }
|
||||
/// Returns false if accelerator cannot be shared by multiple processes
|
||||
/** If it cannot be determined, true is returned **/
|
||||
inline bool sharing_supported(const int i)
|
||||
{ return (_properties[i].computeMode == hipComputeModeDefault); }
|
||||
|
||||
/// True if splitting device into equal subdevices supported
|
||||
inline bool fission_equal()
|
||||
{ return fission_equal(_device); }
|
||||
/// True if splitting device into equal subdevices supported
|
||||
inline bool fission_equal(const int i)
|
||||
{ return false; }
|
||||
/// True if splitting device into subdevices by specified counts supported
|
||||
inline bool fission_by_counts()
|
||||
{ return fission_by_counts(_device); }
|
||||
/// True if splitting device into subdevices by specified counts supported
|
||||
inline bool fission_by_counts(const int i)
|
||||
{ return false; }
|
||||
/// True if splitting device into subdevices by affinity domains supported
|
||||
inline bool fission_by_affinity()
|
||||
{ return fission_by_affinity(_device); }
|
||||
/// True if splitting device into subdevices by affinity domains supported
|
||||
inline bool fission_by_affinity(const int i)
|
||||
{ return false; }
|
||||
|
||||
/// Maximum number of subdevices allowed from device fission
|
||||
inline int max_sub_devices()
|
||||
{ return max_sub_devices(_device); }
|
||||
/// Maximum number of subdevices allowed from device fission
|
||||
inline int max_sub_devices(const int i)
|
||||
{ return 0; }
|
||||
|
||||
/// List all devices along with all properties
|
||||
inline void print_all(std::ostream &out);
|
||||
|
||||
/// Select the platform that has accelerators (for compatibility with OpenCL)
|
||||
inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
|
||||
|
||||
inline int load_module(const void* program, hipModule_t& module, std::string *log=NULL){
|
||||
auto it = _loaded_modules.emplace(program, hipModule_t());
|
||||
if(!it.second){
|
||||
module = it.first->second;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
const unsigned int num_opts=2;
|
||||
hipJitOption options[num_opts];
|
||||
void *values[num_opts];
|
||||
|
||||
// set up size of compilation log buffer
|
||||
options[0] = hipJitOptionInfoLogBufferSizeBytes;
|
||||
values[0] = (void *)(int)10240;
|
||||
// set up pointer to the compilation log buffer
|
||||
options[1] = hipJitOptionInfoLogBuffer;
|
||||
char clog[10240] = { 0 };
|
||||
values[1] = clog;
|
||||
|
||||
hipError_t err=hipModuleLoadDataEx(&module,program,num_opts, options,(void **)values);
|
||||
|
||||
if (log!=NULL)
|
||||
*log=std::string(clog);
|
||||
|
||||
if (err != hipSuccess) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << std::endl
|
||||
<< "----------------------------------------------------------\n"
|
||||
<< " UCL Error: Error compiling PTX Program...\n"
|
||||
<< "----------------------------------------------------------\n";
|
||||
std::cerr << log << std::endl;
|
||||
#endif
|
||||
_loaded_modules.erase(it.first);
|
||||
return UCL_COMPILE_ERROR;
|
||||
}
|
||||
it.first->second = module;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
private:
|
||||
std::unordered_map<const void*, hipModule_t> _loaded_modules;
|
||||
int _device, _num_devices;
|
||||
std::vector<NVDProperties> _properties;
|
||||
std::vector<hipStream_t> _cq;
|
||||
hipDevice_t _cu_device;
|
||||
};
|
||||
|
||||
// Grabs the properties for all devices
|
||||
UCL_Device::UCL_Device() {
|
||||
CU_SAFE_CALL_NS(hipInit(0));
|
||||
CU_SAFE_CALL_NS(hipGetDeviceCount(&_num_devices));
|
||||
for (int i=0; i<_num_devices; ++i) {
|
||||
hipDevice_t dev;
|
||||
CU_SAFE_CALL_NS(hipDeviceGet(&dev,i));
|
||||
int major, minor;
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, dev));
|
||||
if (major==9999)
|
||||
continue;
|
||||
|
||||
NVDProperties prop;
|
||||
prop.device_id = i;
|
||||
prop.major=major;
|
||||
prop.minor=minor;
|
||||
|
||||
char namecstr[1024];
|
||||
CU_SAFE_CALL_NS(hipDeviceGetName(namecstr,1024,dev));
|
||||
prop.name=namecstr;
|
||||
|
||||
CU_SAFE_CALL_NS(hipDeviceTotalMem(&prop.totalGlobalMem,dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.multiProcessorCount, hipDeviceAttributeMultiprocessorCount, dev));
|
||||
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsPerBlock, hipDeviceAttributeMaxThreadsPerBlock, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[0], hipDeviceAttributeMaxBlockDimX, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[1], hipDeviceAttributeMaxBlockDimY, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[2], hipDeviceAttributeMaxBlockDimZ, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[0], hipDeviceAttributeMaxGridDimX, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[1], hipDeviceAttributeMaxGridDimY, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[2], hipDeviceAttributeMaxGridDimZ, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.sharedMemPerBlock, hipDeviceAttributeMaxSharedMemoryPerBlock, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.totalConstantMemory, hipDeviceAttributeTotalConstantMemory, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.SIMDWidth, hipDeviceAttributeWarpSize, dev));
|
||||
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.regsPerBlock, hipDeviceAttributeMaxRegistersPerBlock, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.clockRate, hipDeviceAttributeClockRate, dev));
|
||||
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
|
||||
|
||||
//#if CUDA_VERSION >= 2020
|
||||
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
|
||||
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.computeMode, hipDeviceAttributeComputeMode,dev));
|
||||
//#endif
|
||||
//#if CUDA_VERSION >= 3010
|
||||
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.concurrentKernels, hipDeviceAttributeConcurrentKernels, dev));
|
||||
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
|
||||
//#endif
|
||||
|
||||
_properties.push_back(prop);
|
||||
}
|
||||
_device=-1;
|
||||
_cq.push_back(hipStream_t());
|
||||
_cq.back()=0;
|
||||
}
|
||||
|
||||
UCL_Device::~UCL_Device() {
|
||||
clear();
|
||||
}
|
||||
|
||||
int UCL_Device::set_platform(const int pid) {
|
||||
clear();
|
||||
#ifdef UCL_DEBUG
|
||||
assert(pid<num_platforms());
|
||||
#endif
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
// Set the CUDA device to the specified device number
|
||||
int UCL_Device::set(int num) {
|
||||
clear();
|
||||
_device=_properties[num].device_id;
|
||||
hipError_t err=hipDeviceGet(&_cu_device,_device);
|
||||
if (err!=hipSuccess) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not access accelerator number " << num
|
||||
<< " for use.\n";
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
return UCL_ERROR;
|
||||
}
|
||||
//hipError_t err=hipCtxCreate(&_context,0,_cu_device); deprecated and unnecessary
|
||||
err=hipSetDevice(_device);
|
||||
if (err!=hipSuccess) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not set accelerator number " << num
|
||||
<< " for use.\n";
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
return UCL_ERROR;
|
||||
}
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
void UCL_Device::clear() {
|
||||
if (_device>-1) {
|
||||
for (int i=1; i<num_queues(); i++) pop_command_queue();
|
||||
CU_SAFE_CALL_NS(hipDeviceReset());
|
||||
}
|
||||
_device=-1;
|
||||
}
|
||||
|
||||
// List all devices along with all properties
|
||||
void UCL_Device::print_all(std::ostream &out) {
|
||||
//#if CUDA_VERSION >= 2020
|
||||
int driver_version;
|
||||
hipDriverGetVersion(&driver_version);
|
||||
out << "Driver Version: "
|
||||
<< driver_version/1000 << "." << driver_version%100
|
||||
<< std::endl;
|
||||
//#endif
|
||||
|
||||
if (num_devices() == 0)
|
||||
out << "There is no device supporting HIP\n";
|
||||
for (int i=0; i<num_devices(); ++i) {
|
||||
out << "\nDevice " << i << ": \"" << name(i) << "\"\n";
|
||||
out << " Type of device: "
|
||||
<< device_type_name(i).c_str() << std::endl;
|
||||
out << " Compute capability: "
|
||||
<< arch(i) << std::endl;
|
||||
out << " Double precision support: ";
|
||||
if (double_precision(i))
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
out << " Total amount of global memory: "
|
||||
<< gigabytes(i) << " GB\n";
|
||||
//#if CUDA_VERSION >= 2000
|
||||
out << " Number of compute units/multiprocessors: "
|
||||
<< _properties[i].multiProcessorCount << std::endl;
|
||||
out << " Number of cores: "
|
||||
<< cores(i) << std::endl;
|
||||
//#endif
|
||||
out << " Total amount of constant memory: "
|
||||
<< _properties[i].totalConstantMemory << " bytes\n";
|
||||
out << " Total amount of local/shared memory per block: "
|
||||
<< _properties[i].sharedMemPerBlock << " bytes\n";
|
||||
out << " Total number of registers available per block: "
|
||||
<< _properties[i].regsPerBlock << std::endl;
|
||||
out << " Warp size: "
|
||||
<< _properties[i].SIMDWidth << std::endl;
|
||||
out << " Maximum number of threads per block: "
|
||||
<< _properties[i].maxThreadsPerBlock << std::endl;
|
||||
out << " Maximum group size (# of threads per block) "
|
||||
<< _properties[i].maxThreadsDim[0] << " x "
|
||||
<< _properties[i].maxThreadsDim[1] << " x "
|
||||
<< _properties[i].maxThreadsDim[2] << std::endl;
|
||||
out << " Maximum item sizes (# threads for each dim) "
|
||||
<< _properties[i].maxGridSize[0] << " x "
|
||||
<< _properties[i].maxGridSize[1] << " x "
|
||||
<< _properties[i].maxGridSize[2] << std::endl;
|
||||
//out << " Maximum memory pitch: "
|
||||
// << max_pitch(i) << " bytes\n";
|
||||
//out << " Texture alignment: "
|
||||
// << _properties[i].textureAlign << " bytes\n";
|
||||
out << " Clock rate: "
|
||||
<< clock_rate(i) << " GHz\n";
|
||||
//#if CUDA_VERSION >= 2020
|
||||
//out << " Run time limit on kernels: ";
|
||||
//if (_properties[i].kernelExecTimeoutEnabled)
|
||||
// out << "Yes\n";
|
||||
//else
|
||||
// out << "No\n";
|
||||
out << " Integrated: ";
|
||||
if (_properties[i].integrated)
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
//out << " Support host page-locked memory mapping: ";
|
||||
//if (_properties[i].canMapHostMemory)
|
||||
// out << "Yes\n";
|
||||
//else
|
||||
// out << "No\n";
|
||||
out << " Compute mode: ";
|
||||
if (_properties[i].computeMode == hipComputeModeDefault)
|
||||
out << "Default\n"; // multiple threads can use device
|
||||
//#if CUDA_VERSION >= 8000
|
||||
// else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
|
||||
//#else
|
||||
else if (_properties[i].computeMode == hipComputeModeExclusive)
|
||||
//#endif
|
||||
out << "Exclusive\n"; // only thread can use device
|
||||
else if (_properties[i].computeMode == hipComputeModeProhibited)
|
||||
out << "Prohibited\n"; // no thread can use device
|
||||
//#if CUDART_VERSION >= 4000
|
||||
else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
|
||||
out << "Exclusive Process\n"; // multiple threads 1 process
|
||||
//#endif
|
||||
else
|
||||
out << "Unknown\n";
|
||||
//#endif
|
||||
//#if CUDA_VERSION >= 3010
|
||||
out << " Concurrent kernel execution: ";
|
||||
if (_properties[i].concurrentKernels)
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
//out << " Device has ECC support enabled: ";
|
||||
//if (_properties[i].ECCEnabled)
|
||||
// out << "Yes\n";
|
||||
//else
|
||||
// out << "No\n";
|
||||
//#endif
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
298
lib/gpu/geryon/hip_kernel.h
Normal file
298
lib/gpu/geryon/hip_kernel.h
Normal file
@ -0,0 +1,298 @@
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef HIP_KERNEL
|
||||
#define HIP_KERNEL
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "hip_device.h"
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
namespace ucl_hip {
|
||||
|
||||
class UCL_Texture;
|
||||
template <class numtyp> class UCL_D_Vec;
|
||||
template <class numtyp> class UCL_D_Mat;
|
||||
template <class hosttype, class devtype> class UCL_Vector;
|
||||
template <class hosttype, class devtype> class UCL_Matrix;
|
||||
#define UCL_MAX_KERNEL_ARGS 256
|
||||
|
||||
/// Class storing 1 or more kernel functions from a single string or file
|
||||
class UCL_Program {
|
||||
UCL_Device* _device_ptr;
|
||||
public:
|
||||
inline UCL_Program(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
|
||||
inline UCL_Program(UCL_Device &device, const void *program,
|
||||
const char *flags="", std::string *log=NULL) {
|
||||
_device_ptr = &device; _cq=device.cq();
|
||||
init(device);
|
||||
load_string(program,flags,log);
|
||||
}
|
||||
|
||||
inline ~UCL_Program() {}
|
||||
|
||||
/// Initialize the program with a device
|
||||
inline void init(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
|
||||
|
||||
/// Clear any data associated with program
|
||||
/** \note Must call init() after each clear **/
|
||||
inline void clear() { }
|
||||
|
||||
/// Load a program from a file and compile with flags
|
||||
inline int load(const char *filename, const char *flags="", std::string *log=NULL) {
|
||||
std::ifstream in(filename);
|
||||
if (!in || in.is_open()==false) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not open kernel file: "
|
||||
<< filename << std::endl;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
return UCL_FILE_NOT_FOUND;
|
||||
}
|
||||
|
||||
std::string program((std::istreambuf_iterator<char>(in)),
|
||||
std::istreambuf_iterator<char>());
|
||||
in.close();
|
||||
return load_string(program.c_str(),flags,log);
|
||||
}
|
||||
|
||||
/// Load a program from a string and compile with flags
|
||||
inline int load_string(const void *program, const char *flags="", std::string *log=NULL) {
|
||||
return _device_ptr->load_module(program, _module, log);
|
||||
}
|
||||
|
||||
friend class UCL_Kernel;
|
||||
private:
|
||||
hipModule_t _module;
|
||||
hipStream_t _cq;
|
||||
friend class UCL_Texture;
|
||||
};
|
||||
|
||||
/// Class for dealing with CUDA Driver kernels
|
||||
class UCL_Kernel {
|
||||
public:
|
||||
UCL_Kernel() : _dimensions(1), _num_args(0) {
|
||||
_num_blocks[0]=0;
|
||||
}
|
||||
|
||||
UCL_Kernel(UCL_Program &program, const char *function) :
|
||||
_dimensions(1), _num_args(0) {
|
||||
_num_blocks[0]=0;
|
||||
set_function(program,function);
|
||||
_cq=program._cq;
|
||||
}
|
||||
|
||||
~UCL_Kernel() {}
|
||||
|
||||
/// Clear any function associated with the kernel
|
||||
inline void clear() { }
|
||||
|
||||
/// Get the kernel function from a program
|
||||
/** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
|
||||
inline int set_function(UCL_Program &program, const char *function) {
|
||||
hipError_t err=hipModuleGetFunction(&_kernel,program._module,function);
|
||||
if (err!=hipSuccess) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not find function: " << function
|
||||
<< " in program.\n";
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
return UCL_FUNCTION_NOT_FOUND;
|
||||
}
|
||||
_cq=program._cq;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
/// Set the kernel argument.
|
||||
/** If not a device pointer, this must be repeated each time the argument
|
||||
* changes
|
||||
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
|
||||
template <class dtype>
|
||||
inline void set_arg(const unsigned index, const dtype * const arg) {
|
||||
if (index==_num_args)
|
||||
add_arg(arg);
|
||||
else if (index<_num_args){
|
||||
assert(0==1); // not implemented
|
||||
}
|
||||
else
|
||||
assert(0==1); // Must add kernel parameters in sequential order
|
||||
}
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a kernel argument.
|
||||
inline void add_arg(const hipDeviceptr_t* const arg) {
|
||||
add_arg<void*>((void**)arg);
|
||||
}
|
||||
|
||||
/// Add a kernel argument.
|
||||
template <class dtype>
|
||||
inline void add_arg(const dtype* const arg) {
|
||||
const auto old_size = _hip_kernel_args.size();
|
||||
const auto aligned_size = (old_size+alignof(dtype)-1) & ~(alignof(dtype)-1);
|
||||
const auto arg_size = sizeof(dtype);
|
||||
_hip_kernel_args.resize(aligned_size + arg_size);
|
||||
*((dtype*)(&_hip_kernel_args[aligned_size])) = *arg;
|
||||
_num_args++;
|
||||
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
|
||||
}
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks, const size_t block_size) {
|
||||
_dimensions=1;
|
||||
_num_blocks[0]=num_blocks;
|
||||
_num_blocks[1]=1;
|
||||
_num_blocks[2]=1;
|
||||
|
||||
_block_size[0]=block_size;
|
||||
_block_size[1]=1;
|
||||
_block_size[2]=1;
|
||||
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue for the kernel is changed to cq **/
|
||||
inline void set_size(const size_t num_blocks, const size_t block_size,
|
||||
command_queue &cq)
|
||||
{ _cq=cq; set_size(num_blocks,block_size); }
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y) {
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x;
|
||||
_num_blocks[1]=num_blocks_y;
|
||||
_num_blocks[2]=1;
|
||||
|
||||
_block_size[0]=block_size_x;
|
||||
_block_size[1]=block_size_y;
|
||||
_block_size[2]=1;
|
||||
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue for the kernel is changed to cq **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y,
|
||||
command_queue &cq)
|
||||
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x,
|
||||
const size_t block_size_y, const size_t block_size_z) {
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x;
|
||||
_num_blocks[1]=num_blocks_y;
|
||||
_num_blocks[2]=1;
|
||||
|
||||
_block_size[0]=block_size_x;
|
||||
_block_size[1]=block_size_y;
|
||||
_block_size[2]=block_size_z;
|
||||
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y,
|
||||
const size_t block_size_z, command_queue &cq) {
|
||||
_cq=cq;
|
||||
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
|
||||
block_size_z);
|
||||
}
|
||||
|
||||
/// Run the kernel in the default command queue
|
||||
inline void run() {
|
||||
size_t args_size = _hip_kernel_args.size();
|
||||
void *config[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)_hip_kernel_args.data(),
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
const auto res = hipModuleLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
|
||||
_num_blocks[2],_block_size[0],_block_size[1],
|
||||
_block_size[2],0,_cq, NULL, config);
|
||||
CU_SAFE_CALL(res);
|
||||
//#endif
|
||||
}
|
||||
|
||||
/// Clear any arguments associated with the kernel
|
||||
inline void clear_args() {
|
||||
_num_args=0;
|
||||
_hip_kernel_args.clear();
|
||||
}
|
||||
|
||||
/// Return the default command queue/stream associated with this data
|
||||
inline command_queue & cq() { return _cq; }
|
||||
/// Change the default command queue associated with matrix
|
||||
inline void cq(command_queue &cq_in) { _cq=cq_in; }
|
||||
#include "ucl_arg_kludge.h"
|
||||
|
||||
private:
|
||||
hipFunction_t _kernel;
|
||||
hipStream_t _cq;
|
||||
unsigned _dimensions;
|
||||
unsigned _num_blocks[3];
|
||||
unsigned _num_args;
|
||||
friend class UCL_Texture;
|
||||
|
||||
unsigned _block_size[3];
|
||||
std::vector<char> _hip_kernel_args;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
83
lib/gpu/geryon/hip_macros.h
Normal file
83
lib/gpu/geryon/hip_macros.h
Normal file
@ -0,0 +1,83 @@
|
||||
#ifndef HIP_MACROS_H
|
||||
#define HIP_MACROS_H
|
||||
|
||||
#include <cstdio>
|
||||
#include <cassert>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
//#if CUDA_VERSION >= 3020
|
||||
#define CUDA_INT_TYPE size_t
|
||||
//#else
|
||||
//#define CUDA_INT_TYPE unsigned
|
||||
//#endif
|
||||
|
||||
#ifdef MPI_GERYON
|
||||
#include "mpi.h"
|
||||
#define NVD_GERYON_EXIT do { \
|
||||
int is_final; \
|
||||
MPI_Finalized(&is_final); \
|
||||
if (!is_final) \
|
||||
MPI_Abort(MPI_COMM_WORLD,-1); \
|
||||
} while(0)
|
||||
#else
|
||||
#define NVD_GERYON_EXIT assert(0==1)
|
||||
#endif
|
||||
|
||||
#ifndef UCL_GERYON_EXIT
|
||||
#define UCL_GERYON_EXIT NVD_GERYON_EXIT
|
||||
#endif
|
||||
|
||||
#ifdef UCL_DEBUG
|
||||
#define UCL_SYNC_DEBUG
|
||||
#define UCL_DESTRUCT_CHECK
|
||||
#endif
|
||||
|
||||
#ifndef UCL_NO_API_CHECK
|
||||
|
||||
#define CU_SAFE_CALL_NS( call ) do { \
|
||||
hipError_t err = call; \
|
||||
if( hipSuccess != err) { \
|
||||
fprintf(stderr, "HIP runtime error %d in call at file '%s' in line %i.\n", \
|
||||
err, __FILE__, __LINE__ ); \
|
||||
NVD_GERYON_EXIT; \
|
||||
} } while (0)
|
||||
|
||||
#ifdef UCL_SYNC_DEBUG
|
||||
|
||||
#define CU_SAFE_CALL( call ) do { \
|
||||
CU_SAFE_CALL_NS( call ); \
|
||||
hipError_t err=hipCtxSynchronize(); \
|
||||
if( hipSuccess != err) { \
|
||||
fprintf(stderr, "HIP runtime error %d in file '%s' in line %i.\n", \
|
||||
err, __FILE__, __LINE__ ); \
|
||||
NVD_GERYON_EXIT; \
|
||||
} } while (0)
|
||||
|
||||
#else
|
||||
|
||||
#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )
|
||||
|
||||
#endif
|
||||
|
||||
#else // not DEBUG
|
||||
|
||||
// void macros for performance reasons
|
||||
#define CU_SAFE_CALL_NS( call ) call
|
||||
#define CU_SAFE_CALL( call) call
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef UCL_DESTRUCT_CHECK
|
||||
|
||||
#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
|
||||
#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)
|
||||
|
||||
#else
|
||||
|
||||
#define CU_DESTRUCT_CALL( call) call
|
||||
#define CU_DESTRUCT_CALL_NS( call) call
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
43
lib/gpu/geryon/hip_mat.h
Normal file
43
lib/gpu/geryon/hip_mat.h
Normal file
@ -0,0 +1,43 @@
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
/*! \file */
|
||||
|
||||
#ifndef HIP_MAT_H
|
||||
#define HIP_MAT_H
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "hip_memory.h"
|
||||
|
||||
/// Namespace for CUDA Driver routines
|
||||
namespace ucl_hip {
|
||||
|
||||
#define _UCL_MAT_ALLOW
|
||||
#define _UCL_DEVICE_PTR_MAT
|
||||
#include "ucl_basemat.h"
|
||||
#include "ucl_h_vec.h"
|
||||
#include "ucl_h_mat.h"
|
||||
#include "ucl_d_vec.h"
|
||||
#include "ucl_d_mat.h"
|
||||
#include "ucl_s_obj_help.h"
|
||||
#include "ucl_vector.h"
|
||||
#include "ucl_matrix.h"
|
||||
#undef _UCL_DEVICE_PTR_MAT
|
||||
#undef _UCL_MAT_ALLOW
|
||||
|
||||
#define UCL_COPY_ALLOW
|
||||
#include "ucl_copy.h"
|
||||
#undef UCL_COPY_ALLOW
|
||||
|
||||
#define UCL_PRINT_ALLOW
|
||||
#include "ucl_print.h"
|
||||
#undef UCL_PRINT_ALLOW
|
||||
|
||||
} // namespace ucl_cudadr
|
||||
|
||||
#endif
|
||||
279
lib/gpu/geryon/hip_memory.h
Normal file
279
lib/gpu/geryon/hip_memory.h
Normal file
@ -0,0 +1,279 @@
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef HIP_MEMORY_H
|
||||
#define HIP_MEMORY_H
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include "hip_macros.h"
|
||||
#include "hip_device.h"
|
||||
#include "ucl_types.h"
|
||||
|
||||
namespace ucl_hip {
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - API Specific Types
|
||||
// --------------------------------------------------------------------------
|
||||
//typedef dim3 ucl_kernel_dim;
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
typedef enum hipArray_Format {
|
||||
HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
|
||||
HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
|
||||
HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
|
||||
HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
|
||||
HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
|
||||
HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
|
||||
HIP_AD_FORMAT_HALF = 0x10,
|
||||
HIP_AD_FORMAT_FLOAT = 0x20
|
||||
}hipArray_Format;
|
||||
#endif
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - API SPECIFIC DEVICE POINTERS
|
||||
// --------------------------------------------------------------------------
|
||||
typedef hipDeviceptr_t device_ptr;
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - HOST MEMORY ALLOCATION ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
template <class mat_type, class copy_type>
|
||||
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
||||
hipError_t err=hipSuccess;
|
||||
if (kind==UCL_NOT_PINNED)
|
||||
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
||||
else if (kind==UCL_WRITE_ONLY)
|
||||
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
|
||||
else
|
||||
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
|
||||
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=cm.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
||||
hipError_t err=hipSuccess;
|
||||
if (kind==UCL_NOT_PINNED)
|
||||
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
||||
else if (kind==UCL_WRITE_ONLY)
|
||||
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
|
||||
else
|
||||
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
|
||||
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=dev.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline void _host_free(mat_type &mat) {
|
||||
if (mat.kind()==UCL_VIEW)
|
||||
return;
|
||||
else if (mat.kind()!=UCL_NOT_PINNED)
|
||||
CU_DESTRUCT_CALL(hipHostFree(mat.begin()));
|
||||
else
|
||||
free(mat.begin());
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _host_resize(mat_type &mat, const size_t n) {
|
||||
_host_free(mat);
|
||||
hipError_t err=hipSuccess;
|
||||
if (mat.kind()==UCL_NOT_PINNED)
|
||||
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
||||
else if (mat.kind()==UCL_WRITE_ONLY)
|
||||
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
|
||||
else
|
||||
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
|
||||
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - DEVICE MEMORY ALLOCATION ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
template <class mat_type, class copy_type>
|
||||
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
||||
const enum UCL_MEMOPT kind) {
|
||||
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
|
||||
if (err!=hipSuccess)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=cm.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
||||
const enum UCL_MEMOPT kind) {
|
||||
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
|
||||
if (err!=hipSuccess)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=dev.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
template <class mat_type, class copy_type>
|
||||
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
|
||||
const size_t cols, size_t &pitch,
|
||||
const enum UCL_MEMOPT kind) {
|
||||
hipError_t err;
|
||||
size_t upitch;
|
||||
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
|
||||
cols*sizeof(typename mat_type::data_type),rows);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
if (err!=hipSuccess)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=cm.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
template <class mat_type, class copy_type>
|
||||
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
|
||||
const size_t cols, size_t &pitch,
|
||||
const enum UCL_MEMOPT kind) {
|
||||
hipError_t err;
|
||||
size_t upitch;
|
||||
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
|
||||
cols*sizeof(typename mat_type::data_type),rows);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
if (err!=hipSuccess)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=d.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline void _device_free(mat_type &mat) {
|
||||
if (mat.kind()!=UCL_VIEW){
|
||||
CU_DESTRUCT_CALL(hipFree((void*)mat.cbegin()));
|
||||
}
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _device_resize(mat_type &mat, const size_t n) {
|
||||
_device_free(mat);
|
||||
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
|
||||
if (err!=hipSuccess)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _device_resize(mat_type &mat, const size_t rows,
|
||||
const size_t cols, size_t &pitch) {
|
||||
_device_free(mat);
|
||||
hipError_t err;
|
||||
size_t upitch;
|
||||
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
|
||||
cols*sizeof(typename mat_type::data_type),rows);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
if (err!=hipSuccess)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in) {
|
||||
*ptr=in;
|
||||
}
|
||||
|
||||
template <class numtyp>
|
||||
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in) {
|
||||
*ptr=0;
|
||||
}
|
||||
|
||||
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in,
|
||||
const size_t offset, const size_t numsize) {
|
||||
*ptr=(hipDeviceptr_t)(((char*)in)+offset*numsize);
|
||||
}
|
||||
|
||||
template <class numtyp>
|
||||
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in,
|
||||
const size_t offset, const size_t numsize) {
|
||||
*ptr=0;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - DEVICE IMAGE ALLOCATION ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
template <class mat_type, class copy_type>
|
||||
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
|
||||
const size_t cols) {
|
||||
assert(0==1);
|
||||
}
|
||||
|
||||
template <class mat_type, class copy_type>
|
||||
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
|
||||
const size_t cols) {
|
||||
assert(0==1);
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline void _device_image_free(mat_type &mat) {
|
||||
assert(0==1);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - ZERO ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
inline void _host_zero(void *ptr, const size_t n) {
|
||||
memset(ptr,0,n);
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
|
||||
CU_SAFE_CALL(hipMemsetAsync((void*)mat.cbegin(),0,n,cq));
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - MEMCPY ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
|
||||
template<class mat1, class mat2>
|
||||
hipMemcpyKind _memcpy_kind(mat1 &dst, const mat2 &src){
|
||||
assert(mat1::MEM_TYPE < 2 && mat2::MEM_TYPE < 2);
|
||||
return (hipMemcpyKind)((1 - mat2::MEM_TYPE)*2 + (1 - mat1::MEM_TYPE));
|
||||
}
|
||||
|
||||
template<class mat1, class mat2>
|
||||
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
|
||||
CU_SAFE_CALL(hipMemcpy((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src)));
|
||||
}
|
||||
|
||||
template<class mat1, class mat2>
|
||||
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n, hipStream_t &cq) {
|
||||
CU_SAFE_CALL(hipMemcpyAsync((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src), cq));
|
||||
}
|
||||
|
||||
template<class mat1, class mat2>
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
CU_SAFE_CALL(hipMemcpy2D((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src)));
|
||||
}
|
||||
|
||||
template<class mat1, class mat2>
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows,hipStream_t &cq) {
|
||||
CU_SAFE_CALL(hipMemcpy2DAsync((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src), cq));
|
||||
}
|
||||
|
||||
} // namespace ucl_cudart
|
||||
|
||||
#endif
|
||||
|
||||
113
lib/gpu/geryon/hip_texture.h
Normal file
113
lib/gpu/geryon/hip_texture.h
Normal file
@ -0,0 +1,113 @@
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef HIP_TEXTURE
|
||||
#define HIP_TEXTURE
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "hip_kernel.h"
|
||||
#include "hip_mat.h"
|
||||
|
||||
namespace ucl_hip {
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
inline hipError_t hipModuleGetTexRef(CUtexref* texRef, hipModule_t hmod, const char* name){
|
||||
return hipCUResultTohipError(cuModuleGetTexRef(texRef, hmod, name));
|
||||
}
|
||||
inline hipError_t hipTexRefSetFormat(CUtexref tex, hipArray_Format fmt, int NumPackedComponents) {
|
||||
return hipCUResultTohipError(cuTexRefSetFormat(tex, (CUarray_format)fmt, NumPackedComponents ));
|
||||
}
|
||||
inline hipError_t hipTexRefSetAddress(size_t* offset, CUtexref tex, hipDeviceptr_t devPtr, size_t size) {
|
||||
return hipCUResultTohipError(cuTexRefSetAddress(offset, tex, devPtr, size));
|
||||
}
|
||||
#endif
|
||||
|
||||
/// Class storing a texture reference
|
||||
class UCL_Texture {
|
||||
public:
|
||||
UCL_Texture() {}
|
||||
~UCL_Texture() {}
|
||||
/// Construct with a specified texture reference
|
||||
inline UCL_Texture(UCL_Program &prog, const char *texture_name)
|
||||
{ get_texture(prog,texture_name); }
|
||||
/// Set the texture reference for this object
|
||||
inline void get_texture(UCL_Program &prog, const char *texture_name)
|
||||
{
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
CU_SAFE_CALL(hipModuleGetTexRef(&_tex, prog._module, texture_name));
|
||||
#else
|
||||
size_t _global_var_size;
|
||||
CU_SAFE_CALL(hipModuleGetGlobal(&_device_ptr_to_global_var, &_global_var_size, prog._module, texture_name));
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp>
|
||||
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec,numel); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp>
|
||||
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec,numel); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp, class devtyp>
|
||||
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec.device,numel); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp, class devtyp>
|
||||
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec.device,numel); }
|
||||
|
||||
/// Unbind the texture reference from the memory allocation
|
||||
inline void unbind() { }
|
||||
|
||||
/// Make a texture reference available to kernel
|
||||
inline void allow(UCL_Kernel &kernel) {
|
||||
//#if CUDA_VERSION < 4000
|
||||
//CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
|
||||
//#endif
|
||||
}
|
||||
|
||||
private:
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
CUtexref _tex;
|
||||
#else
|
||||
void* _device_ptr_to_global_var;
|
||||
#endif
|
||||
friend class UCL_Kernel;
|
||||
|
||||
template<class mat_typ>
|
||||
inline void _bind_float(mat_typ &vec, const unsigned numel) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(numel!=0 && numel<5);
|
||||
#endif
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
if (vec.element_size()==sizeof(float))
|
||||
CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_FLOAT, numel));
|
||||
else {
|
||||
if (numel>2)
|
||||
CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_SIGNED_INT32, numel));
|
||||
else
|
||||
CU_SAFE_CALL(hipTexRefSetFormat(_tex,HIP_AD_FORMAT_SIGNED_INT32,numel*2));
|
||||
}
|
||||
CU_SAFE_CALL(hipTexRefSetAddress(NULL, _tex, vec.cbegin(), vec.numel()*vec.element_size()));
|
||||
#else
|
||||
void* data_ptr = (void*)vec.cbegin();
|
||||
CU_SAFE_CALL(hipMemcpyHtoD(hipDeviceptr_t(_device_ptr_to_global_var), &data_ptr, sizeof(void*)));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
107
lib/gpu/geryon/hip_timer.h
Normal file
107
lib/gpu/geryon/hip_timer.h
Normal file
@ -0,0 +1,107 @@
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef HIP_TIMER_H
|
||||
#define HIP_TIMER_H
|
||||
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "hip_macros.h"
|
||||
#include "hip_device.h"
|
||||
|
||||
namespace ucl_hip {
|
||||
|
||||
/// Class for timing CUDA Driver events
|
||||
class UCL_Timer {
|
||||
public:
|
||||
inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
|
||||
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
|
||||
{ init(dev); }
|
||||
|
||||
inline ~UCL_Timer() { clear(); }
|
||||
|
||||
/// Clear any data associated with timer
|
||||
/** \note init() must be called to reuse timer after a clear() **/
|
||||
inline void clear() {
|
||||
if (_initialized) {
|
||||
CU_DESTRUCT_CALL(hipEventDestroy(start_event));
|
||||
CU_DESTRUCT_CALL(hipEventDestroy(stop_event));
|
||||
_initialized=false;
|
||||
_total_time=0.0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize default command queue for timing
|
||||
inline void init(UCL_Device &dev) { init(dev, dev.cq()); }
|
||||
|
||||
/// Initialize command queue for timing
|
||||
inline void init(UCL_Device &dev, command_queue &cq) {
|
||||
clear();
|
||||
_cq=cq;
|
||||
_initialized=true;
|
||||
CU_SAFE_CALL( hipEventCreateWithFlags(&start_event,0) );
|
||||
CU_SAFE_CALL( hipEventCreateWithFlags(&stop_event,0) );
|
||||
}
|
||||
|
||||
/// Start timing on command queue
|
||||
inline void start() { CU_SAFE_CALL(hipEventRecord(start_event,_cq)); }
|
||||
|
||||
/// Stop timing on command queue
|
||||
inline void stop() { CU_SAFE_CALL(hipEventRecord(stop_event,_cq)); }
|
||||
|
||||
/// Block until the start event has been reached on device
|
||||
inline void sync_start()
|
||||
{ CU_SAFE_CALL(hipEventSynchronize(start_event)); }
|
||||
|
||||
/// Block until the stop event has been reached on device
|
||||
inline void sync_stop()
|
||||
{ CU_SAFE_CALL(hipEventSynchronize(stop_event)); }
|
||||
|
||||
/// Set the time elapsed to zero (not the total_time)
|
||||
inline void zero() {
|
||||
CU_SAFE_CALL(hipEventRecord(start_event,_cq));
|
||||
CU_SAFE_CALL(hipEventRecord(stop_event,_cq));
|
||||
}
|
||||
|
||||
/// Set the total time to zero
|
||||
inline void zero_total() { _total_time=0.0; }
|
||||
|
||||
/// Add time from previous start and stop to total
|
||||
/** Forces synchronization **/
|
||||
inline double add_to_total()
|
||||
{ double t=time(); _total_time+=t; return t/1000.0; }
|
||||
|
||||
/// Add a user specified time to the total (ms)
|
||||
inline void add_time_to_total(const double t) { _total_time+=t; }
|
||||
|
||||
/// Return the time (ms) of last start to stop - Forces synchronization
|
||||
inline double time() {
|
||||
float timer;
|
||||
CU_SAFE_CALL(hipEventSynchronize(stop_event));
|
||||
CU_SAFE_CALL( hipEventElapsedTime(&timer,start_event,stop_event) );
|
||||
return timer;
|
||||
}
|
||||
|
||||
/// Return the time (s) of last start to stop - Forces synchronization
|
||||
inline double seconds() { return time()/1000.0; }
|
||||
|
||||
/// Return the total time in ms
|
||||
inline double total_time() { return _total_time; }
|
||||
|
||||
/// Return the total time in seconds
|
||||
inline double total_seconds() { return _total_time/1000.0; }
|
||||
|
||||
private:
|
||||
hipEvent_t start_event, stop_event;
|
||||
hipStream_t _cq;
|
||||
double _total_time;
|
||||
bool _initialized;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
@ -244,7 +244,7 @@ class UCL_Kernel {
|
||||
template <class dtype>
|
||||
inline void add_arg(const dtype* const arg) {
|
||||
#if CUDA_VERSION >= 4000
|
||||
_kernel_args[_num_args]=const_cast<dtype * const>(arg);
|
||||
_kernel_args[_num_args]=const_cast<dtype *>(arg);
|
||||
#else
|
||||
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
|
||||
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
|
||||
|
||||
@ -36,6 +36,11 @@ using namespace ucl_cudadr;
|
||||
using namespace ucl_cudart;
|
||||
#endif
|
||||
|
||||
#ifdef UCL_HIP
|
||||
#include "hip_device.h"
|
||||
using namespace ucl_hip;
|
||||
#endif
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
UCL_Device cop;
|
||||
std::cout << "Found " << cop.num_platforms() << " platform(s).\n";
|
||||
|
||||
@ -179,13 +179,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
if (_eflag) {
|
||||
for (int i=0; i<_inum; i++)
|
||||
evdwl+=engv[i];
|
||||
if (_ef_atom)
|
||||
if (_ilist==NULL)
|
||||
if (_ef_atom) {
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++)
|
||||
eatom[i]+=engv[i];
|
||||
else
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++)
|
||||
eatom[_ilist[i]]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart=_inum;
|
||||
}
|
||||
if (_vflag) {
|
||||
@ -193,7 +195,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
for (int j=0; j<6; j++) {
|
||||
for (int i=vstart; i<iend; i++)
|
||||
virial[j]+=engv[i];
|
||||
if (_vf_atom)
|
||||
if (_vf_atom){
|
||||
if (_ilist==NULL) {
|
||||
int ii=0;
|
||||
for (int i=vstart; i<iend; i++)
|
||||
@ -203,6 +205,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
for (int i=vstart; i<iend; i++)
|
||||
vatom[_ilist[ii++]][j]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart+=_inum;
|
||||
iend+=_inum;
|
||||
}
|
||||
@ -228,7 +231,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
evdwl+=engv[i];
|
||||
for (int i=_inum; i<iend; i++)
|
||||
ecoul+=engv[i];
|
||||
if (_ef_atom)
|
||||
if (_ef_atom) {
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++)
|
||||
eatom[i]+=engv[i];
|
||||
@ -240,6 +243,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
for (int i=_inum, ii=0; i<iend; i++)
|
||||
eatom[_ilist[ii++]]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart=iend;
|
||||
iend+=_inum;
|
||||
}
|
||||
@ -247,7 +251,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
for (int j=0; j<6; j++) {
|
||||
for (int i=vstart; i<iend; i++)
|
||||
virial[j]+=engv[i];
|
||||
if (_vf_atom)
|
||||
if (_vf_atom) {
|
||||
if (_ilist==NULL) {
|
||||
for (int i=vstart, ii=0; i<iend; i++)
|
||||
vatom[ii++][j]+=engv[i];
|
||||
@ -255,6 +259,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
for (int i=vstart, ii=0; i<iend; i++)
|
||||
vatom[_ilist[ii++]][j]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart+=_inum;
|
||||
iend+=_inum;
|
||||
}
|
||||
|
||||
@ -27,6 +27,10 @@ using namespace ucl_opencl;
|
||||
#include "geryon/nvc_timer.h"
|
||||
#include "geryon/nvc_mat.h"
|
||||
using namespace ucl_cudart;
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_timer.h"
|
||||
#include "geryon/hip_mat.h"
|
||||
using namespace ucl_hip;
|
||||
#else
|
||||
#include "geryon/nvd_timer.h"
|
||||
#include "geryon/nvd_mat.h"
|
||||
|
||||
@ -15,6 +15,11 @@
|
||||
|
||||
#include "lal_atom.h"
|
||||
|
||||
#ifdef USE_HIP_DEVICE_SORT
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
#define AtomT Atom<numtyp,acctyp>
|
||||
|
||||
@ -70,6 +75,26 @@ bool AtomT::alloc(const int nall) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_HIP_DEVICE_SORT
|
||||
if (_gpu_nbor==1) {
|
||||
size_t temp_storage_bytes = 0;
|
||||
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
|
||||
return false;
|
||||
if(sort_out_size < _max_atoms){
|
||||
if (sort_out_keys ) hipFree(sort_out_keys);
|
||||
if (sort_out_values) hipFree(sort_out_values);
|
||||
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
|
||||
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
|
||||
sort_out_size = _max_atoms;
|
||||
}
|
||||
if(temp_storage_bytes > sort_temp_storage_size){
|
||||
if(sort_temp_storage) hipFree(sort_temp_storage);
|
||||
hipMalloc(&sort_temp_storage, temp_storage_bytes);
|
||||
sort_temp_storage_size = temp_storage_bytes;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// --------------------------- Device allocations
|
||||
int gpu_bytes=0;
|
||||
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
|
||||
@ -184,6 +209,27 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_HIP_DEVICE_SORT
|
||||
if (_gpu_nbor==1) {
|
||||
size_t temp_storage_bytes = 0;
|
||||
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
|
||||
return false;
|
||||
if(sort_out_size < _max_atoms){
|
||||
if (sort_out_keys ) hipFree(sort_out_keys);
|
||||
if (sort_out_values) hipFree(sort_out_values);
|
||||
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
|
||||
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
|
||||
sort_out_size = _max_atoms;
|
||||
}
|
||||
if(temp_storage_bytes > sort_temp_storage_size){
|
||||
if(sort_temp_storage) hipFree(sort_temp_storage);
|
||||
hipMalloc(&sort_temp_storage, temp_storage_bytes);
|
||||
sort_temp_storage_size = temp_storage_bytes;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
success=success && (dev_particle_id.alloc(_max_atoms,*dev,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=dev_particle_id.row_bytes();
|
||||
@ -275,6 +321,19 @@ void AtomT::clear_resize() {
|
||||
if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
|
||||
#endif
|
||||
|
||||
#ifdef USE_HIP_DEVICE_SORT
|
||||
if (_gpu_nbor==1) {
|
||||
if(sort_out_keys) hipFree(sort_out_keys);
|
||||
if(sort_out_values) hipFree(sort_out_values);
|
||||
if(sort_temp_storage) hipFree(sort_temp_storage);
|
||||
sort_out_keys = nullptr;
|
||||
sort_out_values = nullptr;
|
||||
sort_temp_storage = nullptr;
|
||||
sort_temp_storage_size = 0;
|
||||
sort_out_size = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (_gpu_nbor==2) {
|
||||
host_particle_id.clear();
|
||||
host_cell_id.clear();
|
||||
@ -326,6 +385,22 @@ void AtomT::sort_neighbor(const int num_atoms) {
|
||||
UCL_GERYON_EXIT;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_HIP_DEVICE_SORT
|
||||
if(sort_out_size < num_atoms){
|
||||
printf("AtomT::sort_neighbor: invalid temp buffer size\n");
|
||||
UCL_GERYON_EXIT;
|
||||
}
|
||||
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(sort_temp_storage, sort_temp_storage_size, (unsigned *)dev_cell_id.begin(), sort_out_keys, (int *)dev_particle_id.begin(), sort_out_values, num_atoms)){
|
||||
printf("AtomT::sort_neighbor: DeviceRadixSort error\n");
|
||||
UCL_GERYON_EXIT;
|
||||
}
|
||||
if(hipSuccess != hipMemcpy((unsigned *)dev_cell_id.begin(), sort_out_keys , num_atoms*sizeof(unsigned), hipMemcpyDeviceToDevice) ||
|
||||
hipSuccess != hipMemcpy((int *) dev_particle_id.begin(), sort_out_values, num_atoms*sizeof(int ), hipMemcpyDeviceToDevice)){
|
||||
printf("AtomT::sort_neighbor: copy output error\n");
|
||||
UCL_GERYON_EXIT;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef GPU_CAST
|
||||
|
||||
@ -11,9 +11,9 @@
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_preprocessor.h"
|
||||
#endif
|
||||
|
||||
|
||||
@ -29,6 +29,11 @@ using namespace ucl_opencl;
|
||||
#include "geryon/nvc_mat.h"
|
||||
#include "geryon/nvc_kernel.h"
|
||||
using namespace ucl_cudart;
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_timer.h"
|
||||
#include "geryon/hip_mat.h"
|
||||
#include "geryon/hip_kernel.h"
|
||||
using namespace ucl_hip;
|
||||
#else
|
||||
#include "geryon/nvd_timer.h"
|
||||
#include "geryon/nvd_mat.h"
|
||||
@ -477,6 +482,14 @@ class Atom {
|
||||
CUDPPConfiguration sort_config;
|
||||
CUDPPHandle sort_plan;
|
||||
#endif
|
||||
|
||||
#ifdef USE_HIP_DEVICE_SORT
|
||||
unsigned* sort_out_keys = nullptr;
|
||||
int* sort_out_values = nullptr;
|
||||
void* sort_temp_storage = nullptr;
|
||||
size_t sort_temp_storage_size = 0;
|
||||
size_t sort_out_size = 0;
|
||||
#endif
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -13,7 +13,7 @@
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_preprocessor.h"
|
||||
#endif
|
||||
|
||||
|
||||
@ -24,6 +24,8 @@
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -43,7 +45,7 @@ class BaseAtomic {
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -25,6 +25,8 @@
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -44,7 +46,7 @@ class BaseCharge {
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -23,6 +23,8 @@
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -42,7 +44,7 @@ class BaseDipole {
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -23,6 +23,8 @@
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -42,7 +44,7 @@ class BaseDPD {
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -24,6 +24,8 @@
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -44,7 +46,7 @@ class BaseEllipsoid {
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -24,6 +24,8 @@
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -46,7 +48,7 @@ class BaseThree {
|
||||
* \param k_three name for the kernel for 3-body force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,14 +11,14 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -32,7 +32,7 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,14 +11,14 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -32,7 +32,7 @@ class Born : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,18 @@
|
||||
//
|
||||
// begin : June 2018
|
||||
// email : ndactrung@gmail.com
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
@ -108,7 +109,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (rsq<cutsq_sigma[mtype].x) { // cutsq
|
||||
if (rsq<cutsq_sigma[mtype].x) { // cutsq
|
||||
numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
|
||||
|
||||
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
|
||||
@ -249,7 +250,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq_sigma[mtype].x) { // cutsq
|
||||
if (rsq<cutsq_sigma[mtype].x) { // cutsq
|
||||
numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
|
||||
|
||||
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
|
||||
|
||||
@ -32,7 +32,7 @@ class BornCoulLongCS : public BornCoulLong<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : ndactrung@gmail.com
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class BornCoulWolfCS : public BornCoulWolf<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,14 +11,14 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -32,7 +32,7 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,14 +11,14 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -32,7 +32,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : ndtrung@umich.edu
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : ndtrung@umich.edu
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin : 8/15/2012
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class CoulDSF : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin : July 2011
|
||||
// email : a.kohlmeyer@temple.edu
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin : June 2018
|
||||
// email : ndactrung@gmail.com
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class CoulLongCS : public CoulLong<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -268,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
gpu_nbor=1;
|
||||
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
#ifndef USE_CUDPP
|
||||
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
|
||||
if (gpu_nbor==1)
|
||||
gpu_nbor=2;
|
||||
#endif
|
||||
@ -341,7 +341,7 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
|
||||
gpu_nbor=1;
|
||||
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
#ifndef USE_CUDPP
|
||||
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
|
||||
if (gpu_nbor==1)
|
||||
gpu_nbor=2;
|
||||
#endif
|
||||
@ -712,7 +712,7 @@ int DeviceT::compile_kernels() {
|
||||
gpu_lib_data.update_host(false);
|
||||
|
||||
_ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
|
||||
#ifndef USE_OPENCL
|
||||
#if !(defined(USE_OPENCL) || defined(USE_HIP))
|
||||
if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch()))
|
||||
return -4;
|
||||
#endif
|
||||
|
||||
@ -13,7 +13,7 @@
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_preprocessor.h"
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,7 +41,7 @@ class Device {
|
||||
/** Sets up a per-device MPI communicator for load balancing and initializes
|
||||
* the device (>=first_gpu and <=last_gpu) that this proc will be using
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -2 if GPU not found
|
||||
* - -4 if GPU library not compiled for GPU
|
||||
* - -6 if GPU could not be initialized for use
|
||||
@ -62,7 +62,7 @@ class Device {
|
||||
* \param vel True if velocities need to be stored
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
@ -76,7 +76,7 @@ class Device {
|
||||
* \param nall Total number of local+ghost particles
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
@ -100,7 +100,7 @@ class Device {
|
||||
* \param threads_per_atom value to be used by the neighbor list only
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,18 +11,18 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
texture<float4> mu_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
_texture( mu_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
texture<int4,1> mu_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
_texture_2d( mu_tex,int4);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class DipoleLJ : public BaseDipole<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,19 +11,19 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
texture<float4> mu_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
_texture( mu_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
texture<int4,1> mu_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
_texture_2d( mu_tex,int4);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class DipoleLJSF : public BaseDipole<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,18 +11,18 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
texture<float4> mu_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
_texture( mu_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
texture<int4,1> mu_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
_texture_2d( mu_tex,int4);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class DipoleLongLJ : public BaseDipole<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,16 +11,16 @@
|
||||
//
|
||||
// begin : Jan 15, 2014
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float4> vel_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( vel_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int4,1> vel_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture_2d( vel_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -32,7 +32,7 @@ class DPD : public BaseDPD<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,29 +11,29 @@
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> fp_tex;
|
||||
texture<float4> rhor_sp1_tex;
|
||||
texture<float4> rhor_sp2_tex;
|
||||
texture<float4> frho_sp1_tex;
|
||||
texture<float4> frho_sp2_tex;
|
||||
texture<float4> z2r_sp1_tex;
|
||||
texture<float4> z2r_sp2_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( fp_tex,float);
|
||||
_texture( rhor_sp1_tex,float4);
|
||||
_texture( rhor_sp2_tex,float4);
|
||||
_texture( frho_sp1_tex,float4);
|
||||
_texture( frho_sp2_tex,float4);
|
||||
_texture( z2r_sp1_tex,float4);
|
||||
_texture( z2r_sp2_tex,float4);
|
||||
#else
|
||||
texture<int4> pos_tex;
|
||||
texture<int2> fp_tex;
|
||||
texture<int4> rhor_sp1_tex;
|
||||
texture<int4> rhor_sp2_tex;
|
||||
texture<int4> frho_sp1_tex;
|
||||
texture<int4> frho_sp2_tex;
|
||||
texture<int4> z2r_sp1_tex;
|
||||
texture<int4> z2r_sp2_tex;
|
||||
_texture( pos_tex,int4);
|
||||
_texture( fp_tex,int2);
|
||||
_texture( rhor_sp1_tex,int4);
|
||||
_texture( rhor_sp2_tex,int4);
|
||||
_texture( frho_sp1_tex,int4);
|
||||
_texture( frho_sp2_tex,int4);
|
||||
_texture( z2r_sp1_tex,int4);
|
||||
_texture( z2r_sp2_tex,int4);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -33,7 +33,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -18,12 +18,14 @@
|
||||
|
||||
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex, quat_tex;
|
||||
_texture( pos_tex, float4);
|
||||
_texture( quat_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex, quat_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture_2d( quat_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -9,16 +9,15 @@
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_preprocessor.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
@ -53,8 +52,8 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
|
||||
int itype=fast_mul(iw,ntypes);
|
||||
int newj=0;
|
||||
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
||||
int j=dev_ij[nbor];
|
||||
j &= NEIGHMASK;
|
||||
int sj=dev_ij[nbor];
|
||||
int j = sj & NEIGHMASK;
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
int mtype=itype+jtype;
|
||||
@ -69,7 +68,7 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
|
||||
rsq+=t*t;
|
||||
|
||||
if (rsq<cf.x) {
|
||||
dev_nbor[packed]=j;
|
||||
dev_nbor[packed]=sj;
|
||||
packed+=nbor_pitch;
|
||||
newj++;
|
||||
}
|
||||
@ -117,8 +116,8 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
int newj=0;
|
||||
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
||||
int j=dev_ij[nbor];
|
||||
j &= NEIGHMASK;
|
||||
int sj=dev_ij[nbor];
|
||||
int j = sj & NEIGHMASK;
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
int mtype=itype+jtype;
|
||||
@ -133,7 +132,7 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
|
||||
rsq+=t*t;
|
||||
|
||||
if (rsq<cutsq[mtype]) {
|
||||
dev_nbor[packed]=j;
|
||||
dev_nbor[packed]=sj;
|
||||
packed+=nbor_pitch;
|
||||
newj++;
|
||||
}
|
||||
|
||||
@ -11,14 +11,14 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -32,7 +32,7 @@ class Gauss : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,9 +11,9 @@
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_ellipsoid_extra.h"
|
||||
#endif
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ class GayBerne : public BaseEllipsoid<numtyp, acctyp> {
|
||||
* \return false if there is not sufficient memory or device init prob
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,9 +11,9 @@
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_ellipsoid_extra.h"
|
||||
#endif
|
||||
|
||||
|
||||
@ -11,14 +11,14 @@
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -32,7 +32,7 @@ class LJ : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,14 +11,14 @@
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -32,7 +32,7 @@ class LJ96 : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin : Mon May 16 2011
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class LJClass2Long : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class LJCoul : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class LJCoulDebye : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class LJCoulLong : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,21 +11,21 @@
|
||||
//
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
texture<float> gcons_tex;
|
||||
texture<float> dgcons_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
_texture( gcons_tex,float);
|
||||
_texture( dgcons_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
texture<int2> gcons_tex;
|
||||
texture<int2> dgcons_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
_texture( gcons_tex,int2);
|
||||
_texture( dgcons_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class LJCoulMSM : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,14 +11,14 @@
|
||||
//
|
||||
// begin :
|
||||
// email : ndactrung@gmail.com
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
|
||||
@ -32,7 +32,7 @@ class LJCubic : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,17 +11,17 @@
|
||||
//
|
||||
// begin : 7/12/2012
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
_texture( pos_tex,float4);
|
||||
_texture( q_tex,float);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
_texture( q_tex,int2);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class LJDSF : public BaseCharge<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
@ -11,15 +11,15 @@
|
||||
//
|
||||
// begin :
|
||||
// email : ibains@nvidia.com
|
||||
// ***************************************************************************/
|
||||
// ***************************************************************************
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
_texture( pos_tex,float4);
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
_texture_2d( pos_tex,int4);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -32,7 +32,7 @@ class LJExpand : public BaseAtomic<numtyp, acctyp> {
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user