Merge remote-tracking branch 'origin/master'

This commit is contained in:
Vsevak
2020-06-15 18:38:40 +03:00
5526 changed files with 479177 additions and 2216677 deletions

1
lib/gpu/.gitignore vendored

@ -2,6 +2,7 @@
/obj_ocl
/ocl_get_devices
/nvc_get_devices
/hip_get_devices
/*.cubin
/*_cubin.h
/*_cl.h

148
lib/gpu/Makefile.hip Normal file

@ -0,0 +1,148 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for HIP
# - export HIP_PLATFORM=hcc (or nvcc) before execution
# - change HIP_ARCH for your GPU
# ------------------------------------------------------------------------- */
# this setting should match the LAMMPS Makefile
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
LMP_INC = -DLAMMPS_SMALLBIG
# precision for GPU calculations
# -D_SINGLE_SINGLE # Single precision for all calculations
# -D_DOUBLE_DOUBLE # Double precision for all calculations
# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
HIP_PRECISION = -D_SINGLE_DOUBLE
HIP_OPTS = -O3
HIP_HOST_OPTS = -Wno-deprecated-declarations
HIP_HOST_INCLUDE =
# use device sort
# requires linking with hipcc and hipCUB, plus rocPRIM (for AMD) or CUB (for Nvidia)
HIP_HOST_OPTS += -DUSE_HIP_DEVICE_SORT
# path to cub
HIP_HOST_INCLUDE += -I./
# path to hipcub
HIP_HOST_INCLUDE += -I$(HIP_PATH)/../include
# use mpi
HIP_HOST_OPTS += -DMPI_GERYON -DUCL_NO_EXIT
# these settings should match the LAMMPS Makefile
MPI_COMP_OPTS = $(shell mpicxx --showme:compile)
MPI_LINK_OPTS = $(shell mpicxx --showme:link)
#MPI_COMP_OPTS += -I/usr/include/mpi -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1
HIP_PATH ?= $(wildcard /opt/rocm/hip)
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
ifeq (hcc,$(HIP_PLATFORM))
HIP_OPTS += -ffast-math
# possible values: gfx803,gfx900,gfx906
HIP_ARCH = gfx906
else ifeq (nvcc,$(HIP_PLATFORM))
HIP_OPTS += --use_fast_math
HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \
-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] -gencode arch=compute_53,code=[sm_53,compute_53]\
-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] -gencode arch=compute_62,code=[sm_62,compute_62]\
-gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_72,code=[sm_72,compute_72] -gencode arch=compute_75,code=[sm_75,compute_75]
else
$(error Specify HIP platform using 'export HIP_PLATFORM=(hcc,nvcc)')
endif
BIN_DIR = .
OBJ_DIR = ./obj
LIB_DIR = .
AR = ar
BSH = /bin/sh
# /* ----------------------------------------------------------------------
# don't change the section below unless necessary
# ------------------------------------------------------------------------- */
HIP_OPTS += -DUSE_HIP $(HIP_PRECISION)
HIP_GPU_OPTS += $(HIP_OPTS) -I./
ifeq (hcc,$(HIP_PLATFORM))
HIP_HOST_OPTS += -fPIC
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --genco
HIP_GPU_OPTS_S = -t="$(HIP_ARCH)" -f=\"
HIP_GPU_OPTS_E = \"
HIP_KERNEL_SUFFIX = .cpp
HIP_LIBS_TARGET = export HCC_AMDGPU_TARGET := $(HIP_ARCH)
export HCC_AMDGPU_TARGET := $(HIP_ARCH)
else ifeq (nvcc,$(HIP_PLATFORM))
HIP_GPU_CC = $(HIP_PATH)/bin/hipcc --fatbin
HIP_GPU_OPTS += $(HIP_ARCH)
HIP_GPU_SORT_ARCH = $(HIP_ARCH)
# workaround: nvcc cannot handle the -pthread flag
MPI_COMP_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_COMP_OPTS))
MPI_LINK_OPTS := $(subst -pthread,-Xcompiler -pthread,$(MPI_LINK_OPTS))
endif
# hipcc is required for device sort, because hipcub is a header-only library and ROCm GPU code generation is deferred to the linking stage
HIP_HOST_CC = $(HIP_PATH)/bin/hipcc
HIP_HOST_OPTS += $(HIP_OPTS) $(MPI_COMP_OPTS) $(LMP_INC)
HIP_HOST_CC_CMD = $(HIP_HOST_CC) $(HIP_HOST_OPTS) $(HIP_HOST_INCLUDE)
# sources
ALL_H = $(wildcard ./geryon/ucl*.h) $(wildcard ./geryon/hip*.h) $(wildcard ./lal_*.h)
SRCS := $(wildcard ./lal_*.cpp)
OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
CUS := $(wildcard lal_*.cu)
CUHS := $(filter-out pppm_cubin.h, $(CUS:lal_%.cu=%_cubin.h)) pppm_f_cubin.h pppm_d_cubin.h
CUHS := $(addprefix $(OBJ_DIR)/, $(CUHS))
all: $(OBJ_DIR) $(CUHS) $(LIB_DIR)/libgpu.a $(BIN_DIR)/hip_get_devices
$(OBJ_DIR):
mkdir -p $@
# GPU kernels compilation
$(OBJ_DIR)/pppm_f_cubin.h: lal_pppm.cu $(ALL_H)
@cp $< $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=float -Dgrdtyp4=float4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX)
@xxd -i $(OBJ_DIR)/pppm_f.cubin $@
@sed -i "s/[a-zA-Z0-9_]*pppm_f_cubin/pppm_f/g" $@
@rm $(OBJ_DIR)/temp_pppm_f.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_f.cubin
$(OBJ_DIR)/pppm_d_cubin.h: lal_pppm.cu $(ALL_H)
@cp $< $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) -Dgrdtyp=double -Dgrdtyp4=double4 $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX)
@xxd -i $(OBJ_DIR)/pppm_d.cubin $@
@sed -i "s/[a-zA-Z0-9_]*pppm_d_cubin/pppm_d/g" $@
@rm $(OBJ_DIR)/temp_pppm_d.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/pppm_d.cubin
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
@cp $< $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
$(HIP_GPU_CC) $(HIP_GPU_OPTS_S) $(HIP_GPU_OPTS) $(HIP_GPU_OPTS_E) -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX)
@xxd -i $(OBJ_DIR)/$*.cubin $@
@sed -i "s/[a-zA-Z0-9_]*$*_cubin/$*/g" $@
@rm $(OBJ_DIR)/temp_$*.cu$(HIP_KERNEL_SUFFIX) $(OBJ_DIR)/$*.cubin
# host sources compilation
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp $(CUHS) $(ALL_H)
$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR) $(HIP_GPU_SORT_ARCH)
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
$(HIP_HOST_CC_CMD) -o $@ -c $< -I$(OBJ_DIR)
# libgpu building
$(LIB_DIR)/libgpu.a: $(OBJS)
$(AR) -crs $@ $(OBJS)
printf "export HIP_PLATFORM := %s\n%s\n" "$(HIP_PLATFORM)" "$(HIP_LIBS_TARGET)" > Makefile.lammps
# test app building
$(BIN_DIR)/hip_get_devices: ./geryon/ucl_get_devices.cpp $(ALL_H)
$(HIP_HOST_CC_CMD) -o $@ $< -DUCL_HIP $(MPI_LINK_OPTS)
clean:
-rm -f $(BIN_DIR)/hip_get_devices $(LIB_DIR)/libgpu.a $(OBJS) $(OBJ_DIR)/temp_* $(CUHS)


@ -52,7 +52,7 @@ user-gpu_SYSLIB = CUDA libraries needed by this package
user-gpu_SYSPATH = path(s) to where those libraries are
Because you have the CUDA compilers on your system, you should have
the needed libraries. If the CUDA developement tools were installed
the needed libraries. If the CUDA development tools were installed
in the standard manner, the settings in the Makefile.lammps.standard
file should work.
@ -67,8 +67,8 @@ library requires installing the CUDA GPU driver and CUDA toolkit for
your operating system. Installation of the CUDA SDK is not necessary.
In addition to the LAMMPS library, the binary nvc_get_devices will also
be built. This can be used to query the names and properties of GPU
devices on your system. A Makefile for OpenCL compilation is provided,
but support for OpenCL use is not currently provided by the developers.
devices on your system. A Makefile for OpenCL and ROCm HIP compilation
is provided, but support for it is not currently provided by the developers.
Details of the implementation are provided in:
----
@ -169,6 +169,25 @@ NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
src/MAKE/Makefile.foo) should be consistent with that specified
when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).
BUILDING FOR HIP FRAMEWORK
--------------------------------
1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm).
2. GPU sorting requires installing hipcub
(https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA backend
additionally requires cub (https://nvlabs.github.io/cub). Download and
extract the cub directory to lammps/lib/gpu/ or specify an appropriate
path in lammps/lib/gpu/Makefile.hip.
3. In Makefile.hip it is possible to specify the target platform via
export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
architecture (gfx803, gfx900, gfx906 etc.)
4. If your MPI implementation does not support the `mpicxx --showme` command,
specify the corresponding MPI compiler and linker flags
in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
5. Building the GPU library (libgpu.a):
cd lammps/lib/gpu; make -f Makefile.hip -j
6. Building the LAMMPS executable (lmp_hip):
cd ../../src; make hip -j
EXAMPLE CONVENTIONAL BUILD PROCESS
--------------------------------


@ -27,7 +27,7 @@
* defined in cudpp.h. Public interface functions call functions in the
* \link cudpp_app Application-Level\endlink interface. The public
* interface functions include Plan Interface functions and Algorithm
* Interface functions. Plan Inteface functions are used for creating
* Interface functions. Plan Interface functions are used for creating
* CUDPP Plan objects which contain configuration details, intermediate
* storage space, and in the case of cudppSparseMatrix(), data. The
* Algorithm Interface is the set of functions that do the real work


@ -8,16 +8,7 @@
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_maximal_launch.h"
inline size_t min(size_t x, size_t y)
{
return (x <= y) ? x : y;
}
inline size_t max(size_t x, size_t y)
{
return (x >= y) ? x : y;
}
#include <algorithm>
// computes next highest multiple of f from x
inline size_t multiple(size_t x, size_t f)
@ -65,7 +56,7 @@ size_t maxBlocks(cudaFuncAttributes &attribs,
size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;
return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
return devprop.multiProcessorCount * std::min(ctaLimitRegs, std::min(ctaLimitSMem, std::min(ctaLimitThreads, (size_t)maxBlocksPerSM)));
}
extern "C"
@ -80,15 +71,15 @@ size_t maxBlocksFromPointer(void* kernel,
{
err = cudaGetDeviceProperties(&devprop, deviceID);
if (err != cudaSuccess)
return -1;
return (size_t)-1;
cudaFuncAttributes attr;
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
if (err != cudaSuccess)
return -1;
return (size_t)-1;
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
}
return -1;
return (size_t)-1;
}


@ -367,7 +367,7 @@ extern "C" {
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized withing Cutil then cutFree() has to be used to
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
@ -382,7 +382,7 @@ extern "C" {
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized withing Cutil then cutFree() has to be used to
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
@ -466,7 +466,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are seperated by '='; example --samples=50
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////


@ -46,7 +46,7 @@
* the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke
* with a thread block grid with height greater than 1.
*
* This function peforms one level of a recursive, multiblock scan. At the
* This function performs one level of a recursive, multiblock scan. At the
* app level, this function is called by cudppScan and cudppMultiScan and used
* in combination with vectorAddUniform4() to produce a complete scan.
*

519
lib/gpu/geryon/hip_device.h Normal file

@ -0,0 +1,519 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_DEVICE
#define HIP_DEVICE
#include <hip/hip_runtime.h>
#include <unordered_map>
#include <string>
#include <vector>
#include <iostream>
#include "hip_macros.h"
#include "ucl_types.h"
namespace ucl_hip {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef hipStream_t command_queue;
inline void ucl_sync(hipStream_t &stream) {
CU_SAFE_CALL(hipStreamSynchronize(stream));
}
struct NVDProperties {
int device_id;
std::string name;
int major;
int minor;
CUDA_INT_TYPE totalGlobalMem;
int multiProcessorCount;
int maxThreadsPerBlock;
int maxThreadsDim[3];
int maxGridSize[3];
int sharedMemPerBlock;
int totalConstantMemory;
int SIMDWidth;
int memPitch;
int regsPerBlock;
int clockRate;
int textureAlign;
int kernelExecTimeoutEnabled;
int integrated;
int canMapHostMemory;
int concurrentKernels;
int ECCEnabled;
int computeMode;
};
/// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect
* behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
inline UCL_Device();
inline ~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
inline std::string platform_name()
{ return "HIP platform"; }
/// Delete any contexts/data and set the platform number to be used
inline int set_platform(const int pid);
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
/// Set the CUDA device to the specified device number
/** A context and default command queue will be created for the device
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
* be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/
inline int set(int num);
/// Delete any context and associated data stored from a call to set()
inline void clear();
/// Get the current device number
inline int device_num() { return _device; }
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ return _cq.size(); }
/// Add a stream for device computations
inline void push_command_queue() {
_cq.push_back(hipStream_t());
CU_SAFE_CALL(hipStreamCreateWithFlags(&_cq.back(),0));
}
/// Remove a stream for device computations
/** \note You cannot delete the default stream **/
inline void pop_command_queue() {
if (_cq.size()<2) return;
CU_SAFE_CALL_NS(hipStreamDestroy(_cq.back()));
_cq.pop_back();
}
/// Set the default command queue (by default this is the null stream)
/** \param i index of the command queue (as added by push_command_queue())
If i is 0, the default command queue is set to the null stream **/
inline void set_command_queue(const int i) {
if (i==0) _cq[0]=0;
else _cq[0]=_cq[i];
}
/// Get the current CUDA device name
inline std::string name() { return name(_device); }
/// Get the CUDA device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i) { return "GPU"; }
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
inline bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); }
/// Get the number of compute units
inline unsigned cus(const int i)
{ return _properties[i].multiProcessorCount; }
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
else return _properties[i].multiProcessorCount*192; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
// Get the gigabytes of free memory in the current device
inline double free_gigabytes() { return free_gigabytes(_device); }
// Get the gigabytes of free memory
inline double free_gigabytes(const int i)
{ return static_cast<double>(free_bytes(i))/1073741824; }
// Get the bytes of free memory in the current device
inline size_t free_bytes() { return free_bytes(_device); }
// Get the bytes of free memory
inline size_t free_bytes(const int i) {
CUDA_INT_TYPE dfree, dtotal;
CU_SAFE_CALL_NS(hipMemGetInfo(&dfree, &dtotal));
return static_cast<size_t>(dfree);
}
/// Return the GPGPU compute capability for current device
inline double arch() { return arch(_device); }
/// Return the GPGPU compute capability
inline double arch(const int i)
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i)
{ return _properties[i].clockRate*1e-6;}
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].maxThreadsPerBlock; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return _properties[i].memPitch; }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported() { return sharing_supported(_device); }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported(const int i)
{ return (_properties[i].computeMode == hipComputeModeDefault); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal()
{ return fission_equal(_device); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal(const int i)
{ return false; }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts()
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
{ return false; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity(const int i)
{ return false; }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices()
{ return max_sub_devices(_device); }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices(const int i)
{ return 0; }
/// List all devices along with all properties
inline void print_all(std::ostream &out);
/// Select the platform that has accelerators (for compatibility with OpenCL)
inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
inline int load_module(const void* program, hipModule_t& module, std::string *log=NULL){
auto it = _loaded_modules.emplace(program, hipModule_t());
if(!it.second){
module = it.first->second;
return UCL_SUCCESS;
}
const unsigned int num_opts=2;
hipJitOption options[num_opts];
void *values[num_opts];
// set up size of compilation log buffer
options[0] = hipJitOptionInfoLogBufferSizeBytes;
values[0] = (void *)(int)10240;
// set up pointer to the compilation log buffer
options[1] = hipJitOptionInfoLogBuffer;
char clog[10240] = { 0 };
values[1] = clog;
hipError_t err=hipModuleLoadDataEx(&module,program,num_opts, options,(void **)values);
if (log!=NULL)
*log=std::string(clog);
if (err != hipSuccess) {
#ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling PTX Program...\n"
<< "----------------------------------------------------------\n";
std::cerr << clog << std::endl;
#endif
_loaded_modules.erase(it.first);
return UCL_COMPILE_ERROR;
}
it.first->second = module;
return UCL_SUCCESS;
}
private:
std::unordered_map<const void*, hipModule_t> _loaded_modules;
int _device, _num_devices;
std::vector<NVDProperties> _properties;
std::vector<hipStream_t> _cq;
hipDevice_t _cu_device;
};
// Grabs the properties for all devices
UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(hipInit(0));
CU_SAFE_CALL_NS(hipGetDeviceCount(&_num_devices));
for (int i=0; i<_num_devices; ++i) {
hipDevice_t dev;
CU_SAFE_CALL_NS(hipDeviceGet(&dev,i));
int major, minor;
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, dev));
if (major==9999)
continue;
NVDProperties prop;
prop.device_id = i;
prop.major=major;
prop.minor=minor;
char namecstr[1024];
CU_SAFE_CALL_NS(hipDeviceGetName(namecstr,1024,dev));
prop.name=namecstr;
CU_SAFE_CALL_NS(hipDeviceTotalMem(&prop.totalGlobalMem,dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.multiProcessorCount, hipDeviceAttributeMultiprocessorCount, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsPerBlock, hipDeviceAttributeMaxThreadsPerBlock, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[0], hipDeviceAttributeMaxBlockDimX, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[1], hipDeviceAttributeMaxBlockDimY, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxThreadsDim[2], hipDeviceAttributeMaxBlockDimZ, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[0], hipDeviceAttributeMaxGridDimX, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[1], hipDeviceAttributeMaxGridDimY, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.maxGridSize[2], hipDeviceAttributeMaxGridDimZ, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.sharedMemPerBlock, hipDeviceAttributeMaxSharedMemoryPerBlock, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.totalConstantMemory, hipDeviceAttributeTotalConstantMemory, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.SIMDWidth, hipDeviceAttributeWarpSize, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.regsPerBlock, hipDeviceAttributeMaxRegistersPerBlock, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.clockRate, hipDeviceAttributeClockRate, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
//#if CUDA_VERSION >= 2020
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.computeMode, hipDeviceAttributeComputeMode,dev));
//#endif
//#if CUDA_VERSION >= 3010
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.concurrentKernels, hipDeviceAttributeConcurrentKernels, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
//#endif
_properties.push_back(prop);
}
_device=-1;
_cq.push_back(hipStream_t());
_cq.back()=0;
}
UCL_Device::~UCL_Device() {
clear();
}
int UCL_Device::set_platform(const int pid) {
clear();
#ifdef UCL_DEBUG
assert(pid<num_platforms());
#endif
return UCL_SUCCESS;
}
// Set the CUDA device to the specified device number
int UCL_Device::set(int num) {
clear();
_device=_properties[num].device_id;
hipError_t err=hipDeviceGet(&_cu_device,_device);
if (err!=hipSuccess) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not access accelerator number " << num
<< " for use.\n";
UCL_GERYON_EXIT;
#endif
return UCL_ERROR;
}
//hipError_t err=hipCtxCreate(&_context,0,_cu_device); deprecated and unnecessary
err=hipSetDevice(_device);
if (err!=hipSuccess) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not set accelerator number " << num
<< " for use.\n";
UCL_GERYON_EXIT;
#endif
return UCL_ERROR;
}
return UCL_SUCCESS;
}
void UCL_Device::clear() {
if (_device>-1) {
for (int i=1; i<num_queues(); i++) pop_command_queue();
CU_SAFE_CALL_NS(hipDeviceReset());
}
_device=-1;
}
// List all devices along with all properties
void UCL_Device::print_all(std::ostream &out) {
//#if CUDA_VERSION >= 2020
int driver_version;
hipDriverGetVersion(&driver_version);
out << "Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
//#endif
if (num_devices() == 0)
out << "There is no device supporting HIP\n";
for (int i=0; i<num_devices(); ++i) {
out << "\nDevice " << i << ": \"" << name(i) << "\"\n";
out << " Type of device: "
<< device_type_name(i).c_str() << std::endl;
out << " Compute capability: "
<< arch(i) << std::endl;
out << " Double precision support: ";
if (double_precision(i))
out << "Yes\n";
else
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
//#if CUDA_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
//#endif
out << " Total amount of constant memory: "
<< _properties[i].totalConstantMemory << " bytes\n";
out << " Total amount of local/shared memory per block: "
<< _properties[i].sharedMemPerBlock << " bytes\n";
out << " Total number of registers available per block: "
<< _properties[i].regsPerBlock << std::endl;
out << " Warp size: "
<< _properties[i].SIMDWidth << std::endl;
out << " Maximum number of threads per block: "
<< _properties[i].maxThreadsPerBlock << std::endl;
out << " Maximum group size (# of threads per block) "
<< _properties[i].maxThreadsDim[0] << " x "
<< _properties[i].maxThreadsDim[1] << " x "
<< _properties[i].maxThreadsDim[2] << std::endl;
out << " Maximum item sizes (# threads for each dim) "
<< _properties[i].maxGridSize[0] << " x "
<< _properties[i].maxGridSize[1] << " x "
<< _properties[i].maxGridSize[2] << std::endl;
//out << " Maximum memory pitch: "
// << max_pitch(i) << " bytes\n";
//out << " Texture alignment: "
// << _properties[i].textureAlign << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
//#if CUDA_VERSION >= 2020
//out << " Run time limit on kernels: ";
//if (_properties[i].kernelExecTimeoutEnabled)
// out << "Yes\n";
//else
// out << "No\n";
out << " Integrated: ";
if (_properties[i].integrated)
out << "Yes\n";
else
out << "No\n";
//out << " Support host page-locked memory mapping: ";
//if (_properties[i].canMapHostMemory)
// out << "Yes\n";
//else
// out << "No\n";
out << " Compute mode: ";
if (_properties[i].computeMode == hipComputeModeDefault)
out << "Default\n"; // multiple threads can use device
//#if CUDA_VERSION >= 8000
// else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
//#else
else if (_properties[i].computeMode == hipComputeModeExclusive)
//#endif
out << "Exclusive\n"; // only thread can use device
else if (_properties[i].computeMode == hipComputeModeProhibited)
out << "Prohibited\n"; // no thread can use device
//#if CUDART_VERSION >= 4000
else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
out << "Exclusive Process\n"; // multiple threads 1 process
//#endif
else
out << "Unknown\n";
//#endif
//#if CUDA_VERSION >= 3010
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
else
out << "No\n";
//out << " Device has ECC support enabled: ";
//if (_properties[i].ECCEnabled)
// out << "Yes\n";
//else
// out << "No\n";
//#endif
}
}
}
#endif
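
The UCL_Device class above hides HIP device enumeration behind the same interface the OpenCL and CUDA geryon back ends expose. Below is a minimal usage sketch, mirroring what geryon/ucl_get_devices.cpp does for the hip_get_devices binary built by Makefile.hip; the snippet itself is illustrative and not part of this commit.

#include "hip_device.h"
#include <iostream>

int main() {
  ucl_hip::UCL_Device dev;                 // collects properties of all HIP devices
  std::cout << "Found " << dev.num_devices() << " device(s).\n";
  for (int i = 0; i < dev.num_devices(); ++i)
    std::cout << i << ": " << dev.name(i) << "  cc " << dev.arch(i)
              << "  " << dev.gigabytes(i) << " GB\n";
  if (dev.num_devices() > 0) {
    dev.set(0);                            // make device 0 current; creates the default stream
    dev.print_all(std::cout);              // full property dump
  }
  return 0;
}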

298
lib/gpu/geryon/hip_kernel.h Normal file

@ -0,0 +1,298 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_KERNEL
#define HIP_KERNEL
#include <hip/hip_runtime.h>
#include "hip_device.h"
#include <fstream>
#include <string>
#include <iostream>
namespace ucl_hip {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
UCL_Device* _device_ptr;
public:
inline UCL_Program(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
inline UCL_Program(UCL_Device &device, const void *program,
const char *flags="", std::string *log=NULL) {
_device_ptr = &device; _cq=device.cq();
init(device);
load_string(program,flags,log);
}
inline ~UCL_Program() {}
/// Initialize the program with a device
inline void init(UCL_Device &device) { _device_ptr = &device; _cq=device.cq(); }
/// Clear any data associated with program
/** \note Must call init() after each clear **/
inline void clear() { }
/// Load a program from a file and compile with flags
inline int load(const char *filename, const char *flags="", std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
return UCL_FILE_NOT_FOUND;
}
std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
in.close();
return load_string(program.c_str(),flags,log);
}
/// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="", std::string *log=NULL) {
return _device_ptr->load_module(program, _module, log);
}
friend class UCL_Kernel;
private:
hipModule_t _module;
hipStream_t _cq;
friend class UCL_Texture;
};
/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _num_args(0) {
_num_blocks[0]=0;
}
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0) {
_num_blocks[0]=0;
set_function(program,function);
_cq=program._cq;
}
~UCL_Kernel() {}
/// Clear any function associated with the kernel
inline void clear() { }
/// Get the kernel function from a program
/** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
inline int set_function(UCL_Program &program, const char *function) {
hipError_t err=hipModuleGetFunction(&_kernel,program._module,function);
if (err!=hipSuccess) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not find function: " << function
<< " in program.\n";
UCL_GERYON_EXIT;
#endif
return UCL_FUNCTION_NOT_FOUND;
}
_cq=program._cq;
return UCL_SUCCESS;
}
/// Set the kernel argument.
/** If not a device pointer, this must be repeated each time the argument
* changes
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
template <class dtype>
inline void set_arg(const unsigned index, const dtype * const arg) {
if (index==_num_args)
add_arg(arg);
else if (index<_num_args){
assert(0==1); // not implemented
}
else
assert(0==1); // Must add kernel parameters in sequential order
}
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
inline void add_arg(const hipDeviceptr_t* const arg) {
add_arg<void*>((void**)arg);
}
/// Add a kernel argument.
template <class dtype>
inline void add_arg(const dtype* const arg) {
const auto old_size = _hip_kernel_args.size();
const auto aligned_size = (old_size+alignof(dtype)-1) & ~(alignof(dtype)-1);
const auto arg_size = sizeof(dtype);
_hip_kernel_args.resize(aligned_size + arg_size);
*((dtype*)(&_hip_kernel_args[aligned_size])) = *arg;
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks;
_num_blocks[1]=1;
_num_blocks[2]=1;
_block_size[0]=block_size;
_block_size[1]=1;
_block_size[2]=1;
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks, const size_t block_size,
command_queue &cq)
{ _cq=cq; set_size(num_blocks,block_size); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=1;
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=block_size_z;
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
/// Run the kernel in the default command queue
inline void run() {
size_t args_size = _hip_kernel_args.size();
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)_hip_kernel_args.data(),
HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
HIP_LAUNCH_PARAM_END
};
const auto res = hipModuleLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
_num_blocks[2],_block_size[0],_block_size[1],
_block_size[2],0,_cq, NULL, config);
CU_SAFE_CALL(res);
//#endif
}
/// Clear any arguments associated with the kernel
inline void clear_args() {
_num_args=0;
_hip_kernel_args.clear();
}
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
inline void cq(command_queue &cq_in) { _cq=cq_in; }
#include "ucl_arg_kludge.h"
private:
hipFunction_t _kernel;
hipStream_t _cq;
unsigned _dimensions;
unsigned _num_blocks[3];
unsigned _num_args;
friend class UCL_Texture;
unsigned _block_size[3];
std::vector<char> _hip_kernel_args;
};
} // namespace
#endif
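
UCL_Program loads a precompiled module (the *_cubin.h byte arrays generated by Makefile.hip) and UCL_Kernel packs arguments into a byte buffer handed to hipModuleLaunchKernel via HIP_LAUNCH_PARAM_BUFFER_POINTER. A minimal sketch of that flow follows; my_kernel_cubin and "k_example" are hypothetical placeholders for a generated array and a kernel name compiled into it.

#include "hip_kernel.h"
#include <iostream>
#include <string>

using namespace ucl_hip;

// Placeholder for a *_cubin.h array produced by the xxd step in Makefile.hip.
extern const unsigned char my_kernel_cubin[];

int run_example() {
  UCL_Device dev;
  if (dev.num_devices() == 0) return UCL_ERROR;
  dev.set(0);

  UCL_Program prog(dev);
  std::string log;
  if (prog.load_string(my_kernel_cubin, "", &log) != UCL_SUCCESS) {
    std::cerr << log << std::endl;         // compile/load log from the module loader
    return UCL_COMPILE_ERROR;
  }

  UCL_Kernel k(prog, "k_example");         // look up the kernel by name
  int n = 1024;
  k.set_size(n / 256, 256);                // 4 blocks of 256 threads
  k.add_arg(&n);                           // arguments are copied by value into the buffer
  k.run();                                 // launched on the default stream
  dev.sync();
  return UCL_SUCCESS;
}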


@ -0,0 +1,83 @@
#ifndef HIP_MACROS_H
#define HIP_MACROS_H
#include <cstdio>
#include <cassert>
#include <hip/hip_runtime.h>
//#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
//#else
//#define CUDA_INT_TYPE unsigned
//#endif
#ifdef MPI_GERYON
#include "mpi.h"
#define NVD_GERYON_EXIT do { \
int is_final; \
MPI_Finalized(&is_final); \
if (!is_final) \
MPI_Abort(MPI_COMM_WORLD,-1); \
} while(0)
#else
#define NVD_GERYON_EXIT assert(0==1)
#endif
#ifndef UCL_GERYON_EXIT
#define UCL_GERYON_EXIT NVD_GERYON_EXIT
#endif
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#endif
#ifndef UCL_NO_API_CHECK
#define CU_SAFE_CALL_NS( call ) do { \
hipError_t err = call; \
if( hipSuccess != err) { \
fprintf(stderr, "HIP runtime error %d in call at file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
NVD_GERYON_EXIT; \
} } while (0)
#ifdef UCL_SYNC_DEBUG
#define CU_SAFE_CALL( call ) do { \
CU_SAFE_CALL_NS( call ); \
hipError_t err=hipCtxSynchronize(); \
if( hipSuccess != err) { \
fprintf(stderr, "HIP runtime error %d in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
NVD_GERYON_EXIT; \
} } while (0)
#else
#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )
#endif
#else // not DEBUG
// void macros for performance reasons
#define CU_SAFE_CALL_NS( call ) call
#define CU_SAFE_CALL( call) call
#endif
#ifdef UCL_DESTRUCT_CHECK
#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)
#else
#define CU_DESTRUCT_CALL( call) call
#define CU_DESTRUCT_CALL_NS( call) call
#endif
#endif
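
CU_SAFE_CALL_NS and CU_SAFE_CALL wrap any call returning hipError_t; on failure they print the error code, file and line, then abort through NVD_GERYON_EXIT (MPI_Abort when MPI_GERYON is defined, assert otherwise), and they reduce to the bare call when UCL_NO_API_CHECK is set. A small sketch of how the headers below use them:

#include "hip_macros.h"
#include <hip/hip_runtime.h>

int main() {
  void *buf = nullptr;
  // Checked allocation: prints the HIP error, file and line, then aborts if
  // the call fails (plain unchecked call when UCL_NO_API_CHECK is defined).
  CU_SAFE_CALL_NS(hipMalloc(&buf, 1024));
  CU_SAFE_CALL_NS(hipMemset(buf, 0, 1024));
  CU_DESTRUCT_CALL_NS(hipFree(buf));   // only checked when UCL_DESTRUCT_CHECK is defined
  return 0;
}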

43
lib/gpu/geryon/hip_mat.h Normal file

@ -0,0 +1,43 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef HIP_MAT_H
#define HIP_MAT_H
#include <hip/hip_runtime.h>
#include "hip_memory.h"
/// Namespace for HIP routines
namespace ucl_hip {
#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#include "ucl_s_obj_help.h"
#include "ucl_vector.h"
#include "ucl_matrix.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW
#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW
#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_hip
#endif

279
lib/gpu/geryon/hip_memory.h Normal file

@ -0,0 +1,279 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_MEMORY_H
#define HIP_MEMORY_H
#include <hip/hip_runtime.h>
#include <iostream>
#include <cassert>
#include <cstring>
#include "hip_macros.h"
#include "hip_device.h"
#include "ucl_types.h"
namespace ucl_hip {
// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
//typedef dim3 ucl_kernel_dim;
#ifdef __HIP_PLATFORM_NVCC__
typedef enum hipArray_Format {
HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
HIP_AD_FORMAT_HALF = 0x10,
HIP_AD_FORMAT_FLOAT = 0x20
}hipArray_Format;
#endif
// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
typedef hipDeviceptr_t device_ptr;
// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
hipError_t err=hipSuccess;
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
else
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
hipError_t err=hipSuccess;
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
else
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat) {
if (mat.kind()==UCL_VIEW)
return;
else if (mat.kind()!=UCL_NOT_PINNED)
CU_DESTRUCT_CALL(hipHostFree(mat.begin()));
else
free(mat.begin());
}
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
_host_free(mat);
hipError_t err=hipSuccess;
if (mat.kind()==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (mat.kind()==UCL_WRITE_ONLY)
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocWriteCombined);
else
err=hipHostMalloc((void **)mat.host_ptr(),n,hipHostMallocDefault);
if (err!=hipSuccess || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
mat.cq()=d.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline void _device_free(mat_type &mat) {
if (mat.kind()!=UCL_VIEW){
CU_DESTRUCT_CALL(hipFree((void*)mat.cbegin()));
}
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
_device_free(mat);
hipError_t err=hipMalloc((void**)&mat.cbegin(),n);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
_device_free(mat);
hipError_t err;
size_t upitch;
err=hipMallocPitch((void**)&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows);
pitch=static_cast<size_t>(upitch);
if (err!=hipSuccess)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in) {
*ptr=in;
}
template <class numtyp>
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in) {
*ptr=0;
}
inline void _device_view(hipDeviceptr_t *ptr, hipDeviceptr_t &in,
const size_t offset, const size_t numsize) {
*ptr=(hipDeviceptr_t)(((char*)in)+offset*numsize);
}
template <class numtyp>
inline void _device_view(hipDeviceptr_t *ptr, numtyp *in,
const size_t offset, const size_t numsize) {
*ptr=0;
}
// --------------------------------------------------------------------------
// - DEVICE IMAGE ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type>
inline void _device_image_free(mat_type &mat) {
assert(0==1);
}
// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
inline void _host_zero(void *ptr, const size_t n) {
memset(ptr,0,n);
}
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
CU_SAFE_CALL(hipMemsetAsync((void*)mat.cbegin(),0,n,cq));
}
// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------
template<class mat1, class mat2>
hipMemcpyKind _memcpy_kind(mat1 &dst, const mat2 &src){
assert(mat1::MEM_TYPE < 2 && mat2::MEM_TYPE < 2);
return (hipMemcpyKind)((1 - mat2::MEM_TYPE)*2 + (1 - mat1::MEM_TYPE));
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
CU_SAFE_CALL(hipMemcpy((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src)));
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n, hipStream_t &cq) {
CU_SAFE_CALL(hipMemcpyAsync((void*)dst.begin(), (void*)src.begin(), n, _memcpy_kind(dst, src), cq));
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CU_SAFE_CALL(hipMemcpy2D((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src)));
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows,hipStream_t &cq) {
CU_SAFE_CALL(hipMemcpy2DAsync((void*)dst.begin(), dpitch, (void*)src.begin(), spitch, cols, rows, _memcpy_kind(dst, src), cq));
}
} // namespace ucl_hip
#endif
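
_memcpy_kind() derives the hipMemcpyKind for a copy purely from the MEM_TYPE constants of the destination (mat1) and source (mat2) containers. A worked example of that arithmetic, assuming, as the formula implies, that host containers report MEM_TYPE 1 and device containers MEM_TYPE 0:

#include <hip/hip_runtime.h>
#include <cstdio>

// Same expression as _memcpy_kind(), with dst/src MEM_TYPE passed explicitly.
static hipMemcpyKind kind_for(int dst_mem_type, int src_mem_type) {
  return (hipMemcpyKind)((1 - src_mem_type)*2 + (1 - dst_mem_type));
}

int main() {
  // host(1)->host(1)     = hipMemcpyHostToHost     (0)
  // host(1)->device(0)   = hipMemcpyHostToDevice   (1)
  // device(0)->host(1)   = hipMemcpyDeviceToHost   (2)
  // device(0)->device(0) = hipMemcpyDeviceToDevice (3)
  std::printf("%d %d %d %d\n", kind_for(1, 1), kind_for(0, 1),
              kind_for(1, 0), kind_for(0, 0));
  return 0;
}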


@ -0,0 +1,113 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_TEXTURE
#define HIP_TEXTURE
#include <hip/hip_runtime.h>
#include "hip_kernel.h"
#include "hip_mat.h"
namespace ucl_hip {
#ifdef __HIP_PLATFORM_NVCC__
inline hipError_t hipModuleGetTexRef(CUtexref* texRef, hipModule_t hmod, const char* name){
return hipCUResultTohipError(cuModuleGetTexRef(texRef, hmod, name));
}
inline hipError_t hipTexRefSetFormat(CUtexref tex, hipArray_Format fmt, int NumPackedComponents) {
return hipCUResultTohipError(cuTexRefSetFormat(tex, (CUarray_format)fmt, NumPackedComponents ));
}
inline hipError_t hipTexRefSetAddress(size_t* offset, CUtexref tex, hipDeviceptr_t devPtr, size_t size) {
return hipCUResultTohipError(cuTexRefSetAddress(offset, tex, devPtr, size));
}
#endif
/// Class storing a texture reference
class UCL_Texture {
public:
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(UCL_Program &prog, const char *texture_name)
{ get_texture(prog,texture_name); }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name)
{
#ifdef __HIP_PLATFORM_NVCC__
CU_SAFE_CALL(hipModuleGetTexRef(&_tex, prog._module, texture_name));
#else
size_t _global_var_size;
CU_SAFE_CALL(hipModuleGetGlobal(&_device_ptr_to_global_var, &_global_var_size, prog._module, texture_name));
#endif
}
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp>
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp>
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) {
//#if CUDA_VERSION < 4000
//CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
//#endif
}
private:
#ifdef __HIP_PLATFORM_NVCC__
CUtexref _tex;
#else
void* _device_ptr_to_global_var;
#endif
friend class UCL_Kernel;
template<class mat_typ>
inline void _bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
#ifdef __HIP_PLATFORM_NVCC__
if (vec.element_size()==sizeof(float))
CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_FLOAT, numel));
else {
if (numel>2)
CU_SAFE_CALL(hipTexRefSetFormat(_tex, HIP_AD_FORMAT_SIGNED_INT32, numel));
else
CU_SAFE_CALL(hipTexRefSetFormat(_tex,HIP_AD_FORMAT_SIGNED_INT32,numel*2));
}
CU_SAFE_CALL(hipTexRefSetAddress(NULL, _tex, vec.cbegin(), vec.numel()*vec.element_size()));
#else
void* data_ptr = (void*)vec.cbegin();
CU_SAFE_CALL(hipMemcpyHtoD(hipDeviceptr_t(_device_ptr_to_global_var), &data_ptr, sizeof(void*)));
#endif
}
};
} // namespace
#endif

107
lib/gpu/geryon/hip_timer.h Normal file

@ -0,0 +1,107 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef HIP_TIMER_H
#define HIP_TIMER_H
#include <hip/hip_runtime.h>
#include "hip_macros.h"
#include "hip_device.h"
namespace ucl_hip {
/// Class for timing CUDA Driver events
class UCL_Timer {
public:
inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
{ init(dev); }
inline ~UCL_Timer() { clear(); }
/// Clear any data associated with timer
/** \note init() must be called to reuse timer after a clear() **/
inline void clear() {
if (_initialized) {
CU_DESTRUCT_CALL(hipEventDestroy(start_event));
CU_DESTRUCT_CALL(hipEventDestroy(stop_event));
_initialized=false;
_total_time=0.0;
}
}
/// Initialize default command queue for timing
inline void init(UCL_Device &dev) { init(dev, dev.cq()); }
/// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) {
clear();
_cq=cq;
_initialized=true;
CU_SAFE_CALL( hipEventCreateWithFlags(&start_event,0) );
CU_SAFE_CALL( hipEventCreateWithFlags(&stop_event,0) );
}
/// Start timing on command queue
inline void start() { CU_SAFE_CALL(hipEventRecord(start_event,_cq)); }
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(hipEventRecord(stop_event,_cq)); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CU_SAFE_CALL(hipEventSynchronize(start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CU_SAFE_CALL(hipEventSynchronize(stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero() {
CU_SAFE_CALL(hipEventRecord(start_event,_cq));
CU_SAFE_CALL(hipEventRecord(stop_event,_cq));
}
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
/// Add a user specified time to the total (ms)
inline void add_time_to_total(const double t) { _total_time+=t; }
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
float timer;
CU_SAFE_CALL(hipEventSynchronize(stop_event));
CU_SAFE_CALL( hipEventElapsedTime(&timer,start_event,stop_event) );
return timer;
}
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
/// Return the total time in ms
inline double total_time() { return _total_time; }
/// Return the total time in seconds
inline double total_seconds() { return _total_time/1000.0; }
private:
hipEvent_t start_event, stop_event;
hipStream_t _cq;
double _total_time;
bool _initialized;
};
} // namespace
#endif
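The timer follows the same event-pair pattern as the other Geryon back ends; a minimal usage sketch (the surrounding device setup and kernel launches are placeholders, not taken from this diff):

  UCL_Device dev;               // default HIP device
  UCL_Timer k_timer(dev);       // events recorded on dev.cq()
  k_timer.start();
  // ... enqueue kernels on the same command queue ...
  k_timer.stop();
  double ms = k_timer.time();   // synchronizes on stop_event, returns milliseconds
  k_timer.add_time_to_total(ms);
  double total_s = k_timer.total_seconds();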

View File

@ -244,7 +244,7 @@ class UCL_Kernel {
template <class dtype>
inline void add_arg(const dtype* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=const_cast<dtype * const>(arg);
_kernel_args[_num_args]=const_cast<dtype *>(arg);
#else
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));

View File

@ -36,6 +36,11 @@ using namespace ucl_cudadr;
using namespace ucl_cudart;
#endif
#ifdef UCL_HIP
#include "hip_device.h"
using namespace ucl_hip;
#endif
int main(int argc, char** argv) {
UCL_Device cop;
std::cout << "Found " << cop.num_platforms() << " platform(s).\n";

View File

@ -179,13 +179,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
if (_eflag) {
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
if (_ef_atom)
if (_ilist==NULL)
if (_ef_atom) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
else
} else {
for (int i=0; i<_inum; i++)
eatom[_ilist[i]]+=engv[i];
}
}
vstart=_inum;
}
if (_vflag) {
@ -193,7 +195,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_vf_atom){
if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++)
@ -203,6 +205,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
}
vstart+=_inum;
iend+=_inum;
}
@ -228,7 +231,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
evdwl+=engv[i];
for (int i=_inum; i<iend; i++)
ecoul+=engv[i];
if (_ef_atom)
if (_ef_atom) {
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
@ -240,6 +243,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=_inum, ii=0; i<iend; i++)
eatom[_ilist[ii++]]+=engv[i];
}
}
vstart=iend;
iend+=_inum;
}
@ -247,7 +251,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_vf_atom) {
if (_ilist==NULL) {
for (int i=vstart, ii=0; i<iend; i++)
vatom[ii++][j]+=engv[i];
@ -255,6 +259,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart, ii=0; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
}
vstart+=_inum;
iend+=_inum;
}

View File

@ -27,6 +27,10 @@ using namespace ucl_opencl;
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"

View File

@ -15,6 +15,11 @@
#include "lal_atom.h"
#ifdef USE_HIP_DEVICE_SORT
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#endif
namespace LAMMPS_AL {
#define AtomT Atom<numtyp,acctyp>
@ -70,6 +75,26 @@ bool AtomT::alloc(const int nall) {
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
size_t temp_storage_bytes = 0;
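      // first call with a null workspace pointer only queries temp_storage_bytes
      // (standard CUB/hipCUB two-phase idiom); the actual sort runs later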
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
return false;
if(sort_out_size < _max_atoms){
if (sort_out_keys ) hipFree(sort_out_keys);
if (sort_out_values) hipFree(sort_out_values);
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
sort_out_size = _max_atoms;
}
if(temp_storage_bytes > sort_temp_storage_size){
if(sort_temp_storage) hipFree(sort_temp_storage);
hipMalloc(&sort_temp_storage, temp_storage_bytes);
sort_temp_storage_size = temp_storage_bytes;
}
}
#endif
// --------------------------- Device allocations
int gpu_bytes=0;
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
@ -184,6 +209,27 @@ bool AtomT::add_fields(const bool charge, const bool rot,
return false;
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
size_t temp_storage_bytes = 0;
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, sort_out_keys, sort_out_keys, sort_out_values, sort_out_values, _max_atoms))
return false;
if(sort_out_size < _max_atoms){
if (sort_out_keys ) hipFree(sort_out_keys);
if (sort_out_values) hipFree(sort_out_values);
hipMalloc(&sort_out_keys , _max_atoms * sizeof(unsigned));
hipMalloc(&sort_out_values, _max_atoms * sizeof(int ));
sort_out_size = _max_atoms;
}
if(temp_storage_bytes > sort_temp_storage_size){
if(sort_temp_storage) hipFree(sort_temp_storage);
hipMalloc(&sort_temp_storage, temp_storage_bytes);
sort_temp_storage_size = temp_storage_bytes;
}
}
#endif
success=success && (dev_particle_id.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
@ -275,6 +321,19 @@ void AtomT::clear_resize() {
if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
#endif
#ifdef USE_HIP_DEVICE_SORT
if (_gpu_nbor==1) {
if(sort_out_keys) hipFree(sort_out_keys);
if(sort_out_values) hipFree(sort_out_values);
if(sort_temp_storage) hipFree(sort_temp_storage);
sort_out_keys = nullptr;
sort_out_values = nullptr;
sort_temp_storage = nullptr;
sort_temp_storage_size = 0;
sort_out_size = 0;
}
#endif
if (_gpu_nbor==2) {
host_particle_id.clear();
host_cell_id.clear();
@ -326,6 +385,22 @@ void AtomT::sort_neighbor(const int num_atoms) {
UCL_GERYON_EXIT;
}
#endif
#ifdef USE_HIP_DEVICE_SORT
if(sort_out_size < num_atoms){
printf("AtomT::sort_neighbor: invalid temp buffer size\n");
UCL_GERYON_EXIT;
}
if(hipSuccess != hipcub::DeviceRadixSort::SortPairs(sort_temp_storage, sort_temp_storage_size, (unsigned *)dev_cell_id.begin(), sort_out_keys, (int *)dev_particle_id.begin(), sort_out_values, num_atoms)){
printf("AtomT::sort_neighbor: DeviceRadixSort error\n");
UCL_GERYON_EXIT;
}
if(hipSuccess != hipMemcpy((unsigned *)dev_cell_id.begin(), sort_out_keys , num_atoms*sizeof(unsigned), hipMemcpyDeviceToDevice) ||
hipSuccess != hipMemcpy((int *) dev_particle_id.begin(), sort_out_values, num_atoms*sizeof(int ), hipMemcpyDeviceToDevice)){
printf("AtomT::sort_neighbor: copy output error\n");
UCL_GERYON_EXIT;
}
#endif
}
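The allocation and sort paths above use hipCUB's two-phase DeviceRadixSort; a self-contained sketch of the same pattern (the buffer names and wrapper function are illustrative, not part of the library):

  #include <hip/hip_runtime.h>
  #include <hipcub/hipcub.hpp>

  // keys_in/vals_in: device buffers of length n (e.g. cell ids / particle ids)
  void sort_pairs(unsigned *keys_in, unsigned *keys_out,
                  int *vals_in, int *vals_out, int n) {
    void *temp = nullptr;
    size_t temp_bytes = 0;
    // pass 1: a null workspace pointer only reports the required size
    hipcub::DeviceRadixSort::SortPairs(temp, temp_bytes, keys_in, keys_out,
                                       vals_in, vals_out, n);
    hipMalloc(&temp, temp_bytes);
    // pass 2: perform the key/value radix sort with the allocated workspace
    hipcub::DeviceRadixSort::SortPairs(temp, temp_bytes, keys_in, keys_out,
                                       vals_in, vals_out, n);
    hipFree(temp);
  }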
#ifdef GPU_CAST

View File

@ -11,9 +11,9 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -29,6 +29,11 @@ using namespace ucl_opencl;
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
#include "geryon/hip_kernel.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
@ -477,6 +482,14 @@ class Atom {
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
#ifdef USE_HIP_DEVICE_SORT
unsigned* sort_out_keys = nullptr;
int* sort_out_values = nullptr;
void* sort_temp_storage = nullptr;
size_t sort_temp_storage_size = 0;
size_t sort_out_size = 0;
#endif
};
}

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -43,7 +45,7 @@ class BaseAtomic {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -25,6 +25,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -44,7 +46,7 @@ class BaseCharge {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -23,6 +23,8 @@
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -42,7 +44,7 @@ class BaseDipole {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -23,6 +23,8 @@
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -42,7 +44,7 @@ class BaseDPD {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -44,7 +46,7 @@ class BaseEllipsoid {
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -24,6 +24,8 @@
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -46,7 +48,7 @@ class BaseThree {
* \param k_three name for the kernel for 3-body force calculation
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Born : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,18 @@
//
// begin : June 2018
// email : ndactrung@gmail.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else
@ -108,7 +109,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
@ -249,7 +250,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;

View File

@ -32,7 +32,7 @@ class BornCoulLongCS : public BornCoulLong<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : ndactrung@gmail.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BornCoulWolfCS : public BornCoulWolf<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : ndtrung@umich.edu
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : ndtrung@umich.edu
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : 8/15/2012
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CoulDSF : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : July 2011
// email : a.kohlmeyer@temple.edu
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : June 2018
// email : ndactrung@gmail.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class CoulLongCS : public CoulLong<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -268,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
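  // without CUDPP or the hipCUB device sort, full GPU neighbor builds are not
  // available, so fall back to the hybrid neighbor mode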
if (gpu_nbor==1)
gpu_nbor=2;
#endif
@ -341,7 +341,7 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
#if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
if (gpu_nbor==1)
gpu_nbor=2;
#endif
@ -712,7 +712,7 @@ int DeviceT::compile_kernels() {
gpu_lib_data.update_host(false);
_ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
#if !(defined(USE_OPENCL) || defined(USE_HIP))
if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch()))
return -4;
#endif

View File

@ -13,7 +13,7 @@
// email : brownw@ornl.gov
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

View File

@ -41,7 +41,7 @@ class Device {
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successfull
* - 0 if successful
* - -2 if GPU not found
* - -4 if GPU library not compiled for GPU
* - -6 if GPU could not be initialized for use
@ -62,7 +62,7 @@ class Device {
* \param vel True if velocities need to be stored
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
@ -76,7 +76,7 @@ class Device {
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
@ -100,7 +100,7 @@ class Device {
* \param threads_per_atom value to be used by the neighbor list only
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,18 +11,18 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -32,7 +32,7 @@ class DipoleLJ : public BaseDipole<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,19 +11,19 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -32,7 +32,7 @@ class DipoleLJSF : public BaseDipole<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,18 +11,18 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( mu_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture_2d( mu_tex,int4);
#endif
#else

View File

@ -32,7 +32,7 @@ class DipoleLongLJ : public BaseDipole<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,16 +11,16 @@
//
// begin : Jan 15, 2014
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> vel_tex;
_texture( pos_tex,float4);
_texture( vel_tex,float4);
#else
texture<int4,1> pos_tex;
texture<int4,1> vel_tex;
_texture_2d( pos_tex,int4);
_texture_2d( vel_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class DPD : public BaseDPD<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,29 +11,29 @@
//
// begin :
// email : brownw@ornl.gov nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> fp_tex;
texture<float4> rhor_sp1_tex;
texture<float4> rhor_sp2_tex;
texture<float4> frho_sp1_tex;
texture<float4> frho_sp2_tex;
texture<float4> z2r_sp1_tex;
texture<float4> z2r_sp2_tex;
_texture( pos_tex,float4);
_texture( fp_tex,float);
_texture( rhor_sp1_tex,float4);
_texture( rhor_sp2_tex,float4);
_texture( frho_sp1_tex,float4);
_texture( frho_sp2_tex,float4);
_texture( z2r_sp1_tex,float4);
_texture( z2r_sp2_tex,float4);
#else
texture<int4> pos_tex;
texture<int2> fp_tex;
texture<int4> rhor_sp1_tex;
texture<int4> rhor_sp2_tex;
texture<int4> frho_sp1_tex;
texture<int4> frho_sp2_tex;
texture<int4> z2r_sp1_tex;
texture<int4> z2r_sp2_tex;
_texture( pos_tex,int4);
_texture( fp_tex,int2);
_texture( rhor_sp1_tex,int4);
_texture( rhor_sp2_tex,int4);
_texture( frho_sp1_tex,int4);
_texture( frho_sp2_tex,int4);
_texture( z2r_sp1_tex,int4);
_texture( z2r_sp2_tex,int4);
#endif
#else

View File

@ -33,7 +33,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -18,12 +18,14 @@
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex, quat_tex;
_texture( pos_tex, float4);
_texture( quat_tex,float4);
#else
texture<int4,1> pos_tex, quat_tex;
_texture_2d( pos_tex,int4);
_texture_2d( quat_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -9,16 +9,15 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_
@ -53,8 +52,8 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
int itype=fast_mul(iw,ntypes);
int newj=0;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=dev_ij[nbor];
j &= NEIGHMASK;
int sj=dev_ij[nbor];
int j = sj & NEIGHMASK;
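    // sj keeps the special-bond bits for storage in dev_nbor below; the masked j
    // is used only for the position fetch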
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
@ -69,7 +68,7 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
rsq+=t*t;
if (rsq<cf.x) {
dev_nbor[packed]=j;
dev_nbor[packed]=sj;
packed+=nbor_pitch;
newj++;
}
@ -117,8 +116,8 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
int newj=0;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=dev_ij[nbor];
j &= NEIGHMASK;
int sj=dev_ij[nbor];
int j = sj & NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
@ -133,7 +132,7 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
rsq+=t*t;
if (rsq<cutsq[mtype]) {
dev_nbor[packed]=j;
dev_nbor[packed]=sj;
packed+=nbor_pitch;
newj++;
}

View File

@ -11,14 +11,14 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class Gauss : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,9 +11,9 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -34,7 +34,7 @@ class GayBerne : public BaseEllipsoid<numtyp, acctyp> {
* \return false if there is not sufficient memory or device init prob
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,9 +11,9 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_ellipsoid_extra.h"
#endif

View File

@ -11,14 +11,14 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class LJ : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class LJ96 : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : Mon May 16 2011
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJClass2Long : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJCoul : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJCoulDebye : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJCoulLong : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,21 +11,21 @@
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float> gcons_tex;
texture<float> dgcons_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
_texture( gcons_tex,float);
_texture( dgcons_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int2> gcons_tex;
texture<int2> dgcons_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
_texture( gcons_tex,int2);
_texture( dgcons_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJCoulMSM : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,14 +11,14 @@
//
// begin :
// email : ndactrung@gmail.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else
#define pos_tex x_

View File

@ -32,7 +32,7 @@ class LJCubic : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,17 +11,17 @@
//
// begin : 7/12/2012
// email : brownw@ornl.gov
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
_texture( pos_tex,float4);
_texture( q_tex,float);
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
_texture_2d( pos_tex,int4);
_texture( q_tex,int2);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJDSF : public BaseCharge<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

View File

@ -11,15 +11,15 @@
//
// begin :
// email : ibains@nvidia.com
// ***************************************************************************/
// ***************************************************************************
#ifdef NV_KERNEL
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
_texture( pos_tex,float4);
#else
texture<int4,1> pos_tex;
_texture_2d( pos_tex,int4);
#endif
#else

View File

@ -32,7 +32,7 @@ class LJExpand : public BaseAtomic<numtyp, acctyp> {
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU

Some files were not shown because too many files have changed in this diff.