Merge branch 'develop' into yotamfe/develop

This commit is contained in:
Axel Kohlmeyer
2023-01-26 11:47:32 -05:00
78 changed files with 12730 additions and 252 deletions

View File

@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir)
endfunction()
macro(pkg_depends PKG1 PKG2)
if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2}))
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package")
if(DEFINED BUILD_${PKG2})
if(PKG_${PKG1} AND NOT BUILD_${PKG2})
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON")
endif()
elseif(DEFINED PKG_${PKG2})
if(PKG_${PKG1} AND NOT PKG_${PKG2})
message(WARNING "The ${PKG1} package depends on the ${PKG2} package. Enabling it.")
set(PKG_${PKG2} ON CACHE BOOL "" FORCE)
endif()
endif()
endmacro()

View File

@ -1,4 +1,9 @@
find_package(ZLIB REQUIRED)
find_package(ZLIB)
if(NOT ZLIB_FOUND)
message(WARNING "No Zlib development support found. Disabling COMPRESS package...")
set(PKG_COMPRESS OFF CACHE BOOL "" FORCE)
return()
endif()
target_link_libraries(lammps PRIVATE ZLIB::ZLIB)
find_package(PkgConfig QUIET)

View File

@ -26,6 +26,19 @@ elseif(GPU_PREC STREQUAL "SINGLE")
set(GPU_PREC_SETTING "SINGLE_SINGLE")
endif()
option(GPU_DEBUG "Enable debugging code of the GPU package" OFF)
mark_as_advanced(GPU_DEBUG)
if(PKG_AMOEBA AND FFT_SINGLE)
message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT")
endif()
if (PKG_AMOEBA)
list(APPEND GPU_SOURCES
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp)
endif()
file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
@ -151,7 +164,12 @@ if(GPU_API STREQUAL "CUDA")
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS})
if(GPU_DEBUG)
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
else()
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
endif()
if(CUDPP_OPT)
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
@ -192,6 +210,7 @@ elseif(GPU_API STREQUAL "OPENCL")
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu
)
foreach(GPU_KERNEL ${GPU_LIB_CU})
@ -208,6 +227,7 @@ elseif(GPU_API STREQUAL "OPENCL")
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu)
list(APPEND GPU_LIB_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
@ -217,14 +237,18 @@ elseif(GPU_API STREQUAL "OPENCL")
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h
)
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING})
if(GPU_DEBUG)
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
else()
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
endif()
target_link_libraries(lammps PRIVATE gpu)
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
@ -374,8 +398,12 @@ elseif(GPU_API STREQUAL "HIP")
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_HIP)
target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING})
if(GPU_DEBUG)
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
else()
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
endif()
target_link_libraries(gpu PRIVATE hip::host)
if(HIP_USE_DEVICE_SORT)

View File

@ -126,10 +126,11 @@ CMake build
-D GPU_API=value # value = opencl (default) or cuda or hip
-D GPU_PREC=value # precision setting
# value = double or mixed (default) or single
-D HIP_PATH # path to HIP installation. Must be set if GPU_API=HIP
-D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda
# value = sm_XX, see below
# default is sm_50
# value = sm_XX (see below, default is sm_50)
-D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers
# value = yes or no (default)
-D HIP_PATH=value # value = path to HIP installation. Must be set if GPU_API=HIP
-D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip
# value depends on selected HIP_PLATFORM
# default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc

View File

@ -39,7 +39,7 @@ OPT.
* :doc:`agni (o) <pair_agni>`
* :doc:`airebo (io) <pair_airebo>`
* :doc:`airebo/morse (io) <pair_airebo>`
* :doc:`amoeba <pair_amoeba>`
* :doc:`amoeba (g) <pair_amoeba>`
* :doc:`atm <pair_atm>`
* :doc:`awpmd/cut <pair_awpmd>`
* :doc:`beck (go) <pair_beck>`
@ -126,7 +126,7 @@ OPT.
* :doc:`hbond/dreiding/lj (o) <pair_hbond_dreiding>`
* :doc:`hbond/dreiding/morse (o) <pair_hbond_dreiding>`
* :doc:`hdnnp <pair_hdnnp>`
* :doc:`hippo <pair_amoeba>`
* :doc:`hippo (g) <pair_amoeba>`
* :doc:`ilp/graphene/hbn (t) <pair_ilp_graphene_hbn>`
* :doc:`ilp/tmd (t) <pair_ilp_tmd>`
* :doc:`kolmogorov/crespi/full <pair_kolmogorov_crespi_full>`

View File

@ -39,6 +39,9 @@ Syntax
*masslimit* value = massmin massmax
massmin = minimum molecular weight of species to delete
massmax = maximum molecular weight of species to delete
*delete_rate_limit* value = Nlimit Nsteps
Nlimit = maximum number of deletions allowed to occur within interval
Nsteps = the interval (number of timesteps) over which to count deletions
Examples
""""""""
@ -142,7 +145,13 @@ When using the *masslimit* keyword, each line of the *filedel* file
contains the timestep on which deletions occurs, followed by how many
of each species are deleted (with quantities preceding chemical
formulae). The *specieslist* and *masslimit* keywords cannot both be
used in the same *reaxff/species* fix.
used in the same *reaxff/species* fix. The *delete_rate_limit*
keyword can enforce an upper limit on the overall rate of molecule
deletion. The number of deletion occurrences is limited to Nlimit
within an interval of Nsteps timesteps. When using the
*delete_rate_limit* keyword, no deletions are permitted to occur
within the first Nsteps timesteps of the first run (after reading
either a data or restart file).
----------

View File

@ -732,8 +732,8 @@ choices:
* Use one of the 4 NPT or NPH styles for the rigid bodies. Use the
*dilate* all option so that it will dilate the positions of the
*non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
*other thermostat) for the non-rigid particles.
non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
other thermostat) for the non-rigid particles.
* Use :doc:`fix npt <fix_nh>` for the group of non-rigid particles. Use
the *dilate* all option so that it will dilate the center-of-mass
positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT

View File

@ -1,11 +1,18 @@
.. index:: pair_style amoeba
.. index:: pair_style amoeba/gpu
.. index:: pair_style hippo
.. index:: pair_style hippo/gpu
pair_style amoeba command
=========================
Accelerator Variants: *amoeba/gpu*
pair_style hippo command
========================
Accelerator Variants: *hippo/gpu*
Syntax
""""""
@ -127,6 +134,10 @@ version discussed in :ref:`(Ponder) <amoeba-Ponder>`, :ref:`(Ren)
implementation of HIPPO in LAMMPS matches the version discussed in
:ref:`(Rackers) <amoeba-Rackers>`.
.. versionadded:: TBD
Accelerator support via the GPU package is available.
----------
Only a single pair_coeff command is used with either the *amoeba* and
@ -187,6 +198,19 @@ These pair styles can only be used via the *pair* keyword of the
----------
.. include:: accel_styles.rst
.. note::
Using the GPU accelerated pair styles 'amoeba/gpu' or 'hippo/gpu'
when compiling the GPU package for OpenCL has a few known issues
when running on integrated GPUs and the calculation may crash.
The GPU accelerated pair styles are also not (yet) compatible
with single precision FFTs.
----------
Restrictions
""""""""""""

View File

@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda
endif
gpu_SYSINC =
gpu_SYSLIB = -lcudart -lcuda
gpu_SYSLIB = -lcudart -lcuda -lcufft
gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs

View File

@ -1,9 +1,17 @@
# Common headers for kernels
PRE1_H = lal_preprocessor.h lal_aux_fun1.h
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \
lal_pre_cuda_hip.h
ALL_H = $(NVD_H) $(wildcard ./lal_*.h)
# Headers for Host files
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H)
# Source files
SRCS := $(wildcard ./lal_*.cpp)
OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
@ -54,13 +62,40 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
$(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu
$(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@
# host code compilation
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H)
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
#ifdef CUDPP_OPT
@ -77,10 +112,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini
#endif
# build libgpu.a

View File

@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h)
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h
# Headers for Host files
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H)
@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.
$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h
$(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h;
$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h
$(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h;
$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;

View File

@ -26,6 +26,9 @@
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif
#ifndef UCL_NO_API_CHECK

View File

@ -33,6 +33,9 @@
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif
#ifndef UCL_NO_API_CHECK

View File

@ -309,15 +309,14 @@ class UCL_Device {
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return 0; }
inline size_t max_pitch(const int) { return 0; }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported() { return sharing_supported(_device); }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported(const int i)
{ return true; }
inline bool sharing_supported(const int) { return true; }
/// True if the device is a sub-device
inline bool is_subdevice()

View File

@ -33,6 +33,9 @@
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif
#ifndef UCL_NO_API_CHECK

View File

@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
if (kind==UCL_READ_ONLY) {
@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
const size_t /*dst_offset*/, const size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 1S\n";
@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
const size_t dst_offset, const size_t /*src_offset*/) {
if (src.cbegin()==dst.cbegin()) {
if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE

View File

@ -35,19 +35,19 @@ class UCL_Texture {
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(UCL_Program &prog, const char *texture_name) { }
inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name) { }
inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) { }
inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) { }
inline void allow(UCL_Kernel & /*kernel*/) { }
private:
friend class UCL_Kernel;
@ -62,7 +62,7 @@ class UCL_Const {
inline UCL_Const(UCL_Program &prog, const char *global_name)
{ get_global(prog,global_name); }
/// Set the global reference for this object
inline void get_global(UCL_Program &prog, const char *global_name) {
inline void get_global(UCL_Program &prog, const char * /*global_name*/) {
if (_active) {
CL_DESTRUCT_CALL(clReleaseContext(_context));
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));

View File

@ -71,7 +71,7 @@ class UCL_Timer {
inline void init(UCL_Device &dev) { init(dev,dev.cq()); }
/// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) {
inline void init(UCL_Device & /*dev*/, command_queue &cq) {
clear();
_cq=cq;
clRetainCommandQueue(_cq);

View File

@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> {
// Should never be here
template <int host_t1, int host_t2> struct _host_host_copy {
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) {
assert(0==1);
}
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) {
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) {
assert(0==1);
}
};
@ -470,24 +469,22 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
// Neither on host or both on host
template <> struct _ucl_cast_copy<1,1> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
};
@ -495,24 +492,22 @@ template <> struct _ucl_cast_copy<1,1> {
// Neither on host or both on host
template <> struct _ucl_cast_copy<0,0> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
};

View File

@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
@ -230,8 +230,8 @@ class UCL_D_Vec : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
inline void view_offset(const size_t offset,ucl_type &input,
const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif

View File

@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat {
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat {
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols,
UCL_Device &dev) {
#ifdef UCL_DEBUG
assert(rows==1);
@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat {
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
inline void view_offset(const size_t offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows),
const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);

View File

@ -27,7 +27,7 @@ template <int st> struct _ucl_s_obj_help;
// -- Can potentially use same memory if shared by accelerator
template <> struct _ucl_s_obj_help<1> {
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
static inline int alloc(t1 &host, t2 &device, t3 & /*_buffer*/,
const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> {
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) {
ucl_copy(dst,src,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) {
ucl_copy(dst,src,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) {
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) {
ucl_copy(dst,src,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) {
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) {
ucl_copy(dst,src,cols,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) {
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) {
ucl_copy(dst,src,rows,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) {
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) {
ucl_copy(dst,src,rows,cols,cq);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
return UCL_SUCCESS;
@ -353,7 +349,7 @@ template <int st> struct _ucl_s_obj_help {
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
static inline int dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) {
int err=buff.resize(cols);
if (err!=UCL_SUCCESS)
return err;

322
lib/gpu/lal_amoeba.cpp Normal file
View File

@ -0,0 +1,322 @@
/***************************************************************************
amoeba.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the amoeba pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#if defined(USE_OPENCL)
#include "amoeba_cl.h"
#elif defined(USE_CUDART)
const char *amoeba=0;
#else
#include "amoeba_cubin.h"
#endif
#include "lal_amoeba.h"
#include <cassert>
namespace LAMMPS_AL {
#define AmoebaT Amoeba<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
// Constructor: initialize the base AMOEBA accelerator class; device-side
// coefficient tables are not allocated until init() is called.
template <class numtyp, class acctyp>
AmoebaT::Amoeba() : BaseAmoeba<numtyp,acctyp>(),
_allocated(false) {
}
// Destructor: release any device buffers allocated by init() via clear().
template <class numtyp, class acctyp>
AmoebaT::~Amoeba() {
clear();
}
// Return the per-atom memory estimate (bytes) for a neighbor list with up
// to max_nbors entries; delegates to the atomic-style base-class estimate.
template <class numtyp, class acctyp>
int AmoebaT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
// Initialize the AMOEBA accelerator: compile/load the kernels via the base
// class, then copy the per-type and per-class coefficient tables plus the
// special-neighbor scaling factors to read-only device memory.
//
// Parameters (host_* arrays are indexed by AMOEBA atom type or class):
//   ntypes, max_amtype, max_amclass - counts of LAMMPS types / AMOEBA
//       types / AMOEBA classes
//   host_pdamp, host_thole, host_dirdamp, host_amtype2class - per-type
//       polarization damping parameters and type->class map
//   host_special_* - 5-entry special-bond scaling factors (several are
//       unused here and deliberately unnamed in the signature)
//   host_csix, host_adisp - per-class dispersion coefficients
//   nlocal, nall, max_nbors, maxspecial, maxspecial15 - sizing inputs
//   cell_size, gpu_split, _screen - base-class device setup inputs
//   polar_dscale, polar_uscale - polar interaction scale factors
// Returns 0 on success, otherwise the nonzero error code from init_atomic().
template <class numtyp, class acctyp>
int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
const double *host_pdamp, const double *host_thole,
const double *host_dirdamp, const int *host_amtype2class,
const double *host_special_hal,
const double * /*host_special_repel*/,
const double * /*host_special_disp*/,
const double *host_special_mpole,
const double * /*host_special_polar_wscale*/,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_csix, const double *host_adisp,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, const double gpu_split, FILE *_screen,
const double polar_dscale, const double polar_uscale) {
int success;
// Compile/load all AMOEBA kernels and set up atom/neighbor storage.
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
cell_size,gpu_split,_screen,amoeba,
"k_amoeba_multipole", "k_amoeba_udirect2b",
"k_amoeba_umutual2b", "k_amoeba_polar",
"k_amoeba_fphi_uind", "k_amoeba_fphi_mpole",
"k_amoeba_short_nbor", "k_amoeba_special15");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
// Pack per-amtype parameters into one float4-style table:
// x=pdamp, y=thole, z=dirdamp, w=amtype2class.
for (int i = 0; i < max_amtype; i++) {
host_write[i].x = host_pdamp[i];
host_write[i].y = host_thole[i];
host_write[i].z = host_dirdamp[i];
host_write[i].w = host_amtype2class[i];
}
coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(coeff_amtype,host_write,false);
// Pack per-amclass dispersion coefficients: x=csix, y=adisp (z,w unused).
UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
for (int i = 0; i < max_amclass; i++) {
host_write2[i].x = host_csix[i];
host_write2[i].y = host_adisp[i];
host_write2[i].z = (numtyp)0;
host_write2[i].w = (numtyp)0;
}
coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(coeff_amclass,host_write2,false);
// Special-bond scale factors (5 entries): x=hal, y=polar_piscale,
// z=polar_pscale, w=mpole.
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<5; i++) {
dview[i].x=host_special_hal[i];
dview[i].y=host_special_polar_piscale[i];
dview[i].z=host_special_polar_pscale[i];
dview[i].w=host_special_mpole[i];
}
ucl_copy(sp_amoeba,dview,5,false);
_polar_dscale = polar_dscale;
_polar_uscale = polar_uscale;
_allocated=true;
// Track device memory consumed by this style's persistent buffers.
this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes()
+ sp_amoeba.row_bytes() + this->_tep.row_bytes()
+ this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
+ this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
+ this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
return 0;
}
// Free the device-side coefficient tables allocated by init() and the
// base-class atomic storage. Safe to call repeatedly (no-op when not
// allocated), which is why the destructor can call it unconditionally.
template <class numtyp, class acctyp>
void AmoebaT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff_amtype.clear();
coeff_amclass.clear();
sp_amoeba.clear();
this->clear_atomic();
}
// Return host-side memory usage in bytes: base-class atomic storage plus
// the size of this object itself.
template <class numtyp, class acctyp>
double AmoebaT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate the multipole real-space term, returning tep
// ---------------------------------------------------------------------------
// Builds the short neighbor list for the multipole cutoff and launches the
// k_amoeba_multipole kernel. Returns the launch grid size GX (0 if there
// are no local atoms, in which case nothing is launched).
template <class numtyp, class acctyp>
int AmoebaT::multipole_real(const int eflag, const int vflag) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// Build the short neighbor list for the cutoff off2_mpole,
// at this point mpole is the first kernel in a time step for AMOEBA
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_mpole, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// Real-space multipole kernel: accumulates forces/energies and writes
// the torque-equivalent-potential (tep) array.
this->k_multipole.set_size(GX,BX);
this->k_multipole.run(&this->atom->x, &this->atom->extra,
&coeff_amtype, &sp_amoeba,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_felec,
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Launch the real-space permanent field kernel
// ---------------------------------------------------------------------------
// Launches the real-space permanent (direct) field kernel; results land in
// _fieldp. Builds the polar-cutoff short neighbor list on first use in the
// time step and caches it via short_nbor_polar_avail. Energy/virial flags
// are unused here. Returns the grid size used (0 if no local atoms).
template <class numtyp, class acctyp>
int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff _off2_polar, if not done yet
  // this is the first kernel in a time step where _off2_polar is used
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(),
                           &this->dev_short_nbor, &this->_off2_polar, &ainum,
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_udirect2b.set_size(GX,BX);
  this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor,
                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
                        &this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
                        &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Launch the real-space induced field kernel, returning field and fieldp
// ---------------------------------------------------------------------------
// Launches the real-space induced (mutual) field kernel, writing field and
// fieldp contributions into _fieldp. Reuses the cached polar-cutoff short
// neighbor list when available. Returns the grid size used.
template <class numtyp, class acctyp>
int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list if not done yet
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(), &this->dev_short_nbor,
                           &this->_off2_polar, &ainum, &nbor_pitch,
                           &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_umutual2b.set_size(GX,BX);
  this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
                        &nbor_pitch, &this->_threads_per_atom, &this->_aewald,
                        &this->_off2_polar, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Launch the polar real-space kernel, returning tep
// ---------------------------------------------------------------------------
// Launches the polar real-space kernel; accumulates force/energy/virial and
// writes the per-atom field into _tep. As the last real-space kernel of the
// time step, it invalidates the cached polar short neighbor list on exit.
template <class numtyp, class acctyp>
int AmoebaT::polar_real(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));

  // Disabled experiment: shrink BX to raise occupancy when GX is smaller
  // than the number of compute units. Left for reference.
  /*
  const int cus = this->device->gpu->cus();
  while (GX < cus && GX > 1) {
    BX /= 2;
    GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
  }
  */

  this->time_pair.start();

  // Build the short neighbor list if not done yet
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(),
                           &this->dev_short_nbor, &this->_off2_polar, &ainum,
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_polar.set_size(GX,BX);
  this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                    &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv, &this->_tep,
                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                    &this->_threads_per_atom, &this->_aewald, &this->_felec,
                    &this->_off2_polar, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  // Signal that short nbor list is not avail for the next time step
  // do it here because polar_real() is the last kernel in a time step at this point
  this->short_nbor_polar_avail = false;

  return GX;
}
template class Amoeba<PRECISION,ACC_PRECISION>;
}

2099
lib/gpu/lal_amoeba.cu Normal file

File diff suppressed because it is too large Load Diff

100
lib/gpu/lal_amoeba.h Normal file
View File

@ -0,0 +1,100 @@
/***************************************************************************
amoeba.h
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the amoeba pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_AMOEBA_H
#define LAL_AMOEBA_H

#include "lal_base_amoeba.h"

namespace LAMMPS_AL {

// GPU-accelerated AMOEBA pair style. The real-space kernel drivers are
// inherited from BaseAmoeba; this class owns the per-type coefficient
// tables that are copied to the device during init().
template <class numtyp, class acctyp>
class Amoeba : public BaseAmoeba<numtyp, acctyp> {
 public:
  Amoeba();
  ~Amoeba();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  // NOTE(review): the special-array order declared here (mpole, hal, repel,
  // disp) appears to differ from the call in lal_amoeba_ext.cpp, which passes
  // (hal, repel, disp, mpole); all are const double* so this compiles either
  // way — verify against the definition of Amoeba::init.
  int init(const int ntypes, const int max_amtype, const int max_amclass,
           const double *host_pdamp, const double *host_thole,
           const double *host_dirdamp, const int *host_amtype2class,
           const double *host_special_mpole,
           const double *host_special_hal,
           const double *host_special_repel,
           const double *host_special_disp,
           const double *host_special_polar_wscale,
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
           const double *host_csix, const double *host_adisp,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const int maxspecial15, const double cell_size,
           const double gpu_split, FILE *_screen,
           const double polar_dscale, const double polar_uscale);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
  /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
  UCL_D_Vec<numtyp4> coeff_amtype;
  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
  UCL_D_Vec<numtyp4> coeff_amclass;
  /// Special amoeba values [0-4]:
  /// sp_amoeba.x = special_hal
  /// sp_amoeba.y = special_polar_pscale,
  /// sp_amoeba.z = special_polar_piscale
  /// sp_amoeba.w = special_mpole
  UCL_D_Vec<numtyp4> sp_amoeba;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

  numtyp _polar_dscale, _polar_uscale;
  numtyp _qqrd2e;

 protected:
  /// True after init() allocated the coefficient tables; cleared by clear()
  bool _allocated;
  int multipole_real(const int eflag, const int vflag);
  int udirect2b(const int eflag, const int vflag);
  int umutual2b(const int eflag, const int vflag);
  int polar_real(const int eflag, const int vflag);
};

}
#endif

213
lib/gpu/lal_amoeba_ext.cpp Normal file
View File

@ -0,0 +1,213 @@
/***************************************************************************
amoeba_ext.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Functions for LAMMPS access to amoeba acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include <iostream>
#include <cassert>
#include <cmath>
#include "lal_amoeba.h"
using namespace std;
using namespace LAMMPS_AL;
static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
// Allocate host/device memory and copy AMOEBA constants to the device.
// Follows the standard GPU-package staging pattern: process 0 initializes
// (and compiles kernels) first behind a world barrier, then the remaining
// ranks initialize one GPU-rank at a time behind gpu barriers, so kernel
// compilation happens only once per node. Returns the init status code
// (0 on success) and reports the device mode via gpu_mode.
int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                    const double *host_pdamp, const double *host_thole,
                    const double *host_dirdamp, const int *host_amtype2class,
                    const double *host_special_hal,
                    const double *host_special_repel,
                    const double *host_special_disp,
                    const double *host_special_mpole,
                    const double *host_special_polar_wscale,
                    const double *host_special_polar_piscale,
                    const double *host_special_polar_pscale,
                    const double *host_csix, const double *host_adisp,
                    const int nlocal, const int nall, const int max_nbors,
                    const int maxspecial, const int maxspecial15,
                    const double cell_size, int &gpu_mode, FILE *screen,
                    const double polar_dscale, const double polar_uscale) {
  AMOEBAMF.clear();
  gpu_mode=AMOEBAMF.device->gpu_mode();
  double gpu_split=AMOEBAMF.device->particle_split();
  int first_gpu=AMOEBAMF.device->first_device();
  int last_gpu=AMOEBAMF.device->last_device();
  int world_me=AMOEBAMF.device->world_me();
  int gpu_rank=AMOEBAMF.device->gpu_rank();
  int procs_per_gpu=AMOEBAMF.device->procs_per_gpu();

  AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu);

  // Only one rank per replica prints progress messages
  bool message=false;
  if (AMOEBAMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // World rank 0 initializes first so kernels are compiled exactly once
  int init_ok=0;
  if (world_me==0)
    init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
                          host_pdamp, host_thole, host_dirdamp,
                          host_amtype2class, host_special_hal,
                          host_special_repel, host_special_disp,
                          host_special_mpole, host_special_polar_wscale,
                          host_special_polar_piscale, host_special_polar_pscale,
                          host_csix, host_adisp, nlocal, nall, max_nbors,
                          maxspecial, maxspecial15, cell_size, gpu_split,
                          screen, polar_dscale, polar_uscale);

  AMOEBAMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Remaining ranks initialize in gpu-rank order, one wave at a time
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
                            host_pdamp, host_thole, host_dirdamp,
                            host_amtype2class, host_special_hal,
                            host_special_repel, host_special_disp,
                            host_special_mpole, host_special_polar_wscale,
                            host_special_polar_piscale, host_special_polar_pscale,
                            host_csix, host_adisp, nlocal, nall, max_nbors,
                            maxspecial, maxspecial15, cell_size, gpu_split,
                            screen, polar_dscale, polar_uscale);

    AMOEBAMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    AMOEBAMF.estimate_gpu_overhead();
  return init_ok;
}
// Free all host and device data held by the global AMOEBA instance.
void amoeba_gpu_clear() {
  AMOEBAMF.clear();
}
// C-linkage wrapper for BaseAmoeba::precompute: resizes per-atom arrays,
// transfers extra data and builds neighbor lists for the time step.
// The uind/uinp/pval pointers are intentionally dropped (nullptr) here;
// they are supplied later by the per-kernel compute wrappers.
int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall,
                            double **host_x, int *host_type, int *host_amtype,
                            int *host_amgroup, double **host_rpole, double ** /*host_uind*/,
                            double ** /*host_uinp*/, double * /*host_pval*/,
                            double *sublo, double *subhi, tagint *tag,
                            int **nspecial, tagint **special,
                            int *nspecial15, tagint **special15,
                            const bool eflag_in, const bool vflag_in,
                            const bool eatom, const bool vatom, int &host_start,
                            int **ilist, int **jnum, const double cpu_time,
                            bool &success, double *host_q, double *boxlo, double *prd) {
  return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type,
                             host_amtype, host_amgroup, host_rpole,
                             nullptr, nullptr, nullptr, sublo, subhi, tag,
                             nspecial, special, nspecial15, special15,
                             eflag_in, vflag_in, eatom, vatom,
                             host_start, ilist, jnum, cpu_time,
                             success, host_q, boxlo, prd);
}
// Wrapper for the multipole real-space computation; the per-atom result is
// returned to the host through tep_ptr (pval slot passed as nullptr).
void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
           const int nall, double **host_x, int *host_type,
           int *host_amtype, int *host_amgroup, double **host_rpole,
           double *sublo, double *subhi, tagint *tag, int **nspecial,
           tagint **special, int *nspecial15, tagint** special15,
           const bool eflag, const bool vflag, const bool eatom,
           const bool vatom, int &host_start,
           int **ilist, int **jnum, const double cpu_time,
           bool &success, const double aewald, const double felec, const double off2,
           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
    host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi,
    tag, nspecial, special, nspecial15, special15,
    eflag, vflag, eatom, vatom, host_start, ilist, jnum,
    cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
}
// Wrapper for the permanent (direct) field computation; the field/fieldp
// arrays are returned through fieldp_ptr (pval slot passed as nullptr).
void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp,
                                  const double aewald, const double off2, void **fieldp_ptr) {
  AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
                             aewald, off2, fieldp_ptr);
}
// Wrapper for the induced (mutual) field computation; results returned
// through fieldp_ptr (pval slot passed as nullptr).
void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp,
                                  const double aewald, const double off2, void **fieldp_ptr) {
  AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
                             aewald, off2, fieldp_ptr);
}
// Copy the device-side field/fieldp accumulators back to the host pointer.
void amoeba_gpu_update_fieldp(void **fieldp_ptr) {
  AMOEBAMF.update_fieldp(fieldp_ptr);
}
// Wrapper for the polar real-space computation; the per-atom result is
// returned through tep_ptr (pval slot passed as nullptr).
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                   double **host_uind, double **host_uinp,
                                   const bool eflag_in, const bool vflag_in,
                                   const bool eatom, const bool vatom,
                                   const double aewald, const double felec, const double off2,
                                   void **tep_ptr) {
  AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
                              eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
// Transfer k-space spline coefficients (thetai1..3), grid indices and the
// out-of-range grid bounds to the device ahead of the PME grid kernels.
void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
                                  double ***host_thetai1, double ***host_thetai2,
                                  double ***host_thetai3, int** igrid,
                                  const int nzlo_out, const int nzhi_out,
                                  const int nylo_out, const int nyhi_out,
                                  const int nxlo_out, const int nxhi_out) {
  AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid,
                             nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out);
}
// Wrapper for the induced-dipole grid-to-potential (fphi_uind) kernel.
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
                          void **host_fdip_phi2, void **host_fdip_sum_phi) {
  AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1,
                             host_fdip_phi2, host_fdip_sum_phi);
}
// Wrapper for the multipole grid-to-potential (fphi_mpole) kernel.
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) {
  AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec);
}
// Prepare the device FFT plan for the given element count and type.
void amoeba_setup_fft(const int numel, const int element_type) {
  AMOEBAMF.setup_fft(numel, element_type);
}
// Run a 1-D FFT on the device; mode selects forward/backward transform.
void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) {
  AMOEBAMF.compute_fft1d(in, out, numel, mode);
}
// Report host memory used by the AMOEBA acceleration library.
double amoeba_gpu_bytes() {
  return AMOEBAMF.host_memory_usage();
}

View File

@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const {
bytes+=sizeof(numtyp);
if (_vel)
bytes+=4*sizeof(numtyp);
if (_extra_fields>0)
bytes+=_extra_fields*sizeof(numtyp4);
return bytes;
}
@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) {
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=v.device.row_bytes();
}
if (_extra_fields>0) {
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=extra.device.row_bytes();
}
if (_gpu_nbor>0) {
if (_bonds) {
@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) {
template <class numtyp, class acctyp>
bool AtomT::add_fields(const bool charge, const bool rot,
const int gpu_nbor, const bool bonds, const bool vel) {
const int gpu_nbor, const bool bonds, const bool vel,
const int extra_fields) {
bool success=true;
// Ignore host/device transfers?
int gpu_bytes=0;
@ -191,7 +199,17 @@ bool AtomT::add_fields(const bool charge, const bool rot,
}
}
if (bonds && !_bonds) {
if (extra_fields > 0 && _extra_fields==0) {
_extra_fields=extra_fields;
_other=true;
if (_host_view==false) {
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=extra.device.row_bytes();
}
}
if (bonds && _bonds==false) {
_bonds=true;
if (_bonds && _gpu_nbor>0) {
success=success && (dev_tag.alloc(_max_atoms,*dev,
@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
template <class numtyp, class acctyp>
bool AtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel,
const int extra_fields) {
clear();
bool success=true;
@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
_q_avail=false;
_quat_avail=false;
_v_avail=false;
_extra_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
_rot=rot;
_vel=vel;
_other=_charge || _rot || _vel;
_extra_fields=extra_fields;
_other=_charge || _rot || _vel || (extra_fields>0);
dev=&devi;
_time_transfer=0;
@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
time_q.init(*dev);
time_quat.init(*dev);
time_vel.init(*dev);
time_extra.init(*dev);
time_pos.zero();
time_q.zero();
time_quat.zero();
time_vel.zero();
time_extra.zero();
_time_cast=0.0;
#ifdef GPU_CAST
@ -308,6 +333,8 @@ void AtomT::clear_resize() {
quat.clear();
if (_vel)
v.clear();
if (_extra_fields>0)
extra.clear();
dev_cell_id.clear();
dev_particle_id.clear();
@ -350,6 +377,7 @@ void AtomT::clear() {
time_q.clear();
time_quat.clear();
time_vel.clear();
time_extra.clear();
clear_resize();
#ifdef GPU_CAST
@ -370,12 +398,19 @@ double AtomT::host_memory_usage() const {
atom_bytes+=4;
if (_vel)
atom_bytes+=4;
if (_extra_fields>0)
atom_bytes+=_extra_fields;
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
}
#ifdef USE_CUDPP
#define USE_CUDPP_ARG(arg) arg
#else
#define USE_CUDPP_ARG(arg)
#endif
// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void AtomT::sort_neighbor(const int num_atoms) {
void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) {
#ifdef USE_CUDPP
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin(),

View File

@ -76,7 +76,7 @@ class Atom {
* gpu_nbor 2 if binning on host and neighboring on device **/
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
const bool vel=false);
const bool vel=false, const int extra_fields=0);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
@ -96,7 +96,7 @@ class Atom {
* gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
const bool bonds, const bool vel=false);
const bool bonds, const bool vel=false, const int extra_fields=0);
/// Returns true if GPU is using charges
bool charge() { return _charge; }
@ -107,6 +107,9 @@ class Atom {
/// Returns true if GPU is using velocities
bool velocity() { return _vel; }
/// Returns true if GPU is using extra fields
bool using_extra() { return (_extra_fields>0); }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
@ -128,6 +131,8 @@ class Atom {
time_quat.add_to_total();
if (_vel)
time_vel.add_to_total();
if (_extra_fields>0)
time_extra.add_to_total();
}
/// Add copy times to timers
@ -139,6 +144,8 @@ class Atom {
time_quat.zero();
if (_vel)
time_vel.zero();
if (_extra_fields>0)
time_extra.zero();
}
/// Return the total time for host/device data transfer
@ -158,6 +165,10 @@ class Atom {
total+=time_vel.total_seconds();
time_vel.zero_total();
}
if (_extra_fields>0) {
total+=time_extra.total_seconds();
time_extra.zero_total();
}
return total+_time_transfer/1000.0;
}
@ -281,7 +292,11 @@ class Atom {
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; }
/// Signal that we need to transfer atom extra data for next kernel call
inline void extra_data_unavail()
{ _extra_avail=false; }
typedef struct { double x,y,z; } vec3d;
typedef struct { numtyp x,y,z,w; } vec4d_t;
@ -312,7 +327,7 @@ class Atom {
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) {
time_pos.start();
if (_x_avail==false) {
#ifdef GPU_CAST
@ -426,7 +441,7 @@ class Atom {
/// Copy velocities and tags to device asynchronously
/** Copies nall() elements **/
inline void add_v_data(double **host_ptr, tagint *host_tag) {
inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
time_vel.start();
if (_v_avail==false) {
#ifdef GPU_CAST
@ -450,6 +465,33 @@ class Atom {
add_v_data(host_ptr,host_tag);
}
// Cast extras to write buffer
template<class cpytyp>
inline void cast_extra_data(cpytyp *host_ptr) {
if (_extra_avail==false) {
double t=MPI_Wtime();
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
#pragma omp parallel for simd schedule(static)
#elif (LAL_USE_OMP_SIMD == 1)
#pragma omp simd
#endif
for (int i=0; i<_nall*_extra_fields; i++)
extra[i]=host_ptr[i];
_time_cast+=MPI_Wtime()-t;
}
}
// Copy extras to device
/** Copies nall()*_extra elements **/
inline void add_extra_data() {
time_extra.start();
if (_extra_avail==false) {
extra.update_device(_nall*_extra_fields,true);
_extra_avail=true;
}
time_extra.stop();
}
/// Add in casting time from additional data (seconds)
inline void add_cast_time(double t) { _time_cast+=t; }
@ -473,6 +515,8 @@ class Atom {
UCL_Vector<numtyp,numtyp> quat;
/// Velocities
UCL_Vector<numtyp,numtyp> v;
/// Extras
UCL_Vector<numtyp4,numtyp4> extra;
#ifdef GPU_CAST
UCL_Vector<numtyp,numtyp> x_cast;
@ -493,7 +537,7 @@ class Atom {
UCL_H_Vec<int> host_particle_id;
/// Device timers
UCL_Timer time_pos, time_q, time_quat, time_vel;
UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra;
/// Geryon device
UCL_Device *dev;
@ -508,11 +552,12 @@ class Atom {
bool _compiled;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized;
bool alloc(const int nall);
bool _allocated, _rot, _charge, _bonds, _vel, _other;
int _extra_fields;
int _max_atoms, _nall, _gpu_nbor;
bool _host_view;
double _time_cast, _time_transfer;

962
lib/gpu/lal_base_amoeba.cpp Normal file
View File

@ -0,0 +1,962 @@
/***************************************************************************
base_amoeba.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
pair_program=nullptr;
ucl_device=nullptr;
}
template <class numtyp, class acctyp>
BaseAmoebaT::~BaseAmoeba() {
delete ans;
delete nbor;
k_multipole.clear();
k_udirect2b.clear();
k_umutual2b.clear();
k_fphi_uind.clear();
k_fphi_mpole.clear();
k_polar.clear();
k_special15.clear();
k_short_nbor.clear();
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
if (fft_plan_created) cufftDestroy(plan);
#endif
if (pair_program) delete pair_program;
}
template <class numtyp, class acctyp>
int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const int maxspecial15,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name_multipole,
const char *k_name_udirect2b,
const char *k_name_umutual2b,
const char *k_name_polar,
const char *k_name_fphi_uind,
const char *k_name_fphi_mpole,
const char *k_name_short_nbor,
const char* k_name_special15) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
bool charge = true;
bool rot = false;
bool vel = false;
_extra_fields = 24; // round up to accomodate quadruples of numtyp values
// rpole 13; uind 3; uinp 3; amtype, amgroup; pval
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name_multipole,
k_name_udirect2b, k_name_umutual2b,k_name_polar,
k_name_fphi_uind, k_name_fphi_mpole,
k_name_short_nbor, k_name_special15);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else {
_nbor_data=&(nbor->dev_nbor);
}
bool alloc_packed=false;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
_gpu_host,max_nbors,cell_size,alloc_packed,
_threads_per_atom);
if (success!=0)
return success;
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
_maxspecial=maxspecial;
_maxspecial15=maxspecial15;
// allocate per-atom array tep
int ef_nall=nlocal; //nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_fieldp_size = _max_tep_size;
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_thetai_size = 0;
_nmax = nall;
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
fft_plan_created = false;
#endif
#ifdef ASYNC_DEVICE_COPY
_end_command_queue=ucl_device->num_queues();
ucl_device->push_command_queue();
#endif
return success;
}
template <class numtyp, class acctyp>
void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseAmoebaT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
time_pair.clear();
hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear();
ans->clear();
_tep.clear();
_fieldp.clear();
_thetai1.clear();
_thetai2.clear();
_thetai3.clear();
_igrid.clear();
_fdip_phi1.clear();
_fdip_phi2.clear();
_fdip_sum_phi.clear();
_cgrid_brick.clear();
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return nullptr;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return 0;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
// add one-five neighbors
if (_maxspecial15>0) {
UCL_H_Vec<int> view_nspecial15;
UCL_H_Vec<tagint> view_special15;
view_nspecial15.view(nspecial15,nall,*ucl_device);
view_special15.view(special15[0],nall*_maxspecial15,*ucl_device);
ucl_copy(dev_nspecial15,view_nspecial15,nall,false);
ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false);
nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall);
add_onefive_neighbors();
}
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return mn;
}
// ---------------------------------------------------------------------------
// Prepare for multiple kernel calls in a time step:
// - reallocate per-atom arrays, if needed
// - transfer extra data from host to device
// - build the full neighbor lists for use by different kernels
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **&ilist, int **&jnum, const double cpu_time,
bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) {
acc_timers();
if (eatom) _eflag=2;
else if (eflag_in) _eflag=1;
else _eflag=0;
if (vatom) _vflag=2;
else if (vflag_in) _vflag=1;
else _vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (_eflag) _eflag=2;
if (_vflag) _vflag=2;
#endif
set_kernel(_eflag,_vflag);
// ------------------- Resize 1-5 neighbor arrays ------------------------
if (nall>_nmax) {
_nmax = nall;
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
}
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return nullptr;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, nspecial15, special15,
success);
if (!success)
return nullptr;
atom->cast_q_data(host_q);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_q_data();
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
atom->add_extra_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
// re-allocate dev_short_nbor if necessary
if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(inum_full)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Compute multipole real-space part
// precompute() should be already invoked before mem (re)allocation
// this is the first part in a time step done on the GPU for AMOEBA for now
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
                                         const int /*nall*/, double ** /*host_x*/,
                                         int * /*host_type*/, int * /*host_amtype*/,
                                         int * /*host_amgroup*/, double ** /*host_rpole*/,
                                         double */*host_pval*/, double * /*sublo*/,
                                         double * /*subhi*/, tagint * /*tag*/,
                                         int ** /*nspecial*/, tagint ** /*special*/,
                                         int * /*nspecial15*/, tagint ** /*special15*/,
                                         const bool /*eflag_in*/, const bool /*vflag_in*/,
                                         const bool /*eatom*/, const bool /*vatom*/,
                                         int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                                         const double /*cpu_time*/, bool & /*success*/,
                                         const double aewald, const double felec,
                                         const double off2_mpole, double * /*host_q*/,
                                         double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {
  // ------------------- Resize _tep array ------------------------
  // _tep holds 4 values per atom; grow with 10% padding when inum_full exceeds
  // the current capacity
  if (inum_full>_max_tep_size) {
    _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    _tep.resize(_max_tep_size*4);
  }
  // expose the host-side tep buffer to the caller
  *tep_ptr=_tep.host.begin();

  // cutoff and Ewald parameters used by the multipole real-space kernel
  _off2_mpole = off2_mpole;
  _felec = felec;
  _aewald = aewald;
  multipole_real(_eflag,_vflag);

  // leave the answers (forces, energies and virial) on the device,
  // only copy them back in the last kernel (polar_real)
  //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  //device->add_ans_object(ans);

  // copy tep from device to host
  _tep.update_host(_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute the direct real space part
// of the permanent field
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                    double **host_uind, double **host_uinp, double *host_pval,
                                    const double aewald, const double off2_polar,
                                    void** fieldp_ptr) {
  // all the necessary data arrays are already copied from host to device
  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  // expose the host-side field buffer to the caller
  *fieldp_ptr=_fieldp.host.begin();

  // specify the correct cutoff and alpha values
  _off2_polar = off2_polar;
  _aewald = aewald;
  udirect2b(_eflag,_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
  // 8 = two arrays of presumably 4 entries per atom — confirm against kernel layout
  _fieldp.update_host(_max_fieldp_size*8,false);
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute the direct real space part
// of the induced field
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/,
                                    double **host_uind, double **host_uinp, double * /*host_pval*/,
                                    const double aewald, const double off2_polar,
                                    void** /*fieldp_ptr*/) {
  // only copy the necessary data arrays that are updated over the iterations
  // use nullptr for the other arrays that are already copied from host to device
  // (rpole and pval do not change between induce() iterations)
  cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr);
  atom->add_extra_data();

  // set the correct cutoff and alpha
  _off2_polar = off2_polar;
  _aewald = aewald;

  // launch the kernel
  umutual2b(_eflag,_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
  // NOTE: move this step to update_fieldp() to delay device-host transfer
  //   after umutual1 and self are done on the GPU
  // *fieldp_ptr=_fieldp.host.begin();
  // _fieldp.update_host(_max_fieldp_size*8,false);
}
// ---------------------------------------------------------------------------
// Prepare for umutual1() after bspline_fill() is done on host
// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed
// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
// host_igrid is allocated with nmax by 4
// - transfer extra data from host to device
// NOTE: can be re-used for fphi_mpole() but with a different bsorder value
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
                                    double ***host_thetai1, double ***host_thetai2,
                                    double ***host_thetai3, int** host_igrid,
                                    const int nzlo_out, const int nzhi_out,
                                    const int nylo_out, const int nyhi_out,
                                    const int nxlo_out, const int nxhi_out) {
  // update bsorder with that of the kspace solver
  _bsorder = bsorder;

  // allocate or resize per-atom arrays
  // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
  // will be consolidated once all terms are ready
  if (_max_thetai_size == 0) {
    // first call: allocate everything with 10% padding over inum_full
    _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
    _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
    _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
    _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE);
  } else {
    // subsequent calls: grow only when the theta arrays are too small
    if ((int)_thetai1.cols()<_max_thetai_size*bsorder) {
      _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
      _thetai1.resize(_max_thetai_size*bsorder);
      _thetai2.resize(_max_thetai_size*bsorder);
      _thetai3.resize(_max_thetai_size*bsorder);
      _igrid.resize(_max_thetai_size*4);
      _fdip_phi1.resize(_max_thetai_size*10);
      _fdip_phi2.resize(_max_thetai_size*10);
      _fdip_sum_phi.resize(_max_thetai_size*20);
    }
  }

#ifdef ASYNC_DEVICE_COPY
  // stage the theta transfers on a separate command queue
  _thetai1.cq(ucl_device->cq(_end_command_queue));
  _thetai2.cq(ucl_device->cq(_end_command_queue));
  _thetai3.cq(ucl_device->cq(_end_command_queue));
#endif

  // pack host data to device: each (atom, bspline-order) pair becomes one
  // numtyp4 holding the 4 per-level B-spline coefficients
  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai1[i][j][0];
      v.y = host_thetai1[i][j][1];
      v.z = host_thetai1[i][j][2];
      v.w = host_thetai1[i][j][3];
      _thetai1[idx] = v;
    }
  _thetai1.update_device(true);

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai2[i][j][0];
      v.y = host_thetai2[i][j][1];
      v.z = host_thetai2[i][j][2];
      v.w = host_thetai2[i][j][3];
      _thetai2[idx] = v;
    }
  _thetai2.update_device(true);

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai3[i][j][0];
      v.y = host_thetai3[i][j][1];
      v.z = host_thetai3[i][j][2];
      v.w = host_thetai3[i][j][3];
      _thetai3[idx] = v;
    }
  _thetai3.update_device(true);

  // igrid uses a stride of 4 per atom but only 3 components are written;
  // the 4th slot is left untouched (presumably padding — confirm with kernel)
  for (int i = 0; i < inum_full; i++) {
    int idx = i*4;
    _igrid[idx+0] = host_igrid[i][0];
    _igrid[idx+1] = host_igrid[i][1];
    _igrid[idx+2] = host_igrid[i][2];
  }
  _igrid.update_device(true);

  // _cgrid_brick holds the grid-based potential
  // record the owned+ghost grid extents for later indexing
  _nzlo_out = nzlo_out;
  _nzhi_out = nzhi_out;
  _nylo_out = nylo_out;
  _nyhi_out = nyhi_out;
  _nxlo_out = nxlo_out;
  _nxhi_out = nxhi_out;
  _ngridz = nzhi_out - nzlo_out + 1;
  _ngridy = nyhi_out - nylo_out + 1;
  _ngridx = nxhi_out - nxlo_out + 1;
  _num_grid_points = _ngridx * _ngridy * _ngridz;

  int numel = _num_grid_points;
  if (_cgrid_brick.cols() == 0) {
    // first allocation with 10% padding
    int nsize=(int)(((double)numel)*1.1);
    _cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
  } else if (numel > (int)_cgrid_brick.cols()) {
    _cgrid_brick.resize(numel);
  }
}
// ---------------------------------------------------------------------------
// fphi_uind = induced potential from grid
// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
// NOTE: host_grid_brick is from ic_kspace post_convolution()
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
                                    void **host_fdip_phi1,
                                    void **host_fdip_phi2,
                                    void **host_fdip_sum_phi)
{
  // pack the complex-valued host grid (last index: 0=real, 1=imag) into the
  // flat device brick, iterating z-slowest to match _num_grid_points order
  int n = 0;
  for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
    for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
      for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
        numtyp2 v;
        v.x = host_grid_brick[iz][iy][ix][0];
        v.y = host_grid_brick[iz][iy][ix][1];
        _cgrid_brick[n] = v;
        n++;
      }
  _cgrid_brick.update_device(_num_grid_points, false);

#ifdef ASYNC_DEVICE_COPY
  // make sure any staged async transfers have completed before launching
  ucl_device->sync();
#endif

  // launch the kernel with its execution configuration (see below)
  fphi_uind();

  // copy data from device to host
  _fdip_phi1.update_host(_max_thetai_size*10, false);
  _fdip_phi2.update_host(_max_thetai_size*10, false);
  _fdip_sum_phi.update_host(_max_thetai_size*20, false);

  // return the pointers to the host-side arrays
  *host_fdip_phi1 = _fdip_phi1.host.begin();
  *host_fdip_phi2 = _fdip_phi2.host.begin();
  *host_fdip_sum_phi = _fdip_sum_phi.host.begin();
}
// ---------------------------------------------------------------------------
// Interpolate the potential from the PME grid
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::fphi_uind() {
int ainum=ans->inum();
if (ainum == 0)
return 0;
// Compute the block size and grid size to keep all cores busy
const int BX=block_size();
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
time_pair.start();
int ngridxy = _ngridx * _ngridy;
k_fphi_uind.set_size(GX,BX);
k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
&_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum,
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// fphi_mpole = multipole potential from grid (limited to polar_kspace for now)
// fphi_mpole extracts the permanent multipole potential from
// the particle mesh Ewald grid
// NOTE: host_grid_brick is from p_kspace post_convolution()
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec)
{
int n = 0;
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
numtyp2 v;
v.x = host_grid_brick[iz][iy][ix];
v.y = (numtyp)0;
_cgrid_brick[n] = v;
n++;
}
_cgrid_brick.update_device(_num_grid_points, false);
_felec = felec;
fphi_mpole();
_fdip_sum_phi.update_host(_max_thetai_size*20, false);
*host_fphi = _fdip_sum_phi.host.begin();
}
// ---------------------------------------------------------------------------
// Interpolate the potential from the PME grid
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::fphi_mpole() {
int ainum=ans->inum();
if (ainum == 0)
return 0;
// Compute the block size and grid size to keep all cores busy
const int BX=block_size();
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
time_pair.start();
int ngridxy = _ngridx * _ngridy;
k_fphi_mpole.set_size(GX,BX);
k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
&_fdip_sum_phi, &_bsorder, &ainum, &_felec,
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
                                     double **host_rpole, double **host_uind,
                                     double **host_uinp, double *host_pval,
                                     const bool eflag_in, const bool vflag_in,
                                     const bool eatom, const bool vatom,
                                     const double aewald, const double felec,
                                     const double off2_polar, void **tep_ptr) {
  // cast necessary data arrays from host to device
  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  // expose the host-side tep buffer to the caller
  *tep_ptr=_tep.host.begin();

  // cutoff and Ewald parameters for the polar real-space kernel
  _off2_polar = off2_polar;
  _felec = felec;
  _aewald = aewald;
  const int red_blocks=polar_real(_eflag,_vflag);

  // only copy answers (forces, energies and virial) back from the device
  // in the last kernel (which is polar_real here)
  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  device->add_ans_object(ans);

  // copy tep from device to host
  _tep.update_host(_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Return the memory bytes allocated on the host and device
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Return the host memory (in bytes) used by this pair style:
// atom + neighbor storage, a few scalars, and the object itself.
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
double BaseAmoebaT::host_memory_usage_atomic() const {
  const double atom_bytes = device->atom.host_memory_usage();
  const double nbor_bytes = nbor->host_memory_usage();
  return atom_bytes + nbor_bytes + 4*sizeof(numtyp) +
         sizeof(BaseAmoeba<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Setup the FFT plan: only placeholder for now
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/)
{
  // Intentionally empty placeholder: arguments are ignored for now.
  // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT)
}
// ---------------------------------------------------------------------------
// Compute FFT on the device: only placeholder for now
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/,
                                const int /*numel*/, const int /*mode*/)
{
  // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT)
  // NOTE: the cuFFT implementation below is currently disabled via "#if 0";
  //   it packs the interleaved real/imag host array into cufftDoubleComplex,
  //   runs an in-place forward Z2Z transform and copies the result back
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
  if (fft_plan_created == false) {
    int m = numel/2;
    cufftPlan1d(&plan, m, CUFFT_Z2Z, 1);
    fft_plan_created = true;
  }

  // n = number of double complex
  int n = numel/2;

  // copy the host array to the device (data)
  UCL_Vector<cufftDoubleComplex,cufftDoubleComplex> data;
  data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE);
  int m = 0;
  double* d_in = (double*)in;
  for (int i = 0; i < n; i++) {
    data[i].x = d_in[m];
    data[i].y = d_in[m+1];
    m += 2;
  }
  data.update_device(false);

  // perform the in-place forward FFT
  cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device,
                                    (cufftDoubleComplex*)&data.device, CUFFT_FORWARD);
  if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result);
  ucl_device->sync();
  data.update_host(false);

  // copy back the data to the host array
  m = 0;
  double* d_out = (double*)out;
  for (int i = 0; i < n; i++) {
    d_out[m] = data[i].x;
    d_out[m+1] = data[i].y;
    m += 2;
  }
  data.clear();
#endif
}
// ---------------------------------------------------------------------------
// Copy the extra data from host to device
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Pack the AMOEBA per-atom host arrays into atom->extra as six consecutive
// numtyp4 sections of nall entries each:
//   section 0: rpole[0..3]
//   section 1: rpole[4], rpole[5], rpole[6], rpole[8]
//   section 2: rpole[9], rpole[12], amtype, amgroup
//   section 3: uind[0..2]  (w unused, set to 0)
//   section 4: uinp[0..2]  (w unused, set to 0)
//   section 5: pval        (y,z,w unused, set to 0)
// A nullptr argument skips that section's copy but still advances the offset,
// so previously transferred data stays in place.
// NOTE(review): rpole components 7, 10 and 11 are never packed — presumably
//   redundant entries of the symmetric quadrupole tensor; confirm with kernels.
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
                                  double** uind, double** uinp, double* pval) {
  // signal that we need to transfer extra data from the host
  atom->extra_data_unavail();

  int _nall=atom->nall();
  numtyp4 *pextra=reinterpret_cast<numtyp4*>(&(atom->extra[0]));

  int n = 0;
  int nstride = 1; //4;
  if (rpole) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][0];
      pextra[idx].y = rpole[i][1];
      pextra[idx].z = rpole[i][2];
      pextra[idx].w = rpole[i][3];
    }

    n += nstride*_nall;
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][4];
      pextra[idx].y = rpole[i][5];
      pextra[idx].z = rpole[i][6];
      pextra[idx].w = rpole[i][8];
    }

    n += nstride*_nall;
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][9];
      pextra[idx].y = rpole[i][12];
      pextra[idx].z = (numtyp)amtype[i];
      pextra[idx].w = (numtyp)amgroup[i];
    }
  } else {
    // rpole unchanged: skip the first two sections...
    n += 2*nstride*_nall;
  }

  // ...and the third (offset advances whether or not rpole was packed)
  n += nstride*_nall;
  if (uind) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = uind[i][0];
      pextra[idx].y = uind[i][1];
      pextra[idx].z = uind[i][2];
      pextra[idx].w = 0;
    }
  }

  n += nstride*_nall;
  if (uinp) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = uinp[i][0];
      pextra[idx].y = uinp[i][1];
      pextra[idx].z = uinp[i][2];
      pextra[idx].w = 0;
    }
  }

  n += nstride*_nall;
  if (pval) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = pval[i];
      pextra[idx].y = 0;
      pextra[idx].z = 0;
      pextra[idx].w = 0;
    }
  }
}
// ---------------------------------------------------------------------------
// Compile (load) the kernel strings and set the kernels
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                  const char *kname_multipole,
                                  const char *kname_udirect2b,
                                  const char *kname_umutual2b,
                                  const char *kname_polar,
                                  const char *kname_fphi_uind,
                                  const char *kname_fphi_mpole,
                                  const char *kname_short_nbor,
                                  const char* kname_special15) {
  if (_compiled)
    return;

  // discard any previous program before (re)building
  if (pair_program) delete pair_program;
  pair_program=new UCL_Program(dev);
  // EVFLAG=1 enables energy/virial accumulation in the kernels
  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
  pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen);

  // bind one kernel object per named entry point
  k_multipole.set_function(*pair_program, kname_multipole);
  k_udirect2b.set_function(*pair_program, kname_udirect2b);
  k_umutual2b.set_function(*pair_program, kname_umutual2b);
  k_polar.set_function(*pair_program, kname_polar);
  k_fphi_uind.set_function(*pair_program, kname_fphi_uind);
  k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole);
  k_short_nbor.set_function(*pair_program, kname_short_nbor);
  k_special15.set_function(*pair_program, kname_special15);
  pos_tex.get_texture(*pair_program, "pos_tex");
  q_tex.get_texture(*pair_program, "q_tex");

  _compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
  // clamp the per-atom thread count to the device's subgroup size so
  // subgroup reductions in the polar kernel stay within one subgroup
  if (dev.has_subgroup_support()) {
    int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size);
    if (_threads_per_atom > mx_subgroup_sz)
      _threads_per_atom = mx_subgroup_sz;
    device->set_simd_size(mx_subgroup_sz);
  }
#endif
}
// ---------------------------------------------------------------------------
// Specify 1-5 neighbors from the current neighbor list
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Launch the kernel that merges the 1-5 special neighbors into the device
// neighbor list.  Returns the number of thread blocks launched.
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::add_onefive_neighbors() {
  // launch configuration: _threads_per_atom threads cooperate per atom
  const int block_x = block_size();
  int grid_x = static_cast<int>(ceil(static_cast<double>(ans->inum())/
                                     (block_x/_threads_per_atom)));

  int nall = atom->nall();
  int natoms = ans->inum();
  int nbor_pitch = nbor->nbor_pitch();

  k_special15.set_size(grid_x,block_x);
  k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(),
                  &atom->dev_tag, &dev_nspecial15, &dev_special15,
                  &natoms, &nall, &nbor_pitch,
                  &_threads_per_atom);

  return grid_x;
}
template class BaseAmoeba<PRECISION,ACC_PRECISION>;
}

325
lib/gpu/lal_base_amoeba.h Normal file
View File

@ -0,0 +1,325 @@
/***************************************************************************
base_amoeba.h
-------------------
Trung Dac Nguyen (Northwestern)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_BASE_AMOEBA_H
#define LAL_BASE_AMOEBA_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
//#define ASYNC_DEVICE_COPY
#if !defined(USE_OPENCL) && !defined(USE_HIP)
// temporary workaround for int2 also defined in cufft
#ifdef int2
#undef int2
#endif
#include "cufft.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseAmoeba {
 public:
  BaseAmoeba();
  virtual ~BaseAmoeba();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    * \param k_name name for the kernel for force calculation
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                  const int maxspecial, const int maxspecial15, const double cell_size,
                  const double gpu_split, FILE *screen, const void *pair_program,
                  const char *kname_multipole, const char *kname_udirect2b,
                  const char *kname_umutual2b, const char *kname_polar,
                  const char *kname_fphi_uind, const char *kname_fphi_mpole,
                  const char *kname_short_nbor, const char* kname_special15);

  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead(const int add_kernels=0);

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(nall, success)) {
      // rebind textures when the underlying device buffers were reallocated
      pos_tex.bind_float(atom->x,4);
      q_tex.bind_float(atom->q,1);
    }
    ans->resize(inum,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to copied to host
    * \param current maximum number of neighbors
    * \note olist_size=total number of local particles **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to copied to host
    * \param current maximum number of neighbors
    * \note host_inum is 0 if the host is performing neighboring
    * \note nlocal+host_inum=total number local particles
    * \note olist_size=0 **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear_atomic();

  /// Returns memory usage on device per atom
  int bytes_per_atom_atomic(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage_atomic() const;

  /// Accumulate timers
  inline void acc_timers() {
    if (device->time_device()) {
      nbor->acc_timers(screen);
      time_pair.add_to_total();
      atom->acc_timers();
      ans->acc_timers();
    }
  }

  /// Zero timers
  inline void zero_timers() {
    time_pair.zero();
    atom->zero_timers();
    ans->zero_timers();
  }

  /// Copy neighbor list from host
  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
                    int **firstneigh, bool &success);

  /// Build neighbor list on device
  int build_nbor_list(const int inum, const int host_inum,
                      const int nall, double **host_x, int *host_type,
                      double *sublo, double *subhi, tagint *tag, int **nspecial,
                      tagint **special, int *nspecial15, tagint **special15,
                      bool &success);

  /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed
  virtual int** precompute(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
                           int *host_amgroup, double **host_rpole, double **host_uind,
                           double **host_uinp, double *host_pval, double *sublo, double *subhi,
                           tagint *tag, int **nspecial, tagint **special,
                           int *nspecial15, tagint **special15,
                           const bool eflag, const bool vflag,
                           const bool eatom, const bool vatom, int &host_start,
                           int **&ilist, int **&numj, const double cpu_time, bool &success,
                           double *charge, double *boxlo, double *prd);

  /// Compute multipole real-space with device neighboring
  virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
                                      double **host_x, int *host_type, int *host_amtype,
                                      int *host_amgroup, double **host_rpole, double *host_pval,
                                      double *sublo, double *subhi, tagint *tag,
                                      int **nspecial, tagint **special, int *nspecial15, tagint **special15,
                                      const bool eflag, const bool vflag, const bool eatom, const bool vatom,
                                      int &host_start, int **ilist, int **numj, const double cpu_time,
                                      bool &success, const double aewald, const double felec,
                                      const double off2_mpole, double *charge, double *boxlo,
                                      double *prd, void **tep_ptr);

  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
  virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2_polar, void **fieldp_ptr);

  /// Compute the real space part of the induced field (umutual2b) with device neighboring
  virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2_polar, void **fieldp_ptr);

  /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
  virtual void precompute_kspace(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
                                 const int nzlo_out, const int nzhi_out,
                                 const int nylo_out, const int nyhi_out,
                                 const int nxlo_out, const int nxhi_out);

  /// Interpolate the induced potential from the grid
  virtual void compute_fphi_uind(double ****host_grid_brick,
                                 void **host_fdip_phi1, void **host_fdip_phi2,
                                 void **host_fdip_sum_phi);

  /// Interpolate the multipolar potential from the grid
  virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
                                  const double felec);

  /// Compute polar real-space with device neighboring
  virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag, const bool vflag,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2_polar,
                                  void **tep_ptr);

  /// copy field and fieldp from device to host after umutual2b
  virtual void update_fieldp(void **fieldp_ptr) {
    *fieldp_ptr=_fieldp.host.begin();
    // _fieldp store both arrays, one after another
    _fieldp.update_host(_max_fieldp_size*8,false);
  }

  /// setup a plan for FFT, where size is the number of elements
  void setup_fft(const int size, const int element_type=0);

  /// compute forward/backward FFT on the device
  void compute_fft1d(void* in, void* out, const int numel, const int mode);

  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  Device<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_pair;

  /// Host device load balancer
  Balance<numtyp,acctyp> hd_balancer;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> *atom;

  UCL_Vector<numtyp,numtyp> polar1, polar2, polar3, polar4, polar5;

  /// cast host arrays into a single array for atom->extra
  void cast_extra_data(int* amtype, int* amgroup, double** rpole,
                       double** uind, double** uinp, double* pval=nullptr);

  /// Per-atom arrays: _tep uses 4 entries/atom, _fieldp 8 entries/atom
  /// (field and fieldp stored back to back)
  UCL_Vector<acctyp,acctyp> _tep, _fieldp;
  int _nmax, _max_tep_size, _max_fieldp_size;

  // B-spline order and per-atom kspace work arrays (see precompute_kspace)
  int _bsorder;
  UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
  UCL_Vector<int,int> _igrid;
  UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
  UCL_Vector<acctyp,acctyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
  int _max_thetai_size;
  // owned+ghost PME grid extents and derived sizes
  int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
  int _ngridx, _ngridy, _ngridz, _num_grid_points;
  int _end_command_queue;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  Answer<numtyp,acctyp> *ans;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  Neighbor *nbor;
  /// Device storage for 1-5 special neighbor counts
  UCL_D_Vec<int> dev_nspecial15;
  /// Device storage for special neighbors
  UCL_D_Vec<tagint> dev_special15, dev_special15_t;
  int add_onefive_neighbors();

  /// Compact device neighbor list ((2+max_nbors) entries per atom)
  UCL_D_Vec<int> dev_short_nbor;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
  UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
  UCL_Kernel k_fphi_uind, k_fphi_mpole;
  UCL_Kernel k_special15, k_short_nbor;
  inline int block_size() { return _block_size; }
  inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {}

  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex;
  UCL_Texture q_tex;

 protected:
  bool _compiled;
  int _block_size, _block_bio_size, _threads_per_atom;
  int _extra_fields;
  double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors;
  double _gpu_overhead, _driver_overhead;
  bool short_nbor_polar_avail;
  UCL_D_Vec<int> *_nbor_data;

  // cached Ewald/cutoff parameters set by the compute_* entry points
  numtyp _aewald,_felec;
  numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar;
  int _eflag, _vflag;

  void compile_kernels(UCL_Device &dev, const void *pair_string,
                       const char *kname_multipole, const char *kname_udirect2b,
                       const char *kname_umutual2b, const char *kname_polar,
                       const char *kname_fphi_uind, const char *kname_fphi_mpole,
                       const char *kname_short_nbor, const char* kname_special15);

  virtual int multipole_real(const int eflag, const int vflag) = 0;
  virtual int udirect2b(const int eflag, const int vflag) = 0;
  virtual int umutual2b(const int eflag, const int vflag) = 0;
  virtual int fphi_uind();
  virtual int fphi_mpole();
  virtual int polar_real(const int eflag, const int vflag) = 0;

#if !defined(USE_OPENCL) && !defined(USE_HIP)
  cufftHandle plan;
#endif
  // set once the (currently disabled) cuFFT path in compute_fft1d creates its plan
  bool fft_plan_created;
};
}
#endif

View File

@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
bool charge = false;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
bool charge = true;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
bool charge = true;
bool rot = true;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
bool charge = false;
bool rot = false;
bool vel = true;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel);
if (success!=0)
return success;
@ -193,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall,
const double cpu_time, bool &success, tagint *tag,
double **host_v, const double dtinvsqrt,
const int seed, const int timestep,
const int nlocal, double *boxlo, double *prd) {
const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
@ -258,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
const double cpu_time, bool &success,
double **host_v, const double dtinvsqrt,
const int seed, const int timestep,
double *boxlo, double *prd) {
double * /*boxlo*/, double * /*prd*/) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;

View File

@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
else
_threads_per_atom=device->threads_per_three();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
bool charge = false;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -44,19 +44,15 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
int CHARMMLongT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double ** /*host_offset*/, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *_screen,
double host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,charmm_long,"k_charmm_long");

View File

@ -52,7 +52,7 @@ DeviceT::~Device() {
}
template <class numtyp, class acctyp>
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu,
const int first_gpu_id, const int gpu_mode,
const double p_split, const int t_per_atom,
const double user_cell_size, char *ocl_args,
@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
}
_ocl_compile_string="-cl-mad-enable ";
#ifdef CL_VERSION_2_0
_ocl_compile_string+="-cl-std=CL2.0 ";
#endif
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
std::string(OCL_PRECISION_COMPILE);
@ -438,7 +441,7 @@ template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int nall, const int maxspecial,
const bool vel) {
const bool vel, const int extra_fields) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision())
@ -467,7 +470,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields))
return -3;
_data_in_estimate++;
@ -477,6 +480,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_data_in_estimate++;
if (vel)
_data_in_estimate++;
if (extra_fields>0)
_data_in_estimate++;
} else {
if (!atom.charge() && charge)
_data_in_estimate++;
@ -484,7 +490,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_data_in_estimate++;
if (!atom.velocity() && vel)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
if (atom.using_extra() && extra_fields>0)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields))
return -3;
}
@ -520,7 +528,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
template <class numtyp, class acctyp>
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
const int host_nlocal, const int nall,
const int host_nlocal, const int /*nall*/,
const int maxspecial, const int gpu_host,
const int max_nbors, const double cutoff,
const bool pre_cut, const int threads_per_atom,

View File

@ -61,6 +61,7 @@ class Device {
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum number of special bonded atoms per atom
* \param vel True if velocities need to be stored
* \param extra_fields Nonzero if extra fields need to be stored
*
* Returns:
* - 0 if successful
@ -70,7 +71,7 @@ class Device {
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
const int nlocal, const int nall, const int maxspecial,
const bool vel=false);
const bool vel=false, const int extra_fields=0);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for

View File

@ -28,10 +28,10 @@ static DPD<PRECISION,ACC_PRECISION> DPDTMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0,
double **host_gamma, double **host_sigma, double **host_cut,
double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
double **host_gamma, double **host_sigma, double **host_cut,
double *special_lj, const int inum,
const int nall, const int /*max_nbors*/, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
DPDTMF.clear();
gpu_mode=DPDTMF.device->gpu_mode();
double gpu_split=DPDTMF.device->particle_split();

View File

@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
const bool /*eatom*/, const bool /*vatom*/,
int &host_start, const double cpu_time,
bool &success, void **fp_ptr) {
this->acc_timers();
@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag_in,
const bool vflag_in, const bool eatom,
const bool vatom, int &host_start, int **ilist, int **jnum,
const bool vflag_in, const bool /*eatom*/,
const bool /*vatom*/, int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success, int &inum,
void **fp_ptr) {
this->acc_timers();

641
lib/gpu/lal_hippo.cpp Normal file
View File

@ -0,0 +1,641 @@
/***************************************************************************
hippo.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the hippo pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#if defined(USE_OPENCL)
#include "hippo_cl.h"
#elif defined(USE_CUDART)
const char *hippo=0;
#else
#include "hippo_cubin.h"
#endif
#include "lal_hippo.h"
#include <cassert>
namespace LAMMPS_AL {
#define HippoT Hippo<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
// Construct an uninitialized HIPPO accelerator.  No device resources are
// touched here; everything is allocated later in init().
template <class numtyp, class acctyp>
HippoT::Hippo() : BaseAmoeba<numtyp,acctyp>(), _allocated(false) {}
// Destructor: release the coefficient tables via clear(), then drop the two
// kernels that are specific to HIPPO (the base class does not own them).
template <class numtyp, class acctyp>
HippoT::~Hippo() {
  clear();
  k_repulsion.clear();
  k_dispersion.clear();
}
// Per-atom device memory estimate; HIPPO adds nothing beyond the atomic
// base-class storage, so simply forward the request.
template <class numtyp, class acctyp>
int HippoT::bytes_per_atom(const int max_nbors) const {
  const int base_bytes = this->bytes_per_atom_atomic(max_nbors);
  return base_bytes;
}
// Initialize the HIPPO accelerator: set up the device and the kernels shared
// with AMOEBA through init_atomic(), load the two HIPPO-specific kernels,
// and upload all per-amtype/per-amclass coefficient tables and the
// special-neighbor scale factors to read-only device memory.
// Returns 0 on success, otherwise the error code from init_atomic().
//
// NOTE(review): the declaration in lal_hippo.h names these three pointers in
// the order special_mpole, special_repel, special_disp, while this definition
// uses special_repel, special_disp, special_mpole.  All three are
// const double *, so the signatures still match, but confirm that callers
// pass the arrays in the order consumed here.
template <class numtyp, class acctyp>
int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
                 const double *host_pdamp, const double *host_thole,
                 const double *host_dirdamp, const int *host_amtype2class,
                 const double *host_special_repel, const double *host_special_disp,
                 const double *host_special_mpole,
                 const double *host_special_polar_wscale,
                 const double *host_special_polar_piscale,
                 const double *host_special_polar_pscale,
                 const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
                 const double *host_csix, const double *host_adisp,
                 const double *host_pcore, const double *host_palpha,
                 const int nlocal, const int nall, const int max_nbors,
                 const int maxspecial, const int maxspecial15,
                 const double cell_size, const double gpu_split, FILE *_screen,
                 const double polar_dscale, const double polar_uscale) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
                            cell_size,gpu_split,_screen,hippo,
                            "k_hippo_multipole", "k_hippo_udirect2b",
                            "k_hippo_umutual2b", "k_hippo_polar",
                            "k_hippo_fphi_uind", "k_hippo_fphi_mpole",
                            "k_hippo_short_nbor", "k_hippo_special15");
  if (success!=0)
    return success;

  // kernels specific to HIPPO (not part of the shared AMOEBA set)
  k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion");
  k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion");

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);

  // per-amtype damping parameters; .w carries the amtype -> amclass map
  for (int i = 0; i < max_amtype; i++) {
    host_write[i].x = host_pdamp[i];
    host_write[i].y = host_thole[i];
    host_write[i].z = host_dirdamp[i];
    host_write[i].w = host_amtype2class[i];
  }
  coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
  ucl_copy(coeff_amtype,host_write,false);

  // per-amtype Pauli repulsion parameters (host_write buffer is reused)
  for (int i = 0; i < max_amtype; i++) {
    host_write[i].x = host_sizpr[i];
    host_write[i].y = host_dmppr[i];
    host_write[i].z = host_elepr[i];
    host_write[i].w = (numtyp)0;
  }
  coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
  ucl_copy(coeff_rep,host_write,false);

  // per-amclass dispersion / charge-penetration parameters
  UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
  for (int i = 0; i < max_amclass; i++) {
    host_write2[i].x = host_csix[i];
    host_write2[i].y = host_adisp[i];
    host_write2[i].z = host_pcore[i];
    host_write2[i].w = host_palpha[i];
  }
  coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
  ucl_copy(coeff_amclass,host_write2,false);

  // special-neighbor scale factors for the polar/multipole kernels
  UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
  sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<5; i++) {
    dview[i].x=host_special_polar_wscale[i];
    dview[i].y=host_special_polar_piscale[i];
    dview[i].z=host_special_polar_pscale[i];
    dview[i].w=host_special_mpole[i];
  }
  ucl_copy(sp_polar,dview,5,false);

  // scale factors consumed by the repulsion and dispersion kernels
  sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<5; i++) {
    dview[i].x=host_special_repel[i];
    dview[i].y=host_special_disp[i];
    dview[i].z=(numtyp)0;
    dview[i].w=(numtyp)0;
  }
  ucl_copy(sp_nonpolar,dview,5,false);

  _polar_dscale = polar_dscale;
  _polar_uscale = polar_uscale;

  _allocated=true;
  // record the device memory footprint reported by the GPU package
  this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes()
    + coeff_amclass.row_bytes() + sp_polar.row_bytes()
    + sp_nonpolar.row_bytes() + this->_tep.row_bytes()
    + this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
    + this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
    + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
  return 0;
}
// Free the device-side coefficient tables and base-class storage.
// Idempotent: does nothing unless init() completed successfully.
template <class numtyp, class acctyp>
void HippoT::clear() {
  if (_allocated) {
    _allocated=false;
    coeff_amtype.clear();
    coeff_rep.clear();
    coeff_amclass.clear();
    sp_polar.clear();
    sp_nonpolar.clear();
    this->clear_atomic();
  }
}
// Host memory consumed by this style: the base-class accounting plus the
// size of this object itself.
template <class numtyp, class acctyp>
double HippoT::host_memory_usage() const {
  double bytes = sizeof(Hippo<numtyp,acctyp>);
  bytes += this->host_memory_usage_atomic();
  return bytes;
}
// ---------------------------------------------------------------------------
// Compute the repulsion term, returning tep
// ---------------------------------------------------------------------------
// Compute the Pauli repulsion term on the device and expose the resulting
// torque (tep) array to the host through tep_ptr.  Most parameters are
// unused here (commented out) because neighboring and data upload happen in
// an earlier precompute step; this entry point consumes only the
// energy/virial flags, the repulsion cutoff, and the taper coefficients.
template <class numtyp, class acctyp>
void HippoT::compute_repulsion(const int /*ago*/, const int inum_full,
                               const int /*nall*/, double ** /*host_x*/,
                               int * /*host_type*/, int * /*host_amtype*/,
                               int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/,
                               int ** /*nspecial*/, tagint ** /*special*/,
                               int * /*nspecial15*/, tagint ** /*special15*/,
                               const bool eflag_in, const bool vflag_in,
                               const bool eatom, const bool vatom,
                               int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                               const double /*cpu_time*/, bool & /*success*/,
                               const double /*aewald*/, const double off2_repulse,
                               double * /*host_q*/, double * /*boxlo*/, double * /*prd*/,
                               double cut2, double c0, double c1, double c2,
                               double c3, double c4, double c5, void **tep_ptr) {
  this->acc_timers();
  // translate energy/virial requests into kernel flag values:
  // 2 = per-atom accumulation, 1 = global accumulation, 0 = off
  int eflag, vflag;
  if (eatom) eflag=2;
  else if (eflag_in) eflag=1;
  else eflag=0;
  if (vatom) vflag=2;
  else if (vflag_in) vflag=1;
  else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
  if (eflag) eflag=2;
  if (vflag) vflag=2;
#endif

  // NOTE(review): the local eflag/vflag computed above are passed only to
  // set_kernel(); the launch below uses the member flags this->_eflag and
  // this->_vflag (set elsewhere, presumably in precompute) -- confirm that
  // this divergence is intentional.
  this->set_kernel(eflag,vflag);

  // ------------------- Resize _tep array ------------------------
  // grow with a 10% cushion to limit the number of reallocations

  if (inum_full>this->_max_tep_size) {
    this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    this->_tep.resize(this->_max_tep_size*4);
  }
  *tep_ptr=this->_tep.host.begin();

  // stash repulsion cutoff and taper coefficients for the kernel launch
  this->_off2_repulse = off2_repulse;
  _cut2 = cut2;
  _c0 = c0;
  _c1 = c1;
  _c2 = c2;
  _c3 = c3;
  _c4 = c4;
  _c5 = c5;
  repulsion(this->_eflag,this->_vflag);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Launch the repulsion kernel
// ---------------------------------------------------------------------------
// Launch the repulsion kernel.  First builds the short neighbor list for
// the repulsion cutoff _off2_repulse (repulsion is the first kernel of a
// HIPPO time step at this point), then runs k_repulsion.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::repulsion(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff _off2_repulse;
  // at this point repulsion is the first kernel in a time step for HIPPO
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &this->_off2_repulse, &ainum,
                         &nbor_pitch, &this->_threads_per_atom);

  k_repulsion.set_size(GX,BX);
  k_repulsion.run(&this->atom->x, &this->atom->extra,
                  &coeff_rep, &sp_nonpolar,
                  &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                  &this->dev_short_nbor,
                  &this->ans->force, &this->ans->engv, &this->_tep,
                  &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                  &this->_threads_per_atom, &this->_aewald,
                  &this->_off2_repulse, &_cut2,
                  &_c0, &_c1, &_c2, &_c3, &_c4, &_c5);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Compute dispersion real-space
// ---------------------------------------------------------------------------
// Real-space dispersion: upload the per-atom multipole data the kernel
// needs, record the Ewald parameter and cutoff, and launch the kernel.
// Forces/energies/virial are intentionally left on the device; they are
// copied back by the last kernel of the time step (polar_real), not here.
template <class numtyp, class acctyp>
void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                     double **host_rpole, const double aewald,
                                     const double off2_disp) {
  // only amtype/amgroup/rpole are required; the remaining extra-data
  // slots (uind/uinp/pval) are unused by this kernel
  this->cast_extra_data(host_amtype, host_amgroup, host_rpole,
                        nullptr, nullptr, nullptr);
  this->atom->add_extra_data();

  this->_aewald = aewald;
  this->_off2_disp = off2_disp;

  dispersion_real(this->_eflag,this->_vflag);
}
// ---------------------------------------------------------------------------
// Launch the dispersion real-space kernel
// ---------------------------------------------------------------------------
// Launch the real-space dispersion kernel.  Builds the short neighbor list
// for the dispersion cutoff _off2_disp first, then runs k_dispersion.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::dispersion_real(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff off2_disp,
  // at this point dispersion is the first kernel in a time step
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &this->_off2_disp, &ainum,
                         &nbor_pitch, &this->_threads_per_atom);

  k_dispersion.set_size(GX,BX);
  k_dispersion.run(&this->atom->x, &this->atom->extra,
                   &coeff_amtype, &coeff_amclass, &sp_nonpolar,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                   &this->dev_short_nbor,
                   &this->ans->force, &this->ans->engv,
                   &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                   &this->_threads_per_atom, &this->_aewald,
                   &this->_off2_disp);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Compute the multipole real-space term, returning tep
// ---------------------------------------------------------------------------
// Compute the real-space multipole term and expose the torque (tep) array
// to the host through tep_ptr.  Only host_pval is uploaded here; the other
// commented-out parameters were already transferred by an earlier step.
template <class numtyp, class acctyp>
void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full,
                                    const int /*nall*/, double ** /*host_x*/,
                                    int * /*host_type*/, int * /*host_amtype*/,
                                    int * /*host_amgroup*/, double ** /*host_rpole*/,
                                    double* host_pval, double * /*sublo*/,
                                    double * /*subhi*/, tagint * /*tag*/,
                                    int ** /*nspecial*/, tagint ** /*special*/,
                                    int * /*nspecial15*/, tagint ** /*special15*/,
                                    const bool /*eflag_in*/, const bool /*vflag_in*/,
                                    const bool /*eatom*/, const bool /*vatom*/,
                                    int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                                    const double /*cpu_time*/, bool & /*success*/,
                                    const double aewald, const double felec,
                                    const double off2_mpole, double * /*host_q*/,
                                    double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {
  // upload pval only; the other extra-data slots are already on the device
  this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval);
  this->atom->add_extra_data();

  // ------------------- Resize _tep array ------------------------
  // grow with a 10% cushion to limit the number of reallocations

  if (inum_full>this->_max_tep_size) {
    this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    this->_tep.resize(this->_max_tep_size*4);
  }
  *tep_ptr=this->_tep.host.begin();

  // parameters consumed by the multipole kernel launch
  this->_off2_mpole = off2_mpole;
  this->_felec = felec;
  this->_aewald = aewald;
  multipole_real(this->_eflag,this->_vflag);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Launch the multipole real-space kernel
// ---------------------------------------------------------------------------
// Launch the real-space multipole kernel.  Builds the short neighbor list
// for the multipole cutoff _off2_mpole first, then runs k_multipole.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::multipole_real(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff off2_mpole
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &this->_off2_mpole, &ainum,
                         &nbor_pitch, &this->_threads_per_atom);

  this->k_multipole.set_size(GX,BX);
  this->k_multipole.run(&this->atom->x, &this->atom->extra,
                        &coeff_amtype, &coeff_amclass, &sp_polar,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor,
                        &this->ans->force, &this->ans->engv, &this->_tep,
                        &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                        &this->_threads_per_atom, &this->_aewald, &this->_felec,
                        &this->_off2_mpole, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Compute the direct real space part of the permanent field
// returning field and fieldp
// ---------------------------------------------------------------------------
// Real-space direct contribution to the permanent field.  Uploads the
// induced dipole arrays (uind/uinp) and pval, launches udirect2b, and
// downloads the combined field/fieldp result to the host buffer exposed
// through fieldp_ptr.
template <class numtyp, class acctyp>
void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double **host_uind, double **host_uinp, double* host_pval,
                               const double aewald, const double off2_polar,
                               void** fieldp_ptr) {
  // push the per-atom extra data required by this kernel to the device
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval);
  this->atom->add_extra_data();

  this->_aewald = aewald;
  this->_off2_polar = off2_polar;
  *fieldp_ptr=this->_fieldp.host.begin();

  udirect2b(this->_eflag,this->_vflag);

  // _fieldp stores field followed by fieldp, hence 8 values per atom
  this->_fieldp.update_host(this->_max_fieldp_size*8,false);
}
// ---------------------------------------------------------------------------
// Launch the real-space permanent field kernel
// ---------------------------------------------------------------------------
// Launch the real-space permanent-field kernel (udirect2b).  Builds the
// short neighbor list for the polar cutoff _off2_polar on first use in a
// time step; the list is shared by the subsequent polar-cutoff kernels.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff _off2_polar, if not done yet;
  // this is the first kernel in a time step where _off2_polar is used
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(),
                           &this->dev_short_nbor, &this->_off2_polar, &ainum,
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_udirect2b.set_size(GX,BX);
  this->k_udirect2b.run(&this->atom->x, &this->atom->extra,
                        &coeff_amtype, &coeff_amclass, &sp_polar,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor,
                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
                        &this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
                        &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Compute the direct real space term of the induced field
// returning field and fieldp
// ---------------------------------------------------------------------------
// Real-space direct contribution to the induced field.  Uploads the current
// induced dipoles (uind/uinp) and launches umutual2b.  The field/fieldp
// result is deliberately NOT copied back here; the device-to-host transfer
// is deferred (see update_fieldp()) to overlap it with other work.
template <class numtyp, class acctyp>
void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double **host_uind, double **host_uinp, double * /*host_pval*/,
                               const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) {
  // push the updated induced dipoles to the device
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
  this->atom->add_extra_data();

  this->_aewald = aewald;
  this->_off2_polar = off2_polar;

  umutual2b(this->_eflag,this->_vflag);
}
// ---------------------------------------------------------------------------
// Launch the real-space induced field kernel
// ---------------------------------------------------------------------------
// Launch the real-space induced-field kernel (umutual2b).  Reuses the
// polar-cutoff short neighbor list when available, building it otherwise.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list if not done yet
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(), &this->dev_short_nbor,
                           &this->_off2_polar, &ainum, &nbor_pitch,
                           &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_umutual2b.set_size(GX,BX);
  this->k_umutual2b.run(&this->atom->x, &this->atom->extra,
                        &coeff_amtype, &coeff_amclass, &sp_polar,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
                        &nbor_pitch, &this->_threads_per_atom, &this->_aewald,
                        &this->_off2_polar, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// ---------------------------------------------------------------------------
// Compute the real-space polarization term.  This is the LAST kernel of a
// HIPPO time step, so it is also the point where the accumulated answers
// (forces, energies, virial) are copied back from the device and the answer
// object is queued for the host-side reduction.  The torque (tep) array is
// exposed to the host through tep_ptr.
template <class numtyp, class acctyp>
void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                                double **host_uind, double **host_uinp, double * /*host_pval*/,
                                const bool eflag_in, const bool vflag_in,
                                const bool eatom, const bool vatom,
                                const double aewald, const double felec,
                                const double off2_polar, void **tep_ptr) {
  // push the converged induced dipoles to the device
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
  this->atom->add_extra_data();

  *tep_ptr=this->_tep.host.begin();

  this->_off2_polar = off2_polar;
  this->_felec = felec;
  this->_aewald = aewald;
  const int red_blocks=polar_real(this->_eflag,this->_vflag);

  // only copy answers (forces, energies and virial) back from the device
  // in the last kernel in a timestep (which is polar_real here)
  this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  this->device->add_ans_object(this->ans);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Launch the polar real-space kernel
// ---------------------------------------------------------------------------
// Launch the real-space polarization kernel.  Reuses the polar-cutoff short
// neighbor list when available, and resets the availability flag at the end
// because polar_real is currently the last kernel of a time step.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::polar_real(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
  /*
  const int cus = this->device->gpu->cus();
  while (GX < cus && GX > 1) {
    BX /= 2;
    GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
  }
  */
  this->time_pair.start();

  // Build the short neighbor list if not done yet
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(),
                           &this->dev_short_nbor, &this->_off2_polar, &ainum,
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_polar.set_size(GX,BX);
  this->k_polar.run(&this->atom->x, &this->atom->extra,
                    &coeff_amtype, &coeff_amclass, &sp_polar,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                    &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv, &this->_tep,
                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                    &this->_threads_per_atom, &this->_aewald, &this->_felec,
                    &this->_off2_polar, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  // Signal that short nbor list is not avail for the next time step;
  // do it here because polar_real() is the last kernel in a time step at this point
  this->short_nbor_polar_avail = false;

  return GX;
}
template class Hippo<PRECISION,ACC_PRECISION>;
}

2519
lib/gpu/lal_hippo.cu Normal file

File diff suppressed because it is too large Load Diff

166
lib/gpu/lal_hippo.h Normal file
View File

@ -0,0 +1,166 @@
/***************************************************************************
hippo.h
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the hippo pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_HIPPO_H
#define LAL_HIPPO_H
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Hippo : public BaseAmoeba<numtyp, acctyp> {
 public:
  Hippo();
  ~Hippo();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
   * \param cell_size cutoff + skin
   * \param gpu_split fraction of particles handled by device
   *
   * Returns:
   * - 0 if successful
   * - -1 if fix gpu not found
   * - -3 if there is an out of memory error
   * - -4 if the GPU library was not compiled for GPU
   * - -5 Double precision is not supported on card
   *
   * NOTE(review): the special_* parameters are declared here in the order
   * (mpole, repel, disp), while the wrapper in lal_hippo_ext.cpp passes
   * (repel, disp, mpole); all are const double* so this compiles either
   * way -- verify against the Hippo::init definition in lal_hippo.cpp. **/
  int init(const int ntypes, const int max_amtype, const int max_amclass,
           const double *host_pdamp, const double *host_thole,
           const double *host_dirdamp, const int *host_amtype2class,
           const double *host_special_mpole,
           const double *host_special_repel,
           const double *host_special_disp,
           const double *host_special_polar_wscale,
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
           const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
           const double *host_csix, const double *host_adisp,
           const double *host_pcore, const double *host_palpha,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const int maxspecial15, const double cell_size,
           const double gpu_split, FILE *_screen,
           const double polar_dscale, const double polar_uscale);

  /// Compute repulsion with device neighboring
  virtual void compute_repulsion(const int ago, const int inum_full,
                                 const int nall, double **host_x,
                                 int *host_type, int *host_amtype,
                                 int *host_amgroup, double **host_rpole,
                                 double *sublo, double *subhi, tagint *tag,
                                 int **nspecial, tagint **special,
                                 int *nspecial15, tagint **special15,
                                 const bool eflag_in, const bool vflag_in,
                                 const bool eatom, const bool vatom,
                                 int &host_start, int **ilist, int **jnum,
                                 const double cpu_time, bool &success,
                                 const double aewald, const double off2_repulse,
                                 double *host_q, double *boxlo, double *prd,
                                 double cut2, double c0, double c1, double c2,
                                 double c3, double c4, double c5,void** tep_ptr);

  /// Compute dispersion real-space with device neighboring
  virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                       double **host_rpole, const double aewald,
                                       const double off2_disp);

  /// Compute multipole real-space with device neighboring
  virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
                                      double **host_x, int *host_type, int *host_amtype,
                                      int *host_amgroup, double **host_rpole, double *host_pval,
                                      double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special,
                                      int *nspecial15, tagint **special15,
                                      const bool eflag, const bool vflag,
                                      const bool eatom, const bool vatom, int &host_start,
                                      int **ilist, int **numj, const double cpu_time, bool &success,
                                      const double aewald, const double felec, const double off2_mpole, double *charge,
                                      double *boxlo, double *prd, void **tep_ptr);

  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
  virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double* host_pval,
                                 const double aewald, const double off2_polar, void** fieldp_ptr);

  /// Compute the real space part of the induced field (umutual2b) with device neighboring
  virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2_polar,
                                 void** fieldp_ptr);

  /// Compute polar real-space with device neighboring
  virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag_in, const bool vflag_in,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2_polar,
                                  void **tep_ptr);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
  /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
  UCL_D_Vec<numtyp4> coeff_amtype;
  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
  UCL_D_Vec<numtyp4> coeff_amclass;
  /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z;
  UCL_D_Vec<numtyp4> coeff_rep;
  /// Special polar values [0-4]:
  /// sp_polar.x = special_polar_wscale
  /// sp_polar.y = special_polar_pscale
  /// sp_polar.z = special_polar_piscale
  /// sp_polar.w = special_mpole
  UCL_D_Vec<numtyp4> sp_polar;
  /// Special nonpolar values [0-4]:
  /// sp_nonpolar.x = special_hal
  /// sp_nonpolar.y = special_repel
  /// sp_nonpolar.z = special_disp
  UCL_D_Vec<numtyp4> sp_nonpolar;
  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;
  /// Number of atom types
  int _lj_types;
  numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5;
  numtyp _polar_dscale, _polar_uscale;
  numtyp _qqrd2e;
  UCL_Kernel k_repulsion, k_dispersion;

 protected:
  bool _allocated;
  // Per-term real-space kernel launchers; each returns the grid size (GX)
  // used for the launch (see definitions in lal_hippo.cpp).
  int repulsion(const int eflag, const int vflag);
  int dispersion_real(const int eflag, const int vflag);
  int multipole_real(const int eflag, const int vflag);
  int udirect2b(const int eflag, const int vflag);
  int umutual2b(const int eflag, const int vflag);
  int polar_real(const int eflag, const int vflag);
};
}
#endif

231
lib/gpu/lal_hippo_ext.cpp Normal file
View File

@ -0,0 +1,231 @@
/***************************************************************************
hippo_ext.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Functions for LAMMPS access to hippo acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include <iostream>
#include <cassert>
#include <cmath>
#include "lal_hippo.h"
using namespace std;
using namespace LAMMPS_AL;
static Hippo<PRECISION,ACC_PRECISION> HIPPOMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
// Initialize the file-static HIPPOMF accelerator instance for a new run:
// clear old state, initialize on MPI world rank 0 first (so the kernel
// compilation message is printed once), then on all remaining ranks one
// GPU-sharing process at a time.  Returns the init() status code
// (0 on success; negative on failure -- see Hippo::init in lal_hippo.h).
//
// NOTE(review): this wrapper declares and forwards the special_* arrays in
// the order (repel, disp, mpole), while lal_hippo.h declares Hippo::init
// with (mpole, repel, disp).  All are const double*, so no compile error
// results either way -- confirm against the Hippo::init definition.
int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                   const double *host_pdamp, const double *host_thole,
                   const double *host_dirdamp, const int *host_amtype2class,
                   const double *host_special_repel,
                   const double *host_special_disp,
                   const double *host_special_mpole,
                   const double *host_special_polar_wscale,
                   const double *host_special_polar_piscale,
                   const double *host_special_polar_pscale,
                   const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
                   const double *host_csix, const double *host_adisp,
                   const double *host_pcore, const double *host_palpha,
                   const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const int maxspecial15,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   const double polar_dscale, const double polar_uscale) {
  HIPPOMF.clear();
  gpu_mode=HIPPOMF.device->gpu_mode();
  double gpu_split=HIPPOMF.device->particle_split();
  int first_gpu=HIPPOMF.device->first_device();
  int last_gpu=HIPPOMF.device->last_device();
  int world_me=HIPPOMF.device->world_me();
  int gpu_rank=HIPPOMF.device->gpu_rank();
  int procs_per_gpu=HIPPOMF.device->procs_per_gpu();

  HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu);

  // only replica rank 0 with a valid screen handle prints progress messages
  bool message=false;
  if (HIPPOMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // world rank 0 initializes (and compiles kernels) first ...
  int init_ok=0;
  if (world_me==0)
    init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
                         host_pdamp, host_thole, host_dirdamp,
                         host_amtype2class, host_special_repel, host_special_disp,
                         host_special_mpole, host_special_polar_wscale,
                         host_special_polar_piscale, host_special_polar_pscale,
                         host_sizpr, host_dmppr, host_elepr,
                         host_csix, host_adisp, host_pcore, host_palpha,
                         nlocal, nall, max_nbors,
                         maxspecial, maxspecial15, cell_size, gpu_split,
                         screen, polar_dscale, polar_uscale);

  HIPPOMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // ... then the remaining ranks, serialized over processes sharing a GPU
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
                           host_pdamp, host_thole, host_dirdamp,
                           host_amtype2class, host_special_repel, host_special_disp,
                           host_special_mpole, host_special_polar_wscale,
                           host_special_polar_piscale, host_special_polar_pscale,
                           host_sizpr, host_dmppr, host_elepr,
                           host_csix, host_adisp, host_pcore, host_palpha,
                           nlocal, nall, max_nbors,
                           maxspecial, maxspecial15, cell_size, gpu_split,
                           screen, polar_dscale, polar_uscale);

    HIPPOMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    HIPPOMF.estimate_gpu_overhead();
  return init_ok;
}
// Free all host and device data held by the static HIPPOMF instance.
void hippo_gpu_clear() {
  HIPPOMF.clear();
}
// Forward to Hippo::precompute: cast positions/types to the device and
// (re)build the neighbor list when needed.  The induced-dipole arrays
// (host_uind/host_uinp/host_pval) are intentionally unused here and nullptr
// is forwarded in their place.  Returns the neighbor-list handle from
// precompute().
int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
                           int *host_amgroup, double **host_rpole,
                           double ** /*host_uind*/, double ** /*host_uinp*/, double * /*host_pval*/,
                           double *sublo, double *subhi, tagint *tag,
                           int **nspecial, tagint **special,
                           int *nspecial15, tagint **special15,
                           const bool eflag_in, const bool vflag_in,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, double *host_q, double *boxlo, double *prd) {
  return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type,
                            host_amtype, host_amgroup, host_rpole,
                            nullptr, nullptr, nullptr, sublo, subhi, tag,
                            nspecial, special, nspecial15, special15,
                            eflag_in, vflag_in, eatom, vatom,
                            host_start, ilist, jnum, cpu_time,
                            success, host_q, boxlo, prd);
}
// Forward to Hippo::compute_repulsion (Pauli repulsion term); tep_ptr
// receives the device pointer for the torque-per-atom output.
void hippo_gpu_compute_repulsion(const int ago, const int inum_full,
                                 const int nall, double **host_x, int *host_type,
                                 int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double *sublo, double *subhi, tagint *tag, int **nspecial,
                                 tagint **special, int *nspecial15, tagint** special15,
                                 const bool eflag, const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
                                 int **ilist, int **jnum, const double cpu_time,
                                 bool &success, const double aewald, const double off2,
                                 double *host_q, double *boxlo, double *prd,
                                 double cut2, double c0, double c1, double c2,
                                 double c3, double c4, double c5, void **tep_ptr) {
  HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type,
                            host_amtype, host_amgroup, host_rpole, sublo, subhi,
                            tag, nspecial, special, nspecial15, special15,
                            eflag, vflag, eatom, vatom, host_start, ilist, jnum,
                            cpu_time, success, aewald, off2, host_q, boxlo, prd,
                            cut2, c0, c1, c2, c3, c4, c5, tep_ptr);
}
// Forward to Hippo::compute_dispersion_real (real-space dispersion term).
void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                       double **host_rpole, const double aewald,
                                       const double off2) {
  HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole,
                                  aewald, off2);
}
// Forward to Hippo::compute_multipole_real (real-space multipole term);
// tep_ptr receives the device pointer for the torque-per-atom output.
void hippo_gpu_compute_multipole_real(const int ago, const int inum_full,
              const int nall, double **host_x, int *host_type,
              int *host_amtype, int *host_amgroup, double **host_rpole,
              double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial,
              tagint **special, int *nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom,
              const bool vatom, int &host_start,
              int **ilist, int **jnum, const double cpu_time,
              bool &success, const double aewald, const double felec, const double off2,
              double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
              host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi,
              tag, nspecial, special, nspecial15, special15,
              eflag, vflag, eatom, vatom, host_start, ilist, jnum,
              cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
}
// Forward to Hippo::compute_udirect2b: real-space part of the permanent
// (direct) field; fieldp_ptr receives the device field/fieldp pointer.
void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2, void **fieldp_ptr) {
  HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole,
                            host_uind, host_uinp, host_pval,
                            aewald, off2, fieldp_ptr);
}
// Forward to Hippo::compute_umutual2b: real-space part of the induced
// (mutual) field; fieldp_ptr receives the device field/fieldp pointer.
void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2, void **fieldp_ptr) {
  HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
                            aewald, off2, fieldp_ptr);
}
// Hand the caller the current device field/fieldp buffer via fieldp_ptr.
void hippo_gpu_update_fieldp(void **fieldp_ptr) {
  HIPPOMF.update_fieldp(fieldp_ptr);
}
// Forward to Hippo::compute_polar_real (real-space polarization term);
// tep_ptr receives the device pointer for the torque-per-atom output.
void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag_in, const bool vflag_in,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2,
                                  void **tep_ptr) {
  HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
                             eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
// Forward to Hippo::precompute_kspace: upload the B-spline coefficient
// tables (thetai1..3), the per-atom grid indices, and the owned-grid
// bounds used by the k-space (PME-style) part of the computation.
void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
                                 const int nzlo_out, const int nzhi_out,
                                 const int nylo_out, const int nyhi_out,
                                 const int nxlo_out, const int nxhi_out) {
  HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2,
                            host_thetai3, igrid, nzlo_out, nzhi_out,
                            nylo_out, nyhi_out, nxlo_out, nxhi_out);
}
// Forward to Hippo::compute_fphi_uind: interpolate induced-dipole
// potentials from the grid; output pointers are returned via the
// host_fdip_* arguments.
void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
                         void **host_fdip_phi2, void **host_fdip_sum_phi) {
  HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi);
}
// Report total host memory (bytes) used by the HIPPO accelerator instance.
double hippo_gpu_bytes() {
  return HIPPOMF.host_memory_usage();
}

431
lib/gpu/lal_hippo_extra.h Normal file
View File

@ -0,0 +1,431 @@
/// **************************************************************************
// hippo_extra.h
// -------------------
// Trung Dac Nguyen
//
// Device code for hippo math routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifndef LAL_HIPPO_EXTRA_H
#define LAL_HIPPO_EXTRA_H
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif
#define MY_PI2 (numtyp)1.57079632679489661923
#define MY_PI4 (numtyp)0.78539816339744830962
/* ----------------------------------------------------------------------
damprep generates coefficients for the Pauli repulsion
damping function for powers of the interatomic distance
literature reference:
J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An
Anisotropic, Atomic Multipole Model", Journal of Chemical Physics,
150, 084104 (2019)
------------------------------------------------------------------------- */
// Generate Pauli-repulsion damping coefficients dmpik[0,2,4,6,8(,10)] for
// powers of the interatomic distance r.  rr1..rr11 are precomputed inverse
// odd powers of r; rorder selects whether the 11th-order term is needed.
// dmpi/dmpk are the per-atom damping exponents (see Rackers & Ponder,
// J. Chem. Phys. 150, 084104 (2019)).
ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
                        const numtyp rr3, const numtyp rr5, const numtyp rr7,
                        const numtyp rr9, const numtyp rr11, const int rorder,
                        const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11])
{
  numtyp r3,r4;
  numtyp r5,r6,r7,r8;
  numtyp s,ds,d2s;
  numtyp d3s,d4s,d5s;
  numtyp dmpi2,dmpk2;
  numtyp dmpi22,dmpi23;
  numtyp dmpi24,dmpi25;
  numtyp dmpi26,dmpi27;
  numtyp dmpk22,dmpk23;
  numtyp dmpk24,dmpk25;
  numtyp dmpk26;
  numtyp eps,diff;
  numtyp expi,expk;
  numtyp dampi,dampk;
  numtyp pre,term,tmp;

  // d5s is only assigned when rorder >= 11, but it is unconditionally
  // multiplied by rr11 below; initialize it so that multiply never reads
  // an indeterminate value (the product is unused when rorder < 11).
  d5s = (numtyp)0.0;

  // compute tolerance value for damping exponents
  eps = (numtyp)0.001;
  diff = dmpi-dmpk; // fabs(dmpi-dmpk)
  if (diff < (numtyp)0) diff = -diff;

  // treat the case where alpha damping exponents are equal
  if (diff < eps) {
    r3 = r2 * r;
    r4 = r3 * r;
    r5 = r4 * r;
    r6 = r5 * r;
    r7 = r6 * r;
    dmpi2 = (numtyp)0.5 * dmpi;
    dampi = dmpi2 * r;
    expi = ucl_exp(-dampi);
    dmpi22 = dmpi2 * dmpi2;
    dmpi23 = dmpi22 * dmpi2;
    dmpi24 = dmpi23 * dmpi2;
    dmpi25 = dmpi24 * dmpi2;
    dmpi26 = dmpi25 * dmpi2;
    pre = (numtyp)128.0;
    s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi;
    ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0;
    d2s = dmpi24 * expi * r5 / (numtyp)9.0;
    d3s = dmpi25 * expi * r6 / (numtyp)45.0;
    d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0;
    if (rorder >= 11) {
      r8 = r7 * r;
      dmpi27 = dmpi2 * dmpi26;
      d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0;
    }

  // treat the case where alpha damping exponents are unequal
  } else {
    r3 = r2 * r;
    r4 = r3 * r;
    r5 = r4 * r;
    dmpi2 = (numtyp)0.5 * dmpi;
    dmpk2 = (numtyp)0.5 * dmpk;
    dampi = dmpi2 * r;
    dampk = dmpk2 * r;
    expi = ucl_exp(-dampi);
    expk = ucl_exp(-dampk);
    dmpi22 = dmpi2 * dmpi2;
    dmpi23 = dmpi22 * dmpi2;
    dmpi24 = dmpi23 * dmpi2;
    dmpi25 = dmpi24 * dmpi2;
    dmpk22 = dmpk2 * dmpk2;
    dmpk23 = dmpk22 * dmpk2;
    dmpk24 = dmpk23 * dmpk2;
    dmpk25 = dmpk24 * dmpk2;
    term = dmpi22 - dmpk22;
    pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0);
    tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term;
    s = (dampi-tmp)*expk + (dampk+tmp)*expi;
    ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term -
          (numtyp)4.0*dmpi2*dmpk2/term) * expk +
      (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
    // NOTE: literals below are cast to numtyp so single-precision builds
    // do not silently promote to double arithmetic
    d2s = (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 -
           ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
           (numtyp)4.0*dmpi2*dmpk2/term) * expk +
      (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 +
       ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
       (numtyp)4.0*dmpi2*dmpk2/term) * expi;
    d3s = (dmpi2*dmpk23*r4/(numtyp)15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 -
           ((numtyp)4.0/(numtyp)15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term -
           (numtyp)4.0*dmpi2*dmpk22*r/term - (numtyp)4.0/term*dmpi2*dmpk2) * expk +
      (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 +
       ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term +
       (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi;
    d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 +
           dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 -
           ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/(numtyp)21.0)*dmpi2*dmpk24*r3/term -
           ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
           (numtyp)4.0*dmpi2*dmpk2/term) * expk +
      (dmpi24*dmpk2*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 +
       dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 +
       ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term +
       ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
       (numtyp)4.0*dmpi2*dmpk2/term) * expi;
    if (rorder >= 11) {
      r6 = r5 * r;
      dmpi26 = dmpi25 * dmpi2;
      dmpk26 = dmpk25 * dmpk2;
      d5s = (dmpi2*dmpk25*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi2*dmpk24*r5 +
             dmpi2*dmpk23*r4/(numtyp)21.0 + dmpi2*dmpk22*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 -
             ((numtyp)4.0/(numtyp)945.0)*dmpi2*dmpk26*r5/term -
             ((numtyp)4.0/(numtyp)63.0)*dmpi2*dmpk25*r4/term - ((numtyp)4.0/(numtyp)9.0)*dmpi2*dmpk24*r3/term -
             ((numtyp)16.0/(numtyp)9.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
             (numtyp)4.0*dmpi2*dmpk2/term) * expk +
        (dmpi25*dmpk2*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi24*dmpk2*r5 +
         dmpi23*dmpk2*r4/(numtyp)21.0 + dmpi22*dmpk2*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 +
         ((numtyp)4.0/(numtyp)945.0)*dmpi26*dmpk2*r5/term + ((numtyp)4.0/(numtyp)63.0)*dmpi25*dmpk2*r4/term +
         ((numtyp)4.0/(numtyp)9.0)*dmpi24*dmpk2*r3/term + ((numtyp)16.0/(numtyp)9.0)*dmpi23*dmpk2*r2/term +
         (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
    }
  }

  // convert partial derivatives into full derivatives
  s = s * rr1;
  ds = ds * rr3;
  d2s = d2s * rr5;
  d3s = d3s * rr7;
  d4s = d4s * rr9;
  d5s = d5s * rr11;
  dmpik[0] = (numtyp)0.5 * pre * s * s;
  dmpik[2] = pre * s * ds;
  dmpik[4] = pre * (s*d2s + ds*ds);
  dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s);
  dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s);
  if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s);
}
/* ----------------------------------------------------------------------
damppole generates coefficients for the charge penetration
damping function for powers of the interatomic distance
literature references:
L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the
Effective Fragment Potential Method: Theory and Application to
the Benzene Dimer", Journal of Computational Chemistry, 28,
276-291 (2007) [Gordon f1 and f2 models]
J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and
J. W. Ponder, "An Optimized Charge Penetration Model for Use with
the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19,
276-291 (2017)
------------------------------------------------------------------------- */
// Generate charge-penetration damping coefficients for powers of the
// interatomic distance r: dmpi[]/dmpk[] for core-valence terms and
// dmpik[] for valence-valence terms (Gordon f1 model).  alphai/alphak
// are the per-atom damping exponents; rorder >= 11 also fills dmpik[10].
// Fix vs. original: several floating-point literals were missing the
// (numtyp) cast, forcing double-precision arithmetic in single-precision
// builds; they are now cast consistently with the rest of the file.
ucl_inline void damppole(const numtyp r, const int rorder,
                         const numtyp alphai, const numtyp alphak,
                         numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11])
{
  numtyp termi,termk;
  numtyp termi2,termk2;
  numtyp alphai2,alphak2;
  numtyp eps,diff;
  numtyp expi,expk;
  numtyp dampi,dampk;
  numtyp dampi2,dampi3;
  numtyp dampi4,dampi5;
  numtyp dampi6,dampi7;
  numtyp dampi8;
  numtyp dampk2,dampk3;
  numtyp dampk4,dampk5;
  numtyp dampk6;

  // compute tolerance and exponential damping factors
  eps = (numtyp)0.001;
  diff = alphai-alphak;
  if (diff < (numtyp)0) diff = -diff;
  dampi = alphai * r;
  dampk = alphak * r;
  expi = ucl_exp(-dampi);
  expk = ucl_exp(-dampk);

  // core-valence charge penetration damping for Gordon f1
  dampi2 = dampi * dampi;
  dampi3 = dampi * dampi2;
  dampi4 = dampi2 * dampi2;
  dampi5 = dampi2 * dampi3;
  dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi;
  dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
  dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
  dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
  dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                           (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi;
  if (diff < eps) {
    // equal exponents: k-coefficients are identical to the i-coefficients
    dmpk[0] = dmpi[0];
    dmpk[2] = dmpi[2];
    dmpk[4] = dmpi[4];
    dmpk[6] = dmpi[6];
    dmpk[8] = dmpi[8];
  } else {
    dampk2 = dampk * dampk;
    dampk3 = dampk * dampk2;
    dampk4 = dampk2 * dampk2;
    dampk5 = dampk2 * dampk3;
    dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk;
    dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
    dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
    dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk;
    dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
                             (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk;
  }

  // valence-valence charge penetration damping for Gordon f1
  if (diff < eps) {
    dampi6 = dampi3 * dampi3;
    dampi7 = dampi3 * dampi4;
    dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 +
                              dampi3/(numtyp)48.0)*expi;
    dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
                              (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi;
    dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                              dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
    dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                              dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi;
    dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                              dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
                              dampi7/(numtyp)5040.0)*expi;
    if (rorder >= 11) {
      dampi8 = dampi4 * dampi4;
      dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                                 dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
                                 dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi;
    }

  } else {
    alphai2 = alphai * alphai;
    alphak2 = alphak * alphak;
    termi = alphak2 / (alphak2-alphai2);
    termk = alphai2 / (alphai2-alphak2);
    termi2 = termi * termi;
    termk2 = termk * termk;
    dmpik[0] = (numtyp)1.0 - termi2*((numtyp)1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi -
      termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk;
    dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
    dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk;
    dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
                                     dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi -
      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk;
    dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                                     (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi -
      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
              (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 +
                                (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 +
                                (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk;
    if (rorder >= 11) {
      dampi6 = dampi3 * dampi3;
      dampk6 = dampk3 * dampk3;
      dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                                        (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 +
                                        dampi6/(numtyp)1890.0)*expi -
        termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 +
                (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk -
        (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 +
                                  dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi -
        (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 +
                                  dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk;
    }
  }
}
/* ----------------------------------------------------------------------
dampdir = direct field damping coefficients
dampdir generates coefficients for the direct field damping
function for powers of the interatomic distance
------------------------------------------------------------------------- */
// Generate direct-field damping coefficients dmpi[2,4,6] and dmpk[2,4,6]
// for powers of the interatomic distance r (Gordon f1 model, HIPPO).
// When the exponents are (nearly) equal, the k-coefficients are copied
// from the i-coefficients.
// Fix vs. original: dampk4/30.0 was missing the (numtyp) cast, forcing a
// double-precision divide in single-precision builds.
ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk)
{
  numtyp eps,diff;
  numtyp expi,expk;
  numtyp dampi,dampk;
  numtyp dampi2,dampk2;
  numtyp dampi3,dampk3;
  numtyp dampi4,dampk4;

  // compute tolerance and exponential damping factors
  eps = (numtyp)0.001;
  diff = alphai-alphak; // fabs(alphai-alphak);
  if (diff < (numtyp)0) diff = -diff;
  dampi = alphai * r;
  dampk = alphak * r;
  expi = ucl_exp(-dampi);
  expk = ucl_exp(-dampk);

  // core-valence charge penetration damping for Gordon f1 (HIPPO)
  dampi2 = dampi * dampi;
  dampi3 = dampi * dampi2;
  dampi4 = dampi2 * dampi2;
  dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
  dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
  dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
  if (diff < eps) {
    dmpk[2] = dmpi[2];
    dmpk[4] = dmpi[4];
    dmpk[6] = dmpi[6];
  } else {
    dampk2 = dampk * dampk;
    dampk3 = dampk * dampk2;
    dampk4 = dampk2 * dampk2;
    dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
    dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
    dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk;
  }
}
/* ----------------------------------------------------------------------
dampmut = mutual field damping coefficients
dampmut generates coefficients for the mutual field damping
function for powers of the interatomic distance
------------------------------------------------------------------------- */
// Generate mutual-field damping coefficients dmpik[2] and dmpik[4] for
// powers of the interatomic distance r (Gordon f1 model, HIPPO).
// Fix vs. original: 7.0 and dampi4/48.0 in the equal-exponent branch were
// missing the (numtyp) cast (double promotion in single-precision builds);
// (numtyp)6.00 normalized to (numtyp)6.0 for consistency.
ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5])
{
  numtyp termi,termk;
  numtyp termi2,termk2;
  numtyp alphai2,alphak2;
  numtyp eps,diff;
  numtyp expi,expk;
  numtyp dampi,dampk;
  numtyp dampi2,dampi3;
  numtyp dampi4,dampi5;
  numtyp dampk2,dampk3;

  // compute tolerance and exponential damping factors
  eps = (numtyp)0.001;
  diff = alphai-alphak; // fabs(alphai-alphak);
  if (diff < (numtyp)0) diff = -diff;
  dampi = alphai * r;
  dampk = alphak * r;
  expi = ucl_exp(-dampi);
  expk = ucl_exp(-dampk);

  // valence-valence charge penetration damping for Gordon f1 (HIPPO)
  dampi2 = dampi * dampi;
  dampi3 = dampi * dampi2;
  if (diff < eps) {
    dampi4 = dampi2 * dampi2;
    dampi5 = dampi2 * dampi3;
    dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
                              (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi;
    dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                              dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
  } else {
    dampk2 = dampk * dampk;
    dampk3 = dampk * dampk2;
    alphai2 = alphai * alphai;
    alphak2 = alphak * alphak;
    termi = alphak2 / (alphak2-alphai2);
    termk = alphai2 / (alphai2-alphak2);
    termi2 = termi * termi;
    termk2 = termk * termk;
    dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
    dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk -
      (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi -
      (numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk;
  }
}
#endif

View File

@ -576,6 +576,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_nbor.stop();
if (_time_device)
time_nbor.add_to_total();
// on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial)
// on the device, transpose the matrix (1-d array) for coalesced reads
// dev_special[i][j] = the special i neighbor of atom j
time_transpose.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
@ -679,6 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
if (_cutoff < _cell_size) vadjust*=1.46;
mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
if (mn<33) mn+=3;
resize_max_neighbors<numtyp,acctyp>(mn,success);
set_nbor_block_size(mn/2);
if (!success)
@ -831,6 +837,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_nbor.stop();
}
// Launch the k_transpose kernel to transpose a columns_in x rows_in
// device matrix of tags from 'in' into 'out'.  The 2-D launch grid is
// sized to cover the matrix with _block_cell_2d x _block_cell_2d blocks
// (grid dims rounded up so partial tiles are included).
void Neighbor::transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
                         const int columns_in, const int rows_in)
{
  const int b2x=_block_cell_2d;
  const int b2y=_block_cell_2d;
  const int g2x=static_cast<int>(ceil(static_cast<double>(columns_in)/b2x));
  const int g2y=static_cast<int>(ceil(static_cast<double>(rows_in)/b2y));

  _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
  _shared->k_transpose.run(&out, &in, &columns_in, &rows_in);
}
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
(double **x, const int inum, const int host_inum, const int nall,
Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,

View File

@ -33,7 +33,7 @@
#endif
#endif
#if defined(USE_HIP)
#if defined(USE_HIP) || defined(__APPLE__)
#define LAL_USE_OLD_NEIGHBOR
#endif
@ -259,6 +259,10 @@ class Neighbor {
return o.str();
}
/// Helper function
void transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
const int columns_in, const int rows_in);
private:
NeighborShared *_shared;
UCL_Device *dev;
@ -289,15 +293,17 @@ class Neighbor {
#endif
int _simd_size;
#ifdef LAL_USE_OLD_NEIGHBOR
inline void set_nbor_block_size(const int mn) {
#ifdef LAL_USE_OLD_NEIGHBOR
int desired=mn/(2*_simd_size);
desired*=_simd_size;
if (desired<_simd_size) desired=_simd_size;
else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build;
_block_nbor_build=desired;
#endif
}
#else
inline void set_nbor_block_size(const int) {}
#endif
};
}

View File

@ -48,6 +48,19 @@ _texture_2d( pos_tex,int4);
#define LAL_USE_OLD_NEIGHBOR
#endif
/*
compute the id of the cell where the atoms belong to
x: atom coordinates
cell_id: cell ids
particle_id:
boxlo[0-2]: the lower left corner of the local box
ncell[xyz]: the number of cells in xyz dims
i_cell_size is the inverse cell size
inum = the number of the local atoms that are ported to the device
nall = the number of the local+ghost atoms that are ported to the device
cells_in_cutoff = the number of cells that are within the cutoff
*/
__kernel void calc_cell_id(const numtyp4 *restrict x_,
unsigned *restrict cell_id,
int *restrict particle_id,
@ -90,6 +103,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
}
}
// compute the number of atoms in each cell
__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
int *restrict cell_counts,
int nall, int ncell) {

View File

@ -273,19 +273,19 @@ __kernel void interp(const __global numtyp4 *restrict x_,
int my=mz+fast_mul(ny,npts_x);
for (int m=0; m<order; m++) {
grdtyp y0=z0*rho1d_1[m][tid];
for (int l=0; l<order; l++) {
grdtyp x0=y0*rho1d_0[l][tid];
grdtyp4 el=brick[my+l];
ek.x-=x0*el.x;
ek.y-=x0*el.y;
ek.z-=x0*el.z;
}
for (int l=0; l<order; l++) {
grdtyp x0=y0*rho1d_0[l][tid];
grdtyp4 el=brick[my+l];
ek.x-=x0*el.x;
ek.y-=x0*el.y;
ek.z-=x0*el.z;
}
my+=npts_x;
}
mz+=npts_yx;
}
}
}
ans[ii]=ek;
}
}
}

View File

@ -182,12 +182,15 @@
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_erfc erfc
#else
#define ucl_exp expf
#define ucl_powr powf
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
@ -195,8 +198,7 @@
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#define ucl_exp expf
#define ucl_powr powf
#define ucl_erfc erfcf
#endif

View File

@ -166,6 +166,7 @@
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_erfc erfc
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
@ -330,6 +331,10 @@
#define NEIGHMASK 0x3FFFFFFF
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
#define SBBITS15 29
#define NEIGHMASK15 0x1FFFFFFF
ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; };
// default to 32-bit smallint and other ints, 64-bit bigint:
// same as defined in src/lmptype.h
#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \

View File

@ -150,7 +150,7 @@ double SWT::host_memory_usage() const {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int SWT::loop(const int eflag, const int vflag, const int evatom,
bool &success) {
bool & /*success*/) {
const int nbor_pitch=this->nbor->nbor_pitch();
// build the short neighbor list

View File

@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4);
} \
}
// (SHUFFLE_AVAIL == 1)
#else
#define local_allocate_acc_zeta()
@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4);
} \
}
// EVFLAG == 0
#else
#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \
@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4);
ans[ii]=old; \
}
#endif
#endif
#endif // EVFLAG
#endif // SHUFFLE_AVAIL
#ifdef LAL_SIMD_IP_SYNC
#define t_per_atom t_per_atom_in

View File

@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
const double* costheta, const double* bigb,
const double* big2b, const double* bigc)
{
int success;
int success=0;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
"k_vashishta_three_end","k_vashishta_short_nbor");
@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int VashishtaT::loop(const int eflag, const int vflag, const int evatom,
bool &success) {
bool & /*success*/) {
const int nbor_pitch=this->nbor->nbor_pitch();
// build the short neighbor list

View File

@ -22,6 +22,7 @@
#include "memory.h"
#include "neighbor.h"
#include "remap_wrap.h"
#include "timer.h"
#include <cmath>
#include <cstring>
@ -326,15 +327,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
cfft[n++] = ZEROF;
}
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// perform forward FFT
fft1->compute(cfft,cfft,FFT3d::FORWARD);
time1 = platform::walltime();
if (SCALE) {
double scale = 1.0/nfft_global;
FFT_SCALAR scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
}
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"PRE Convo / POST FFT");
debug_file(CFFT1,"pre.convo.post.fft");
@ -382,15 +391,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
debug_scalar(FFT,"PRE Convo / POST Remap");
debug_file(FFT,"pre.convo.post.remap");
#endif
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// perform forward FFT
fft1->compute(cfft,cfft,FFT3d::FORWARD);
time1 = platform::walltime();
if (SCALE) {
double scale = 1.0/nfft_global;
FFT_SCALAR scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
}
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"PRE Convo / POST FFT");
debug_file(CFFT1,"pre.convo.post.fft");
@ -423,7 +441,16 @@ void *AmoebaConvolution::post_convolution_3d()
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = platform::walltime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");
@ -465,8 +492,18 @@ void *AmoebaConvolution::post_convolution_4d()
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = platform::walltime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");
debug_file(CFFT2,"post.convo.post.fft");

View File

@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers {
int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out;
int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft;
bigint nfft_global; // nx * ny * nz
double *grid_brick_start; // lower left corner of (c)grid_brick data
FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data
AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int);
~AmoebaConvolution();
@ -47,35 +47,37 @@ class AmoebaConvolution : protected Pointers {
FFT_SCALAR *pre_convolution();
void *post_convolution();
private:
int which; // caller name for convolution being performed
int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick
int nbrick_owned; // owned grid points in brick decomp
int nbrick_ghosts; // owned + ghost brick grid points
int ngrid_either; // max of nbrick_owned or nfft_owned
double time_fft;
protected:
int which; // caller name for convolution being performed
int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick
int nbrick_owned; // owned grid points in brick decomp
int nbrick_ghosts; // owned + ghost brick grid points
int ngrid_either; // max of nbrick_owned or nfft_owned
class Pair *amoeba;
class FFT3d *fft1, *fft2;
class Grid3d *gc;
class Remap *remap;
double ***grid_brick; // 3d real brick grid with ghosts
double ****cgrid_brick; // 4d complex brick grid with ghosts
FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts
FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts
FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector
FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector
double *gc_buf1, *gc_buf2; // buffers for GridComm
double *remap_buf; // buffer for Remap
FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm
FFT_SCALAR *remap_buf; // buffer for Remap
void allocate_grid();
void deallocate_grid();
void *zero_3d();
void *zero_4d();
FFT_SCALAR *pre_convolution_3d();
FFT_SCALAR *pre_convolution_4d();
virtual FFT_SCALAR *pre_convolution_4d();
void *post_convolution_3d();
void *post_convolution_4d();
virtual void *post_convolution_4d();
void procs2grid2d(int, int, int, int &, int &);
// DEBUG

View File

@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
double ***gridpre = (double ***) d_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero();
// map atoms to grid
@ -294,7 +294,7 @@ void PairAmoeba::dispersion_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = d_kspace->pre_convolution();
FFT_SCALAR *gridfft = d_kspace->pre_convolution();
// ---------------------
// convolution operation

View File

@ -24,6 +24,7 @@
#include "math_special.h"
#include "my_page.h"
#include "neigh_list.h"
#include "timer.h"
#include <cmath>
@ -381,8 +382,6 @@ void PairAmoeba::induce()
}
}
// if (comm->me == 0) printf("CG iteration count = %d\n",iter);
// terminate the calculation if dipoles failed to converge
// NOTE: could make this an error
@ -546,13 +545,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp)
}
}
// get the reciprocal space part of the mutual field
if (polar_kspace_flag) umutual1(field,fieldp);
double time0, time1, time2;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// get the real space portion of the mutual field
if (polar_rspace_flag) umutual2b(field,fieldp);
time1 = platform::walltime();
// get the reciprocal space part of the mutual field
if (polar_kspace_flag) umutual1(field,fieldp);
time2 = platform::walltime();
// add the self-energy portion of the mutual field
@ -563,6 +568,11 @@ void PairAmoeba::ufield0c(double **field, double **fieldp)
fieldp[i][j] += term*uinp[i][j];
}
}
// accumulate timing information
time_mutual_rspace += time1 - time0;
time_mutual_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@ -785,7 +795,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
// get the reciprocal space part of the permanent field
double time0, time1, time2;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
if (polar_kspace_flag) udirect1(field);
time1 = platform::walltime();
for (i = 0; i < nlocal; i++) {
for (j = 0; j < 3; j++) {
@ -796,6 +811,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
// get the real space portion of the permanent field
if (polar_rspace_flag) udirect2b(field,fieldp);
time2 = platform::walltime();
// get the self-energy portion of the permanent field
@ -806,6 +822,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
fieldp[i][j] += term*rpole[i][j+1];
}
}
// accumulate timing information
time_direct_kspace += time1 - time0;
time_direct_rspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@ -842,18 +863,26 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
}
}
double time0, time1;
// gridpre = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpre = (double ****) ic_kspace->zero();
FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero();
// map 2 values to grid
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
grid_uind(fuind,fuinp,gridpre);
time1 = platform::walltime();
time_grid_uind += (time1 - time0);
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = ic_kspace->pre_convolution();
FFT_SCALAR *gridfft = ic_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -883,12 +912,18 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
// post-convolution operations including backward FFT
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpost = (double ****) ic_kspace->post_convolution();
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution();
// get potential
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
time1 = platform::walltime();
time_fphi_uind += (time1 - time0);
// store fractional reciprocal potentials for OPT method
if (poltyp == OPT) {
@ -1055,7 +1090,7 @@ void PairAmoeba::udirect1(double **field)
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by setup()
double ***gridpre = (double ***) i_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero();
// map multipole moments to grid
@ -1064,7 +1099,7 @@ void PairAmoeba::udirect1(double **field)
// pre-convolution operations including forward FFT
// gridfft = my 1d portion of complex 3d grid in FFT decomp
double *gridfft = i_kspace->pre_convolution();
FFT_SCALAR *gridfft = i_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -1109,7 +1144,7 @@ void PairAmoeba::udirect1(double **field)
// post-convolution operations including backward FFT
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpost = (double ***) i_kspace->post_convolution();
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution();
// get potential

View File

@ -68,25 +68,23 @@ void PairAmoeba::moduli()
int maxfft = MAX(nfft1,nfft2);
maxfft = MAX(maxfft,nfft3);
double *array = new double[bsorder];
double *bsarray = new double[maxfft];
if (maxfft > _nfft_max) {
memory->destroy(_moduli_bsarray);
_nfft_max = maxfft;
memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray");
}
// compute and load the moduli values
double x = 0.0;
bspline(x,bsorder,array);
bspline(x,bsorder,_moduli_array);
for (i = 0; i < maxfft; i++) bsarray[i] = 0.0;
for (i = 0; i < bsorder; i++) bsarray[i+1] = array[i];
for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0;
for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i];
dftmod(bsmod1,bsarray,nfft1,bsorder);
dftmod(bsmod2,bsarray,nfft2,bsorder);
dftmod(bsmod3,bsarray,nfft3,bsorder);
// perform deallocation of local arrays
delete[] array;
delete[] bsarray;
dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder);
dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder);
dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder);
}
/* ----------------------------------------------------------------------
@ -525,7 +523,7 @@ void PairAmoeba::frac_to_cart()
grid_mpole maps fractional atomic multipoles to PME grid
------------------------------------------------------------------------- */
void PairAmoeba::grid_mpole(double **fmp, double ***grid)
void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid)
{
int i,j,k,m,ib,jb,kb;
double v0,u0,t0;
@ -598,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid)
the particle mesh Ewald grid
------------------------------------------------------------------------- */
void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi)
{
int i,j,k,m,ib,jb,kb;
double v0,v1,v2,v3;
@ -742,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
grid_uind maps fractional induced dipoles to the PME grid
------------------------------------------------------------------------- */
void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid)
{
int i,j,k,m,ib,jb,kb;
double v0,u0,t0;
@ -793,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
------------------------------------------------------------------------- */
void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1,
double **fdip_phi2, double **fdip_sum_phi)
{
int i,j,k,m,ib,jb,kb;
@ -1042,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
grid_disp maps dispersion coefficients to PME grid
------------------------------------------------------------------------- */
void PairAmoeba::grid_disp(double ***grid)
void PairAmoeba::grid_disp(FFT_SCALAR ***grid)
{
int i,j,k,m,ib,jb,kb,itype,iclass;
double v0,u0,t0;

View File

@ -21,6 +21,7 @@
#include "math_const.h"
#include "math_special.h"
#include "neigh_list.h"
#include "timer.h"
#include <cmath>
@ -55,6 +56,8 @@ void PairAmoeba::multipole()
double qixx,qixy,qixz,qiyy,qiyz,qizz;
double cii,dii,qii;
double time0,time1,time2;
// set cutoffs, taper coeffs, and PME params
if (use_ewald) choose(MPOLE_LONG);
@ -78,13 +81,18 @@ void PairAmoeba::multipole()
felec = electric / am_dielectric;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// compute the real space part of the Ewald summation
if (mpole_rspace_flag) multipole_real();
time1 = platform::walltime();
// compute the reciprocal space part of the Ewald summation
if (mpole_kspace_flag) multipole_kspace();
time2 = platform::walltime();
// compute the Ewald self-energy term over all the atoms
@ -109,6 +117,11 @@ void PairAmoeba::multipole()
e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0));
empole += e;
}
// accumulate timing information
time_mpole_rspace += time1 - time0;
time_mpole_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@ -361,6 +374,9 @@ void PairAmoeba::multipole_real()
bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2;
}
for (k = 0; k < 6; k++) bn[k] *= felec;
//if (i == 0 && j < 10) {
// printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]);
//}
// find damped multipole intermediates and energy value
@ -404,6 +420,8 @@ void PairAmoeba::multipole_real()
term2i*rr3i + term2k*rr3k + term2ik*rr3ik +
term3i*rr5i + term3k*rr5k + term3ik*rr5ik;
// find damped multipole intermediates for force and torque
de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik +
@ -444,6 +462,7 @@ void PairAmoeba::multipole_real()
term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9);
term6 = 4.0 * rr7;
}
empole += e;
@ -482,6 +501,7 @@ void PairAmoeba::multipole_real()
tq[i][2] += ttmi[2];
// increment force-based gradient and torque on second site
// commenting out j parts for DEBUGGING
f[j][0] += frcx;
f[j][1] += frcy;
@ -638,7 +658,7 @@ void PairAmoeba::multipole_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpre = (double ***) m_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero();
// map atoms to grid
@ -647,7 +667,7 @@ void PairAmoeba::multipole_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft = m_kspace->pre_convolution();
FFT_SCALAR *gridfft = m_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -718,7 +738,7 @@ void PairAmoeba::multipole_kspace()
// post-convolution operations including backward FFT
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpost = (double ***) m_kspace->post_convolution();
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution();
// get potential

View File

@ -21,6 +21,7 @@
#include "math_const.h"
#include "math_special.h"
#include "neigh_list.h"
#include "timer.h"
#include <cmath>
#include <cstring>
@ -55,6 +56,8 @@ void PairAmoeba::polar()
double fix[3],fiy[3],fiz[3];
double tep[3];
double time0,time1,time2;
// set cutoffs, taper coeffs, and PME params
if (use_ewald) choose(POLAR_LONG);
@ -76,11 +79,16 @@ void PairAmoeba::polar()
// compute the real space part of the dipole interactions
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
if (polar_rspace_flag) polar_real();
time1 = platform::walltime();
// compute the reciprocal space part of dipole interactions
if (polar_kspace_flag) polar_kspace();
time2 = platform::walltime();
// compute the Ewald self-energy torque and virial terms
@ -133,6 +141,11 @@ void PairAmoeba::polar()
virpolar[4] -= vxz;
virpolar[5] -= vyz;
}
// accumulate timing information
time_polar_rspace += time1 - time0;
time_polar_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@ -382,7 +395,7 @@ void PairAmoeba::polar_real()
factor_uscale = 1.0;
}
}
//if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale);
r = sqrt(r2);
ck = rpole[j][0];
dkx = rpole[j][1];
@ -597,7 +610,6 @@ void PairAmoeba::polar_real()
dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir;
dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir;
dufld[i][5] += zr*tiz5 + zr*zr*tuir;
dufld[j][0] -= xr*tkx5 + xr*xr*tukr;
dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr;
dufld[j][2] -= yr*tky5 + yr*yr*tukr;
@ -855,6 +867,7 @@ void PairAmoeba::polar_real()
frcx = -2.0 * depx;
frcy = -2.0 * depy;
frcz = -2.0 * depz;
}
// get the dtau/dr terms used for mutual polarization force
@ -1327,7 +1340,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpre = (double ***) p_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1336,7 +1349,7 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft = p_kspace->pre_convolution();
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -1386,7 +1399,7 @@ void PairAmoeba::polar_kspace()
// post-convolution operations including backward FFT
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpost = (double ***) p_kspace->post_convolution();
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution();
// get potential
@ -1419,7 +1432,7 @@ void PairAmoeba::polar_kspace()
// gridpre2 = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpre2 = (double ****) pc_kspace->zero();
FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero();
// map 2 values to grid
@ -1428,7 +1441,7 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = pc_kspace->pre_convolution();
FFT_SCALAR *gridfft = pc_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -1451,7 +1464,7 @@ void PairAmoeba::polar_kspace()
// post-convolution operations including backward FFT
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpost = (double ****) pc_kspace->post_convolution();
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution();
// get potential
@ -1857,7 +1870,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
double ***gridpre = (double ***) p_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1887,7 +1900,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
gridpre = (double ***) p_kspace->zero();
gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1896,7 +1909,7 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors
double *gridfft2 = p_kspace->pre_convolution();
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -1953,7 +1966,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
double ***gridpre = (double ***) p_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1962,12 +1975,12 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft = p_kspace->pre_convolution();
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
// gridfft1 = copy of first FFT
int nfft_owned = p_kspace->nfft_owned;
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double));
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR));
// assign ??? to the PME grid
@ -1982,7 +1995,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
gridpre = (double ***) p_kspace->zero();
gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1991,7 +2004,7 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft2 = p_kspace->pre_convolution();
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
// ---------------------
// convolution operation

View File

@ -194,8 +194,8 @@ void FixAmoebaBiTorsion::init()
// error check that PairAmoeba or PairHiippo exist
pair = nullptr;
pair = force->pair_match("amoeba",1,0);
if (!pair) pair = force->pair_match("hippo",1,0);
pair = force->pair_match("^amoeba",0,0);
if (!pair) pair = force->pair_match("^hippo",0,0);
if (!pair)
error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo");

View File

@ -285,8 +285,9 @@ void ImproperAmoeba::init_style()
// check if PairAmoeba disabled improper terms
Pair *pair = nullptr;
pair = force->pair_match("amoeba",1,0);
if (!pair) pair = force->pair_match("hippo",1,0);
pair = force->pair_match("^amoeba",0,0);
if (!pair) pair = force->pair_match("^hippo",0,0);
if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo");
int tmp;

View File

@ -29,6 +29,7 @@
#include "my_page.h"
#include "neigh_list.h"
#include "neighbor.h"
#include "timer.h"
#include "update.h"
#include <cmath>
@ -47,6 +48,7 @@ enum{MUTUAL,OPT,TCG,DIRECT};
enum{GEAR,ASPC,LSQR};
#define DELTASTACK 16
#define DEBUG_AMOEBA 0
/* ---------------------------------------------------------------------- */
@ -85,6 +87,10 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp)
cmp = fmp = nullptr;
cphi = fphi = nullptr;
_moduli_array = nullptr;
_moduli_bsarray = nullptr;
_nfft_max = 0;
poli = nullptr;
conj = conjp = nullptr;
vec = vecp = nullptr;
@ -227,6 +233,9 @@ PairAmoeba::~PairAmoeba()
memory->destroy(fphidp);
memory->destroy(cphidp);
memory->destroy(_moduli_array);
memory->destroy(_moduli_bsarray);
memory->destroy(thetai1);
memory->destroy(thetai2);
memory->destroy(thetai3);
@ -349,12 +358,22 @@ void PairAmoeba::compute(int eflag, int vflag)
if (update->ntimestep <= update->beginstep+1) {
time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0;
time_induce = time_polar = time_qxfer = 0.0;
time_mpole_rspace = time_mpole_kspace = 0.0;
time_direct_rspace = time_direct_kspace = 0.0;
time_mutual_rspace = time_mutual_kspace = 0.0;
time_polar_rspace = time_polar_kspace = 0.0;
time_grid_uind = time_fphi_uind = 0.0;
if (ic_kspace) {
ic_kspace->time_fft = 0.0;
}
}
double time0,time1,time2,time3,time4,time5,time6,time7,time8;
MPI_Barrier(world);
time0 = MPI_Wtime();
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// if reneighboring step:
// augment neighbor list to include 1-5 neighbor flags
@ -410,8 +429,7 @@ void PairAmoeba::compute(int eflag, int vflag)
comm->forward_comm(this);
if (amoeba) pbc_xred();
time1 = MPI_Wtime();
time1 = platform::walltime();
// ----------------------------------------
// compute components of force field
@ -420,22 +438,22 @@ void PairAmoeba::compute(int eflag, int vflag)
// buffered 14-7 Vdwl, pairwise
if (amoeba && hal_flag) hal();
time2 = MPI_Wtime();
time2 = platform::walltime();
// Pauli repulsion, pairwise
if (!amoeba && repulse_flag) repulsion();
time3 = MPI_Wtime();
time3 = platform::walltime();
// Ewald dispersion, pairwise and long range
if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion();
time4 = MPI_Wtime();
time4 = platform::walltime();
// multipole, pairwise and long range
if (mpole_rspace_flag || mpole_kspace_flag) multipole();
time5 = MPI_Wtime();
time5 = platform::walltime();
// induced dipoles, interative CG relaxation
// communicate induce() output values needed by ghost atoms
@ -445,17 +463,17 @@ void PairAmoeba::compute(int eflag, int vflag)
cfstyle = INDUCE;
comm->forward_comm(this);
}
time6 = MPI_Wtime();
time6 = platform::walltime();
// dipoles, pairwise and long range
if (polar_rspace_flag || polar_kspace_flag) polar();
time7 = MPI_Wtime();
time7 = platform::walltime();
// charge transfer, pairwise
if (!amoeba && qxfer_flag) charge_transfer();
time8 = MPI_Wtime();
time8 = platform::walltime();
// store energy components for output by compute pair command
@ -518,6 +536,44 @@ void PairAmoeba::finish()
MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_qxfer = ave/comm->nprocs;
#if DEBUG_AMOEBA
// real-space/kspace breakdown
MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mpole_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mpole_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_direct_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_direct_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_polar_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_polar_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_grid_uind = ave/comm->nprocs;
MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_fphi_uind = ave/comm->nprocs;
double time_mutual_fft = 0;
if (ic_kspace) time_mutual_fft = ic_kspace->time_fft;
MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_fft = ave/comm->nprocs;
#endif // DEBUG_AMOEBA
double time_total = (time_init + time_hal + time_repulse + time_disp +
time_mpole + time_induce + time_polar + time_qxfer) / 100.0;
@ -534,8 +590,27 @@ void PairAmoeba::finish()
utils::logmesg(lmp," Induce time: {:<12.6g} {:6.2f}%\n", time_induce, time_induce/time_total);
utils::logmesg(lmp," Polar time: {:<12.6g} {:6.2f}%\n", time_polar, time_polar/time_total);
if (!amoeba)
utils::logmesg(lmp," Qxfer time: {:<12.6g} {:6.2f}%\n", time_qxfer, time_qxfer/time_total);
utils::logmesg(lmp," Total time: {:<12.6g}\n",time_total * 100.0);
utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total);
utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0);
#if DEBUG_AMOEBA
double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace;
double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace;
utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total);
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total);
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total);
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total);
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total);
utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total);
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total);
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total);
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total);
utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total);
utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total);
utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total);
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total);
#endif
}
}
@ -2320,6 +2395,8 @@ void PairAmoeba::grow_local()
firstneigh_pcpc = (double **)
memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc");
}
memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array");
}
/* ----------------------------------------------------------------------

View File

@ -82,6 +82,12 @@ class PairAmoeba : public Pair {
double time_init, time_hal, time_repulse, time_disp;
double time_mpole, time_induce, time_polar, time_qxfer;
double time_mpole_rspace, time_mpole_kspace;
double time_direct_rspace, time_direct_kspace;
double time_mutual_rspace, time_mutual_kspace;
double time_polar_rspace, time_polar_kspace;
double time_grid_uind, time_fphi_uind;
// energy/virial components
double ehal, erepulse, edisp, epolar, empole, eqxfer;
@ -324,8 +330,12 @@ class PairAmoeba : public Pair {
double *qfac; // convolution pre-factors
double *gridfft1; // copy of p_kspace FFT grid
double **cmp, **fmp; // Cartesian and fractional multipoles
double **cphi, **fphi;
double **cmp,**fmp; // Cartesian and fractional multipoles
double **cphi,**fphi;
double *_moduli_array; // buffers for moduli
double *_moduli_bsarray;
int _nfft_max;
// params for current KSpace solve and FFT being worked on
@ -335,8 +345,12 @@ class PairAmoeba : public Pair {
double ctf[10][10]; // indices NOT flipped vs Fortran
double ftc[10][10]; // indices NOT flipped vs Fortran
class AmoebaConvolution *m_kspace, *p_kspace, *pc_kspace, *d_kspace;
class AmoebaConvolution *i_kspace, *ic_kspace;
class AmoebaConvolution *m_kspace; // multipole KSpace
class AmoebaConvolution *p_kspace; // polar KSpace
class AmoebaConvolution *pc_kspace;
class AmoebaConvolution *d_kspace; // dispersion KSpace
class AmoebaConvolution *i_kspace; // induce KSpace
class AmoebaConvolution *ic_kspace;
// FFT grid size factors
@ -347,33 +361,33 @@ class PairAmoeba : public Pair {
void hal();
void repulsion();
void damprep(double, double, double, double, double, double, double, double, int, double, double,
double *);
virtual void repulsion();
void damprep(double, double, double, double, double, double, double, double,
int, double, double, double *);
void dispersion();
void dispersion_real();
virtual void dispersion_real();
void dispersion_kspace();
void multipole();
void multipole_real();
virtual void multipole_real();
void multipole_kspace();
void polar();
void polar_energy();
void polar_real();
void polar_kspace();
virtual void polar_real();
virtual void polar_kspace();
void damppole(double, int, double, double, double *, double *, double *);
void induce();
virtual void induce();
void ulspred();
void ufield0c(double **, double **);
virtual void ufield0c(double **, double **);
void uscale0b(int, double **, double **, double **, double **);
void dfield0c(double **, double **);
void umutual1(double **, double **);
void umutual2b(double **, double **);
virtual void umutual1(double **, double **);
virtual void umutual2b(double **, double **);
void udirect1(double **);
void udirect2b(double **, double **);
virtual void udirect2b(double **, double **);
void dampmut(double, double, double, double *);
void dampdir(double, double, double, double *, double *);
void cholesky(int, double *, double *);
@ -393,11 +407,11 @@ class PairAmoeba : public Pair {
void fphi_to_cphi(double **, double **);
void frac_to_cart();
void grid_mpole(double **, double ***);
void fphi_mpole(double ***, double **);
void grid_uind(double **, double **, double ****);
void fphi_uind(double ****, double **, double **, double **);
void grid_disp(double ***);
void grid_mpole(double **, FFT_SCALAR ***);
void fphi_mpole(FFT_SCALAR ***, double **);
void grid_uind(double **, double **, FFT_SCALAR ****);
virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **);
void grid_disp(FFT_SCALAR ***);
void kewald();
void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &,

View File

@ -45,6 +45,10 @@ depend () {
# add one if statement per parent package
# add one depend() call per child package that depends on that parent
if (test $1 = "AMOEBA") then
depend GPU
fi
if (test $1 = "ASPHERE") then
depend GPU
depend OPENMP

View File

@ -28,6 +28,8 @@ action () {
# list of files with optional dependencies
action amoeba_convolution_gpu.cpp amoeba_convolution.cpp
action amoeba_convolution_gpu.h amoeba_convolution.cpp
action fix_gpu.cpp
action fix_gpu.h
action fix_nve_gpu.h
@ -41,6 +43,8 @@ action fix_npt_gpu.cpp
action fix_nve_asphere_gpu.h fix_nve_asphere.h
action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp
action gpu_extra.h
action pair_amoeba_gpu.cpp pair_amoeba.cpp
action pair_amoeba_gpu.h pair_amoeba.h
action pair_beck_gpu.cpp pair_beck.cpp
action pair_beck_gpu.h pair_beck.h
action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp
@ -89,6 +93,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp
action pair_gauss_gpu.h pair_gauss.h
action pair_gayberne_gpu.cpp pair_gayberne.cpp
action pair_gayberne_gpu.h pair_gayberne.cpp
action pair_hippo_gpu.cpp pair_hippo.cpp
action pair_hippo_gpu.h pair_hippo.cpp
action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp
action pair_lj96_cut_gpu.h pair_lj96_cut.h
action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp
@ -113,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp
action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h
action pair_lj_cut_gpu.cpp
action pair_lj_cut_gpu.h
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
action pair_lj_smooth_gpu.cpp pair_lj_smooth.cpp
action pair_lj_smooth_gpu.h pair_lj_smooth.cpp
action pair_lj_expand_gpu.cpp
@ -155,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp
action pppm_gpu.h pppm.cpp
action pair_ufm_gpu.cpp pair_ufm.cpp
action pair_ufm_gpu.h pair_ufm.h
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
# edit 2 Makefile.package files to include/exclude package info

View File

@ -0,0 +1,181 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/ Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "amoeba_convolution_gpu.h"
#include "comm.h"
#include "fft3d_wrap.h"
#include "remap_wrap.h"
#include "grid3d.h"
using namespace LAMMPS_NS;
// DEBUG
#define DEBUG_AMOEBA 0
#if DEBUG_AMOEBA
// labels for the six convolution flavors selected by "which";
// sized [6] to match the initializer list (was [7], leaving a stray null slot);
// static: an identical table exists in amoeba_convolution.cpp, so internal
// linkage avoids a duplicate-symbol link error when debugging is enabled in both
static char *labels[6] =
  {(char *) "MPOLE_GRID", (char *) "POLAR_GRID",
   (char *) "POLAR_GRIDC", (char *) "DISP_GRID",
   (char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"};
// stages at which debug_scalar()/debug_file() can snapshot grid data
enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2};
#endif
// END DEBUG

// if non-zero, rescale FFT data by 1/nfft_global after the forward FFT
#define SCALE 0

//#define USE_AMOEBA_FFT
#ifdef USE_AMOEBA_FFT
// External functions from GPU library
int amoeba_setup_fft(const int size, const int numel, const int element_type);
int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode);
#endif
/* ----------------------------------------------------------------------
partition an FFT grid across processors
both for a brick and FFT x pencil decomposition
nx,ny,nz = global FFT grid size
order = size of stencil in each dimension that maps atoms to grid
adapted from PPPM::set_grid_local()
------------------------------------------------------------------------- */
// construct the GPU flavor of the convolution helper; all grid setup
// (brick/pencil partitioning, FFT plans, comm buffers) is done by the
// base-class constructor, so this simply forwards its arguments
AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, int nx_caller,
                                           int ny_caller, int nz_caller,
                                           int order_caller, int which_caller) :
    AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller,
                      order_caller, which_caller) {}
/* ----------------------------------------------------------------------
   perform pre-convolution grid operations for 4d cgrid_brick array
   gathers ghost contributions into the owned brick, packs the complex
   (re,im) pairs into the 1d cfft buffer, remaps brick -> x-pencil,
   then performs the forward FFT (accumulating FFT time in time_fft)
   returns cfft, the FFT-space data the caller convolves against
------------------------------------------------------------------------- */

FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
{
  int ix,iy,iz,n;

  // reverse comm for 4d brick grid + ghosts
  // sums ghost-cell contributions back into the owned grid cells

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d");
#endif
  gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);
#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d");
  debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d");
#endif

  // copy owned 4d brick grid values to FFT grid
  // two scalars (real, imaginary) per grid point, interleaved in cfft

  n = 0;
  for (iz = nzlo_in; iz <= nzhi_in; iz++)
    for (iy = nylo_in; iy <= nyhi_in; iy++)
      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
        cfft[n++] = cgrid_brick[iz][iy][ix][0];
        cfft[n++] = cgrid_brick[iz][iy][ix][1];
      }

  // remap FFT grid from brick to x pencil partitioning
  // NOTE: could just setup FFT to start from brick decomp and skip remap

  remap->perform(cfft,cfft,remap_buf);

#if DEBUG_AMOEBA
  debug_scalar(FFT,"PRE Convo / POST Remap");
  debug_file(FFT,"pre.convo.post.remap");
#endif

  // barrier so the measured FFT time is not skewed by load imbalance

  double time0,time1;
  MPI_Barrier(world);
  time0 = platform::walltime();

  // perform forward FFT
  // USE_AMOEBA_FFT routes through the GPU library's 1d FFT when enabled

#ifdef USE_AMOEBA_FFT
  amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD);
#else
  fft1->compute(cfft,cfft,FFT3d::FORWARD);
#endif
  time1 = platform::walltime();
  time_fft += time1 - time0;

  // optional 1/N normalization after the forward transform (SCALE is 0 by default)

  if (SCALE) {
    double scale = 1.0/nfft_global;
    for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
  }

#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"PRE Convo / POST FFT");
  debug_file(CFFT1,"pre.convo.post.fft");
#endif
  return cfft;
}
/* ----------------------------------------------------------------------
   perform post-convolution grid operations for 4d cgrid_brick array
   backward FFT on the pencil data, unpack the interleaved (re,im)
   values into the owned 4d brick grid, then forward-comm ghost cells
   returns the brick grid as an opaque pointer
------------------------------------------------------------------------- */

void *AmoebaConvolutionGPU::post_convolution_4d()
{
#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"POST Convo / PRE FFT");
  debug_file(CFFT1,"post.convo.pre.fft");
#endif

  // backward FFT, timed across all ranks (barrier removes load-imbalance skew)

  MPI_Barrier(world);
  double tstart = platform::walltime();
  fft2->compute(cfft,cfft,FFT3d::BACKWARD);
  time_fft += platform::walltime() - tstart;

#if DEBUG_AMOEBA
  debug_scalar(CFFT2,"POST Convo / POST FFT");
  debug_file(CFFT2,"post.convo.post.fft");
#endif

  // scatter the 1d interleaved complex values back into the owned
  // portion of the 4d brick grid: two scalars (re,im) per grid point

  int m = 0;
  for (int kz = nzlo_in; kz <= nzhi_in; kz++)
    for (int ky = nylo_in; ky <= nyhi_in; ky++)
      for (int kx = nxlo_in; kx <= nxhi_in; kx++) {
        cgrid_brick[kz][ky][kx][0] = cfft[m++];
        cgrid_brick[kz][ky][kx][1] = cfft[m++];
      }

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d");
  debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d");
#endif

  // forward comm populates the ghost cells of the brick grid

  gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);

  return (void *) cgrid_brick;
}

View File

@ -0,0 +1,32 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/ Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H
#define LMP_AMOEBA_CONVOLUTION_GPU_H
#include "amoeba_convolution.h"
namespace LAMMPS_NS {
// GPU-side variant of AmoebaConvolution: overrides only the 4d
// pre-/post-convolution grid operations; all grid partitioning and
// FFT setup is inherited from the base class
class AmoebaConvolutionGPU : public AmoebaConvolution {
 public:
  // forwards (lmp, pair, nx, ny, nz, order, which) to the base constructor
  AmoebaConvolutionGPU(class LAMMPS *, class Pair *, int, int, int, int, int);
  FFT_SCALAR *pre_convolution_4d() override;   // brick -> FFT space (forward FFT)
  void *post_convolution_4d() override;        // FFT space -> brick (backward FFT)
};
} // namespace LAMMPS_NS
#endif

View File

@ -131,7 +131,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
_gpu_mode = GPU_NEIGH;
_particle_split = 1.0;
int nthreads = 0;
int newtonflag = 0;
int newtonflag = force->newton_pair;
int threads_per_atom = -1;
double binsize = 0.0;
char *opencl_args = nullptr;
@ -360,6 +360,8 @@ double FixGPU::memory_usage()
return bytes;
}
/* ---------------------------------------------------------------------- */
double FixGPU::binsize(const double subx, const double suby,
const double subz, const int nlocal,
const double cut) {

2067
src/GPU/pair_amoeba_gpu.cpp Normal file

File diff suppressed because it is too large Load Diff

72
src/GPU/pair_amoeba_gpu.h Normal file
View File

@ -0,0 +1,72 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
// clang-format off
PairStyle(amoeba/gpu,PairAmoebaGPU);
// clang-format on
#else
#ifndef LMP_PAIR_AMOEBA_GPU_H
#define LMP_PAIR_AMOEBA_GPU_H
#include "pair_amoeba.h"
namespace LAMMPS_NS {
// GPU-accelerated AMOEBA pair style: overrides the virtual real-space and
// induced-dipole hooks declared in PairAmoeba to offload them to the GPU library
class PairAmoebaGPU : public PairAmoeba {
 public:
  PairAmoebaGPU(LAMMPS *lmp);
  ~PairAmoebaGPU() override;
  void init_style() override;
  double memory_usage() override;

  // neighbor-list handling modes for the GPU library
  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };

  // overrides of PairAmoeba virtuals (see pair_amoeba.h)
  void induce() override;
  void multipole_real() override;
  void udirect2b(double **, double **) override;
  void umutual1(double **, double **) override;
  void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
  void umutual2b(double **, double **) override;
  void ufield0c(double **, double **) override;
  void polar_real() override;
  void polar_kspace() override;

 private:
  int gpu_mode;          // one of GPU_FORCE/GPU_NEIGH/GPU_HYB_NEIGH
  double cpu_time;
  void *tq_pinned;       // presumably host-pinned torque buffer — confirm in pair_amoeba_gpu.cpp
  void *fieldp_pinned;   // presumably host-pinned field buffer — confirm in pair_amoeba_gpu.cpp
  bool acc_float;        // NOTE(review): looks like single-precision accumulation flag — verify

  // which terms the GPU library implementation supports; when a flag is
  // false the corresponding CPU base-class path is used instead (assumed —
  // set in the .cpp, not shown here)
  bool gpu_hal_ready;
  bool gpu_repulsion_ready;
  bool gpu_dispersion_real_ready;
  bool gpu_multipole_real_ready;
  bool gpu_udirect2b_ready;
  bool gpu_umutual1_ready;
  bool gpu_fphi_uind_ready;
  bool gpu_umutual2b_ready;
  bool gpu_polar_real_ready;

  // CPU fallback for udirect2b
  void udirect2b_cpu();

  // unpack per-atom torque of type numtyp and accumulate force/virial
  template<class numtyp>
  void compute_force_from_torque(const numtyp*, double**, double*);
};
} // namespace LAMMPS_NS
#endif
#endif

1494
src/GPU/pair_hippo_gpu.cpp Normal file

File diff suppressed because it is too large Load Diff

73
src/GPU/pair_hippo_gpu.h Normal file
View File

@ -0,0 +1,73 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
// clang-format off
PairStyle(hippo/gpu,PairHippoGPU);
// clang-format on
#else
#ifndef LMP_PAIR_HIPPO_GPU_H
#define LMP_PAIR_HIPPO_GPU_H
#include "pair_amoeba.h"
namespace LAMMPS_NS {
// GPU-accelerated HIPPO pair style; derives from PairAmoeba (which also
// implements the HIPPO model) and overrides its virtual hooks, including
// the HIPPO-specific repulsion() and dispersion_real() terms
class PairHippoGPU : public PairAmoeba {
 public:
  PairHippoGPU(LAMMPS *lmp);
  ~PairHippoGPU() override;
  void init_style() override;
  double memory_usage() override;

  // neighbor-list handling modes for the GPU library
  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };

  // overrides of PairAmoeba virtuals (see pair_amoeba.h)
  void induce() override;
  void repulsion() override;
  void dispersion_real() override;
  void multipole_real() override;
  void udirect2b(double **, double **) override;
  void umutual1(double **, double **) override;
  void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
  void umutual2b(double **, double **) override;
  void ufield0c(double **, double **) override;
  void polar_real() override;

 private:
  int gpu_mode;          // one of GPU_FORCE/GPU_NEIGH/GPU_HYB_NEIGH
  double cpu_time;
  void *tq_pinned;       // presumably host-pinned torque buffer — confirm in pair_hippo_gpu.cpp
  void *fieldp_pinned;   // presumably host-pinned field buffer — confirm in pair_hippo_gpu.cpp
  bool acc_float;        // NOTE(review): looks like single-precision accumulation flag — verify

  // which terms the GPU library implementation supports; when a flag is
  // false the corresponding CPU base-class path is used instead (assumed —
  // set in the .cpp, not shown here)
  bool gpu_hal_ready;
  bool gpu_repulsion_ready;
  bool gpu_dispersion_real_ready;
  bool gpu_multipole_real_ready;
  bool gpu_udirect2b_ready;
  bool gpu_umutual1_ready;
  bool gpu_fphi_uind_ready;
  bool gpu_umutual2b_ready;
  bool gpu_polar_real_ready;

  // CPU fallback for udirect2b
  void udirect2b_cpu();

  // unpack per-atom torque of type numtyp and accumulate force/virial
  template<class numtyp>
  void compute_force_from_torque(const numtyp*, double**, double*);
};
} // namespace LAMMPS_NS
#endif
#endif

View File

@ -21,6 +21,7 @@
#include "atom.h"
#include "atom_vec.h"
#include "citeme.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
@ -36,12 +37,25 @@
#include "pair_reaxff.h"
#include "reaxff_defs.h"
#include <algorithm>
#include <cstring>
#include <exception>
#include <random>
using namespace LAMMPS_NS;
using namespace FixConst;
static const char cite_reaxff_species_delete[] =
"fix reaxff/species, 'delete' keyword: https://doi.org/10.1016/j.carbon.2022.11.002\n\n"
"@Article{Gissinger23,\n"
" author = {J. R. Gissinger, S. R. Zavada, J. G. Smith, J. Kemppainen, I. Gallegos, G. M. Odegard, E. J. Siochi, K. E. Wise},\n"
" title = {Predicting char yield of high-temperature resins},\n"
" journal = {Carbon},\n"
" year = 2023,\n"
" volume = 202,\n"
" pages = {336-347}\n"
"}\n\n";
/* ---------------------------------------------------------------------- */
FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
@ -145,6 +159,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
ele = filepos = filedel = nullptr;
eleflag = posflag = padflag = 0;
delflag = specieslistflag = masslimitflag = 0;
delete_Nlimit = delete_Nsteps = 0;
singlepos_opened = multipos_opened = del_opened = 0;
multipos = 0;
@ -221,7 +236,12 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
} else
error->all(FLERR, "Unknown fix reaxff/species delete option: {}", arg[iarg]);
// rate limit when deleting molecules
} else if (strcmp(arg[iarg], "delete_rate_limit") == 0) {
if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species delete_rate_limit", error);
delete_Nlimit = utils::numeric(FLERR, arg[iarg+1], false, lmp);
delete_Nsteps = utils::numeric(FLERR, arg[iarg+2], false, lmp);
iarg += 3;
// position of molecules
} else if (strcmp(arg[iarg], "position") == 0) {
if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species position", error);
@ -260,6 +280,15 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
if (delflag && specieslistflag && masslimitflag)
error->all(FLERR, "Incompatible combination fix reaxff/species command options");
if (delete_Nlimit > 0) {
if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete);
memory->create(delete_Tcount,delete_Nsteps,"reaxff/species:delete_Tcount");
for (int i = 0; i < delete_Nsteps; i++)
delete_Tcount[i] = -1;
delete_Tcount[0] = 0;
}
vector_nmole = 0;
vector_nspec = 0;
}
@ -279,6 +308,7 @@ FixReaxFFSpecies::~FixReaxFFSpecies()
memory->destroy(Mol2Spec);
memory->destroy(MolType);
memory->destroy(MolName);
memory->destroy(delete_Tcount);
delete[] filepos;
delete[] filedel;
@ -375,7 +405,13 @@ void FixReaxFFSpecies::Output_ReaxFF_Bonds(bigint ntimestep, FILE * /*fp*/)
// point to fix_ave_atom
f_SPECBOND->end_of_step();
if (ntimestep != nvalid) return;
if (ntimestep != nvalid) {
// push back delete_Tcount on every step
if (delete_Nlimit > 0)
for (int i = delete_Nsteps-1; i > 0; i--)
delete_Tcount[i] = delete_Tcount[i-1];
return;
}
nlocal = atom->nlocal;
@ -826,6 +862,15 @@ void FixReaxFFSpecies::WritePos(int Nmole, int Nspec)
void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
{
int ndeletions;
int headroom = -1;
if (delete_Nlimit > 0) {
if (delete_Tcount[delete_Nsteps-1] == -1) return;
ndeletions = delete_Tcount[0] - delete_Tcount[delete_Nsteps-1];
headroom = MAX(0, delete_Nlimit - ndeletions);
if (headroom == 0) return;
}
int i, j, m, n, itype, cid;
int ndel, ndelone, count, count_tmp;
int *Nameall;
@ -856,7 +901,23 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
int *marklist;
memory->create(marklist, nlocal, "reaxff/species:marklist");
for (m = 1; m <= Nmole; m++) {
std::random_device rnd;
std::minstd_rand park_rng(rnd());
int *molrange;
memory->create(molrange,Nmole,"reaxff/species:molrange");
for (m = 0; m < Nmole; m++)
molrange[m] = m + 1;
if (delete_Nlimit > 0) {
// shuffle index when using rate_limit, in case order is biased
if (comm->me == 0)
std::shuffle(&molrange[0],&molrange[Nmole], park_rng);
MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world);
}
int this_delete_Tcount = 0;
for (int mm = 0; mm < Nmole; mm++) {
if (this_delete_Tcount == headroom) break;
m = molrange[mm];
localmass = totalmass = count = nmarklist = 0;
for (n = 0; n < ntypes; n++) Name[n] = 0;
@ -896,6 +957,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
// find corresponding moltype
if (totalmass > massmin && totalmass < massmax) {
this_delete_Tcount++;
for (j = 0; j < nmarklist; j++) {
mark[marklist[j]] = 1;
deletecount[Mol2Spec[m - 1]] += 1.0 / (double) count;
@ -905,6 +967,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
if (count > 0) {
for (i = 0; i < ndelspec; i++) {
if (del_species[i] == species_str) {
this_delete_Tcount++;
for (j = 0; j < nmarklist; j++) {
mark[marklist[j]] = 1;
deletecount[i] += 1.0 / (double) count;
@ -976,6 +1039,14 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
}
}
// push back delete_Tcount on every step
if (delete_Nlimit > 0) {
for (i = delete_Nsteps-1; i > 0; i--)
delete_Tcount[i] = delete_Tcount[i-1];
delete_Tcount[0] += this_delete_Tcount;
}
if (ndel && (atom->map_style != Atom::MAP_NONE)) {
atom->nghost = 0;
atom->map_init();
@ -988,6 +1059,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
memory->destroy(marklist);
memory->destroy(mark);
memory->destroy(deletecount);
memory->destroy(molrange);
}
/* ---------------------------------------------------------------------- */

View File

@ -60,6 +60,7 @@ class FixReaxFFSpecies : public Fix {
FILE *fp, *pos, *fdel;
int eleflag, posflag, multipos, padflag, setupflag;
int delflag, specieslistflag, masslimitflag;
int delete_Nlimit, delete_Nsteps, *delete_Tcount;
double massmin, massmax;
int singlepos_opened, multipos_opened, del_opened;
char *ele, **eletype, *filepos, *filedel;

View File

@ -2345,6 +2345,18 @@ void Atom::setup_sort_bins()
return;
}
#ifdef LMP_GPU
if (userbinsize == 0.0) {
auto ifix = dynamic_cast<FixGPU *>(modify->get_fix_by_id("package_gpu"));
if (ifix) {
const double subx = domain->subhi[0] - domain->sublo[0];
const double suby = domain->subhi[1] - domain->sublo[1];
const double subz = domain->subhi[2] - domain->sublo[2];
binsize = ifix->binsize(subx, suby, subz, atom->nlocal, 0.5 * neighbor->cutneighmax);
}
}
#endif
double bininv = 1.0/binsize;
// nbin xyz = local bins

View File

@ -535,6 +535,7 @@ void Neighbor::init()
int flag=0;
for (int isub=0; isub < ph->nstyles; ++isub) {
if (force->pair_match("amoeba",0,isub)
|| force->pair_match("hippo",0,isub)
|| force->pair_match("coul/wolf",0,isub)
|| force->pair_match("coul/dsf",0,isub)
|| force->pair_match("coul/exclude",0)
@ -545,6 +546,7 @@ void Neighbor::init()
special_flag[1] = special_flag[2] = special_flag[3] = 2;
} else {
if (force->pair_match("amoeba",0)
|| force->pair_match("hippo",0)
|| force->pair_match("coul/wolf",0)
|| force->pair_match("coul/dsf",0)
|| force->pair_match("coul/exclude",0)