Merge branch 'develop'
This commit is contained in:
@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir)
|
||||
endfunction()
|
||||
|
||||
macro(pkg_depends PKG1 PKG2)
|
||||
if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2}))
|
||||
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package")
|
||||
if(DEFINED BUILD_${PKG2})
|
||||
if(PKG_${PKG1} AND NOT BUILD_${PKG2})
|
||||
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON")
|
||||
endif()
|
||||
elseif(DEFINED PKG_${PKG2})
|
||||
if(PKG_${PKG1} AND NOT PKG_${PKG2})
|
||||
message(WARNING "The ${PKG1} package depends on the ${PKG2} package. Enabling it.")
|
||||
set(PKG_${PKG2} ON CACHE BOOL "" FORCE)
|
||||
endif()
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
|
||||
@ -1,4 +1,9 @@
|
||||
find_package(ZLIB REQUIRED)
|
||||
find_package(ZLIB)
|
||||
if(NOT ZLIB_FOUND)
|
||||
message(WARNING "No Zlib development support found. Disabling COMPRESS package...")
|
||||
set(PKG_COMPRESS OFF CACHE BOOL "" FORCE)
|
||||
return()
|
||||
endif()
|
||||
target_link_libraries(lammps PRIVATE ZLIB::ZLIB)
|
||||
|
||||
find_package(PkgConfig QUIET)
|
||||
|
||||
@ -26,6 +26,19 @@ elseif(GPU_PREC STREQUAL "SINGLE")
|
||||
set(GPU_PREC_SETTING "SINGLE_SINGLE")
|
||||
endif()
|
||||
|
||||
option(GPU_DEBUG "Enable debugging code of the GPU package" OFF)
|
||||
mark_as_advanced(GPU_DEBUG)
|
||||
|
||||
if(PKG_AMOEBA AND FFT_SINGLE)
|
||||
message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT")
|
||||
endif()
|
||||
|
||||
if (PKG_AMOEBA)
|
||||
list(APPEND GPU_SOURCES
|
||||
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h
|
||||
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp)
|
||||
endif()
|
||||
|
||||
file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
|
||||
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
|
||||
|
||||
@ -151,7 +164,12 @@ if(GPU_API STREQUAL "CUDA")
|
||||
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
|
||||
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
|
||||
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS})
|
||||
if(GPU_DEBUG)
|
||||
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
|
||||
else()
|
||||
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
|
||||
endif()
|
||||
if(CUDPP_OPT)
|
||||
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
|
||||
@ -192,6 +210,7 @@ elseif(GPU_API STREQUAL "OPENCL")
|
||||
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
|
||||
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
|
||||
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
|
||||
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu
|
||||
)
|
||||
|
||||
foreach(GPU_KERNEL ${GPU_LIB_CU})
|
||||
@ -208,6 +227,7 @@ elseif(GPU_API STREQUAL "OPENCL")
|
||||
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
|
||||
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
|
||||
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
|
||||
GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu)
|
||||
|
||||
list(APPEND GPU_LIB_SOURCES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
|
||||
@ -217,14 +237,18 @@ elseif(GPU_API STREQUAL "OPENCL")
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h
|
||||
)
|
||||
|
||||
add_library(gpu STATIC ${GPU_LIB_SOURCES})
|
||||
target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
|
||||
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
|
||||
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
|
||||
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING})
|
||||
if(GPU_DEBUG)
|
||||
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
|
||||
else()
|
||||
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
|
||||
endif()
|
||||
target_link_libraries(lammps PRIVATE gpu)
|
||||
|
||||
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
|
||||
@ -374,8 +398,12 @@ elseif(GPU_API STREQUAL "HIP")
|
||||
|
||||
add_library(gpu STATIC ${GPU_LIB_SOURCES})
|
||||
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
|
||||
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_HIP)
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING})
|
||||
if(GPU_DEBUG)
|
||||
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
|
||||
else()
|
||||
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
|
||||
endif()
|
||||
target_link_libraries(gpu PRIVATE hip::host)
|
||||
|
||||
if(HIP_USE_DEVICE_SORT)
|
||||
|
||||
@ -144,6 +144,7 @@ if(PKG_ML-IAP)
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_descriptor_so3_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_model_linear_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_model_python_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_unified_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_so3_kokkos.cpp)
|
||||
|
||||
# Add KOKKOS version of ML-IAP Python coupling if non-KOKKOS version is included
|
||||
|
||||
@ -126,10 +126,11 @@ CMake build
|
||||
-D GPU_API=value # value = opencl (default) or cuda or hip
|
||||
-D GPU_PREC=value # precision setting
|
||||
# value = double or mixed (default) or single
|
||||
-D HIP_PATH # path to HIP installation. Must be set if GPU_API=HIP
|
||||
-D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda
|
||||
# value = sm_XX, see below
|
||||
# default is sm_50
|
||||
# value = sm_XX (see below, default is sm_50)
|
||||
-D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers
|
||||
# value = yes or no (default)
|
||||
-D HIP_PATH=value # value = path to HIP installation. Must be set if GPU_API=HIP
|
||||
-D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip
|
||||
# value depends on selected HIP_PLATFORM
|
||||
# default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc
|
||||
|
||||
@ -39,7 +39,7 @@ OPT.
|
||||
* :doc:`agni (o) <pair_agni>`
|
||||
* :doc:`airebo (io) <pair_airebo>`
|
||||
* :doc:`airebo/morse (io) <pair_airebo>`
|
||||
* :doc:`amoeba <pair_amoeba>`
|
||||
* :doc:`amoeba (g) <pair_amoeba>`
|
||||
* :doc:`atm <pair_atm>`
|
||||
* :doc:`awpmd/cut <pair_awpmd>`
|
||||
* :doc:`beck (go) <pair_beck>`
|
||||
@ -126,7 +126,7 @@ OPT.
|
||||
* :doc:`hbond/dreiding/lj (o) <pair_hbond_dreiding>`
|
||||
* :doc:`hbond/dreiding/morse (o) <pair_hbond_dreiding>`
|
||||
* :doc:`hdnnp <pair_hdnnp>`
|
||||
* :doc:`hippo <pair_amoeba>`
|
||||
* :doc:`hippo (g) <pair_amoeba>`
|
||||
* :doc:`ilp/graphene/hbn (t) <pair_ilp_graphene_hbn>`
|
||||
* :doc:`ilp/tmd (t) <pair_ilp_tmd>`
|
||||
* :doc:`kolmogorov/crespi/full <pair_kolmogorov_crespi_full>`
|
||||
@ -200,6 +200,7 @@ OPT.
|
||||
* :doc:`mdpd <pair_mesodpd>`
|
||||
* :doc:`mdpd/rhosum <pair_mesodpd>`
|
||||
* :doc:`meam (k) <pair_meam>`
|
||||
* :doc:`meam/ms (k) <pair_meam>`
|
||||
* :doc:`meam/spline (o) <pair_meam_spline>`
|
||||
* :doc:`meam/sw/spline <pair_meam_sw_spline>`
|
||||
* :doc:`mesocnt <pair_mesocnt>`
|
||||
|
||||
@ -149,6 +149,34 @@ related tasks for each of the partitions, e.g.
|
||||
restart 1000 system_${ibead}.restart1 system_${ibead}.restart2
|
||||
read_restart system_${ibead}.restart2
|
||||
|
||||
Restart, fix_modify, output, run start/stop, minimize info
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
|
||||
This fix writes the state of the Nose/Hoover thermostat over all
|
||||
quasi-beads to :doc:`binary restart files <restart>`. See the
|
||||
:doc:`read_restart <read_restart>` command for info on how to re-specify
|
||||
a fix in an input script that reads a restart file, so that the
|
||||
operation of the fix continues in an uninterrupted fashion.
|
||||
|
||||
None of the :doc:`fix_modify <fix_modify>` options
|
||||
are relevant to this fix.
|
||||
|
||||
This fix computes a global 3-vector, which can be accessed by various
|
||||
:doc:`output commands <Howto_output>`. The three quantities in the
|
||||
global vector are
|
||||
|
||||
#. the total spring energy of the quasi-beads,
|
||||
#. the current temperature of the classical system of ring polymers,
|
||||
#. the current value of the scalar virial estimator for the kinetic
|
||||
energy of the quantum system :ref:`(Herman) <Herman>`.
|
||||
|
||||
The vector values calculated by this fix are "extensive", except for the
|
||||
temperature, which is "intensive".
|
||||
|
||||
No parameter of this fix can be used with the *start/stop* keywords of
|
||||
the :doc:`run <run>` command. This fix is not invoked during
|
||||
:doc:`energy minimization <minimize>`.
|
||||
|
||||
Restrictions
|
||||
""""""""""""
|
||||
|
||||
@ -204,3 +232,8 @@ Path Integrals, McGraw-Hill, New York (1965).
|
||||
|
||||
**(Calhoun)** A. Calhoun, M. Pavese, G. Voth, Chem Phys Letters, 262,
|
||||
415 (1996).
|
||||
|
||||
.. _Herman:
|
||||
|
||||
**(Herman)** M. F. Herman, E. J. Bruskin, B. J. Berne, J Chem Phys, 76, 5150 (1982).
|
||||
|
||||
|
||||
@ -39,6 +39,9 @@ Syntax
|
||||
*masslimit* value = massmin massmax
|
||||
massmin = minimum molecular weight of species to delete
|
||||
massmax = maximum molecular weight of species to delete
|
||||
*delete_rate_limit* value = Nlimit Nsteps
|
||||
Nlimit = maximum number of deletions allowed to occur within interval
|
||||
Nsteps = the interval (number of timesteps) over which to count deletions
|
||||
|
||||
Examples
|
||||
""""""""
|
||||
@ -142,7 +145,13 @@ When using the *masslimit* keyword, each line of the *filedel* file
|
||||
contains the timestep on which deletions occurs, followed by how many
|
||||
of each species are deleted (with quantities preceding chemical
|
||||
formulae). The *specieslist* and *masslimit* keywords cannot both be
|
||||
used in the same *reaxff/species* fix.
|
||||
used in the same *reaxff/species* fix. The *delete_rate_limit*
|
||||
keyword can enforce an upper limit on the overall rate of molecule
|
||||
deletion. The number of deletion occurrences is limited to Nlimit
|
||||
within an interval of Nsteps timesteps. When using the
|
||||
*delete_rate_limit* keyword, no deletions are permitted to occur
|
||||
within the first Nsteps timesteps of the first run (after reading a
|
||||
either a data or restart file).
|
||||
|
||||
----------
|
||||
|
||||
|
||||
@ -732,8 +732,8 @@ choices:
|
||||
|
||||
* Use one of the 4 NPT or NPH styles for the rigid bodies. Use the
|
||||
*dilate* all option so that it will dilate the positions of the
|
||||
*non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
|
||||
*other thermostat) for the non-rigid particles.
|
||||
non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
|
||||
other thermostat) for the non-rigid particles.
|
||||
* Use :doc:`fix npt <fix_nh>` for the group of non-rigid particles. Use
|
||||
the *dilate* all option so that it will dilate the center-of-mass
|
||||
positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT
|
||||
|
||||
@ -1,11 +1,18 @@
|
||||
.. index:: pair_style amoeba
|
||||
.. index:: pair_style amoeba/gpu
|
||||
.. index:: pair_style hippo
|
||||
.. index:: pair_style hippo/gpu
|
||||
|
||||
pair_style amoeba command
|
||||
=========================
|
||||
|
||||
Accelerator Variants: *amoeba/gpu*
|
||||
|
||||
pair_style hippo command
|
||||
========================
|
||||
|
||||
Accelerator Variants: *hippo/gpu*
|
||||
|
||||
Syntax
|
||||
""""""
|
||||
|
||||
@ -127,6 +134,10 @@ version discussed in :ref:`(Ponder) <amoeba-Ponder>`, :ref:`(Ren)
|
||||
implementation of HIPPO in LAMMPS matches the version discussed in
|
||||
:ref:`(Rackers) <amoeba-Rackers>`.
|
||||
|
||||
.. versionadded:: TBD
|
||||
|
||||
Accelerator support via the GPU package is available.
|
||||
|
||||
----------
|
||||
|
||||
Only a single pair_coeff command is used with either the *amoeba* and
|
||||
@ -187,6 +198,19 @@ These pair styles can only be used via the *pair* keyword of the
|
||||
|
||||
----------
|
||||
|
||||
.. include:: accel_styles.rst
|
||||
|
||||
.. note::
|
||||
|
||||
Using the GPU accelerated pair styles 'amoeba/gpu' or 'hippo/gpu'
|
||||
when compiling the GPU package for OpenCL has a few known issues
|
||||
when running on integrated GPUs and the calculation may crash.
|
||||
|
||||
The GPU accelerated pair styles are also not (yet) compatible
|
||||
with single precision FFTs.
|
||||
|
||||
----------
|
||||
|
||||
Restrictions
|
||||
""""""""""""
|
||||
|
||||
|
||||
@ -1,17 +1,26 @@
|
||||
.. index:: pair_style meam
|
||||
.. index:: pair_style meam/kk
|
||||
.. index:: pair_style meam/ms
|
||||
.. index:: pair_style meam/ms/kk
|
||||
|
||||
pair_style meam command
|
||||
=========================
|
||||
|
||||
Accelerator Variants: *meam/kk*
|
||||
|
||||
pair_style meam/ms command
|
||||
==========================
|
||||
|
||||
Accelerator Variants: *meam/ms/kk*
|
||||
|
||||
Syntax
|
||||
""""""
|
||||
|
||||
.. code-block:: LAMMPS
|
||||
|
||||
pair_style meam
|
||||
pair_style style
|
||||
|
||||
* style = *meam* or *meam/ms*
|
||||
|
||||
Examples
|
||||
""""""""
|
||||
@ -22,6 +31,9 @@ Examples
|
||||
pair_coeff * * ../potentials/library.meam Si ../potentials/si.meam Si
|
||||
pair_coeff * * ../potentials/library.meam Ni Al NULL Ni Al Ni Ni
|
||||
|
||||
pair_style meam/ms
|
||||
pair_coeff * * ../potentials/library.msmeam H Ga ../potentials/HGa.meam H Ga
|
||||
|
||||
Description
|
||||
"""""""""""
|
||||
|
||||
@ -31,16 +43,23 @@ Description
|
||||
as of November 2010; see description below of the mixture_ref_t
|
||||
parameter
|
||||
|
||||
Pair style *meam* computes non-bonded interactions for a variety of materials
|
||||
using the modified embedded-atom method (MEAM)
|
||||
:ref:`(Baskes) <Baskes>`. Conceptually, it is an extension to the original
|
||||
:doc:`EAM method <pair_eam>` which adds angular forces. It is
|
||||
thus suitable for modeling metals and alloys with fcc, bcc, hcp and
|
||||
diamond cubic structures, as well as materials with covalent interactions
|
||||
like silicon and carbon. This *meam* pair style is a translation of the
|
||||
original Fortran version to C++. It is functionally equivalent but more
|
||||
efficient and has additional features. The Fortran version of the *meam*
|
||||
pair style has been removed from LAMMPS after the 12 December 2018 release.
|
||||
Pair style *meam* computes non-bonded interactions for a variety of
|
||||
materials using the modified embedded-atom method (MEAM) :ref:`(Baskes)
|
||||
<Baskes>`. Conceptually, it is an extension to the original :doc:`EAM
|
||||
method <pair_eam>` which adds angular forces. It is thus suitable for
|
||||
modeling metals and alloys with fcc, bcc, hcp and diamond cubic
|
||||
structures, as well as materials with covalent interactions like silicon
|
||||
and carbon.
|
||||
|
||||
The *meam* pair style is a translation of the original Fortran version
|
||||
to C++. It is functionally equivalent but more efficient and has
|
||||
additional features. The Fortran version of the *meam* pair style has
|
||||
been removed from LAMMPS after the 12 December 2018 release.
|
||||
|
||||
Pair style *meam/ms* uses the multi-state MEAM (MS-MEAM) method
|
||||
according to :ref:`(Baskes2) <Baskes2>`, which is an extension to MEAM.
|
||||
This pair style is mostly equivalent to *meam* and differs only
|
||||
where noted in the documentation below.
|
||||
|
||||
In the MEAM formulation, the total energy E of a system of atoms is
|
||||
given by:
|
||||
@ -351,6 +370,16 @@ Most published MEAM parameter sets use the default values *attrac* = *repulse* =
|
||||
Setting *repuls* = *attrac* = *delta* corresponds to the form used in several
|
||||
recent published MEAM parameter sets, such as :ref:`(Valone) <Valone>`
|
||||
|
||||
Then using *meam/ms* pair style the multi-state MEAM (MS-MEAM) method is
|
||||
activated. This requires 6 extra parameters in the MEAM library file,
|
||||
resulting in 25 parameters ordered that are ordered like this:
|
||||
|
||||
elt, lat, z, ielement, atwt, alpha, b0, b1, b2, b3, b1m, b2m, b3m, alat, esub, asub,
|
||||
t0, t1, t2, t3, t1m, t2m, t3m, rozero, ibar
|
||||
|
||||
The 6 extra MS-MEAM parameters are *b1m, b2m, b3m, t1m, t2m, t3m*.
|
||||
In the LAMMPS ``potentials`` folder, compatible files have an ".msmeam" extension.
|
||||
|
||||
----------
|
||||
|
||||
.. include:: accel_styles.rst
|
||||
@ -393,16 +422,15 @@ This pair style can only be used via the *pair* keyword of the
|
||||
Restrictions
|
||||
""""""""""""
|
||||
|
||||
The *meam* style is provided in the MEAM package. It is
|
||||
only enabled if LAMMPS was built with that package.
|
||||
The *meam* and *meam/ms* pair styles are provided in the MEAM
|
||||
package. They are only enabled if LAMMPS was built with that package.
|
||||
See the :doc:`Build package <Build_package>` page for more info.
|
||||
|
||||
The maximum number of elements, that can be read from the MEAM
|
||||
library file, is determined at compile time. The default is 5.
|
||||
If you need support for more elements, you have to change the
|
||||
define for the constant 'maxelt' at the beginning of the file
|
||||
src/MEAM/meam.h and update/recompile LAMMPS. There is no
|
||||
limit on the number of atoms types.
|
||||
The maximum number of elements, that can be read from the MEAM library
|
||||
file, is determined at compile time. The default is 5. If you need
|
||||
support for more elements, you have to change the the constant 'maxelt'
|
||||
at the beginning of the file ``src/MEAM/meam.h`` and update/recompile
|
||||
LAMMPS. There is no limit on the number of atoms types.
|
||||
|
||||
Related commands
|
||||
""""""""""""""""
|
||||
@ -421,6 +449,10 @@ none
|
||||
|
||||
**(Baskes)** Baskes, Phys Rev B, 46, 2727-2742 (1992).
|
||||
|
||||
.. _Baskes2:
|
||||
|
||||
**(Baskes2)** Baskes, Phys Rev B, 75, 094113 (2007).
|
||||
|
||||
.. _Gullet:
|
||||
|
||||
**(Gullet)** Gullet, Wagner, Slepoy, SANDIA Report 2003-8782 (2003). DOI:10.2172/918395
|
||||
|
||||
@ -277,7 +277,8 @@ accelerated styles exist.
|
||||
* :doc:`lubricateU/poly <pair_lubricateU>` - hydrodynamic lubrication forces for Fast Lubrication with polydispersity
|
||||
* :doc:`mdpd <pair_mesodpd>` - mDPD particle interactions
|
||||
* :doc:`mdpd/rhosum <pair_mesodpd>` - mDPD particle interactions for mass density
|
||||
* :doc:`meam <pair_meam>` - modified embedded atom method (MEAM) in C
|
||||
* :doc:`meam <pair_meam>` - modified embedded atom method (MEAM)
|
||||
* :doc:`meam/ms <pair_meam>` - multi-state modified embedded atom method (MS-MEAM)
|
||||
* :doc:`meam/spline <pair_meam_spline>` - splined version of MEAM
|
||||
* :doc:`meam/sw/spline <pair_meam_sw_spline>` - splined version of MEAM with a Stillinger-Weber term
|
||||
* :doc:`mesocnt <pair_mesocnt>` - mesoscopic vdW potential for (carbon) nanotubes
|
||||
|
||||
30
examples/meam/msmeam/HGa.meam
Normal file
30
examples/meam/msmeam/HGa.meam
Normal file
@ -0,0 +1,30 @@
|
||||
bkgd_dyn = 1
|
||||
emb_lin_neg = 1
|
||||
augt1=0
|
||||
ialloy=1
|
||||
rc = 5.9
|
||||
#H
|
||||
attrac(1,1)=0.460
|
||||
repuls(1,1)=0.460
|
||||
Cmin(1,1,1)=1.3 # PuMS
|
||||
Cmax(1,1,1)= 2.80
|
||||
nn2(1,1)=1
|
||||
#Ga
|
||||
rho0(2) = 0.6
|
||||
attrac(2,2)=0.097
|
||||
repuls(2,2)=0.097
|
||||
nn2(2,2)=1
|
||||
#HGa
|
||||
attrac(1,2)=0.300
|
||||
repuls(1,2)=0.300
|
||||
lattce(1,2)=l12
|
||||
re(1,2)=3.19
|
||||
delta(1,2)=-0.48
|
||||
alpha(1,2)=6.6
|
||||
Cmin(1,1,2)=2.0
|
||||
Cmin(2,1,2)= 2.0
|
||||
Cmin(1,2,1)=2.0
|
||||
Cmin(2,2,1) = 1.4
|
||||
Cmin(1,2,2) = 1.4
|
||||
Cmin(1,1,2) = 1.4
|
||||
nn2(1,2)=1
|
||||
9
examples/meam/msmeam/README.md
Normal file
9
examples/meam/msmeam/README.md
Normal file
@ -0,0 +1,9 @@
|
||||
To run Baske's test, do
|
||||
|
||||
lmp -in in.msmeam
|
||||
|
||||
Then
|
||||
|
||||
diff dump.msmeam dump.msmeam.bu
|
||||
|
||||
|
||||
25
examples/meam/msmeam/data.msmeam.bu
Normal file
25
examples/meam/msmeam/data.msmeam.bu
Normal file
@ -0,0 +1,25 @@
|
||||
LAMMPS data file via write_data, version 16 Feb 2016, timestep = 1
|
||||
|
||||
3 atoms
|
||||
2 atom types
|
||||
|
||||
-4.0000000000000000e+00 4.0000000000000000e+00 xlo xhi
|
||||
-4.0000000000000000e+00 4.0000000000000000e+00 ylo yhi
|
||||
-4.0000000000000000e+00 4.0000000000000000e+00 zlo zhi
|
||||
|
||||
Masses
|
||||
|
||||
1 1.0079
|
||||
2 69.723
|
||||
|
||||
Atoms # atomic
|
||||
|
||||
1 1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0
|
||||
2 2 2.2000000000000002e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0
|
||||
3 2 2.9999999999999999e-01 2.2999999999999998e+00 0.0000000000000000e+00 0 0 0
|
||||
|
||||
Velocities
|
||||
|
||||
1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
|
||||
2 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
|
||||
3 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
|
||||
24
examples/meam/msmeam/dump.msmeam.bu
Normal file
24
examples/meam/msmeam/dump.msmeam.bu
Normal file
@ -0,0 +1,24 @@
|
||||
ITEM: TIMESTEP
|
||||
0
|
||||
ITEM: NUMBER OF ATOMS
|
||||
3
|
||||
ITEM: BOX BOUNDS pp pp pp
|
||||
-4 4
|
||||
-4 4
|
||||
-4 4
|
||||
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
|
||||
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
|
||||
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
|
||||
ITEM: TIMESTEP
|
||||
1
|
||||
ITEM: NUMBER OF ATOMS
|
||||
3
|
||||
ITEM: BOX BOUNDS pp pp pp
|
||||
-4 4
|
||||
-4 4
|
||||
-4 4
|
||||
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
|
||||
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
|
||||
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
|
||||
31
examples/meam/msmeam/in.msmeam
Normal file
31
examples/meam/msmeam/in.msmeam
Normal file
@ -0,0 +1,31 @@
|
||||
echo both
|
||||
log log.msmeam
|
||||
# Test of MEAM potential for HGa
|
||||
|
||||
# ------------------------ INITIALIZATION ----------------------------
|
||||
units metal
|
||||
dimension 3
|
||||
boundary p p p
|
||||
atom_style atomic
|
||||
variable latparam equal 4.646
|
||||
variable ncell equal 3
|
||||
|
||||
# ----------------------- ATOM DEFINITION ----------------------------
|
||||
region box block -4 4 -4 4 -4 4
|
||||
create_box 2 box
|
||||
|
||||
#
|
||||
|
||||
include potential.mod
|
||||
create_atoms 1 single 0 0 0 units box
|
||||
create_atoms 2 single 2.2 0 0 units box
|
||||
create_atoms 2 single 0.3 2.3 0 units box
|
||||
# ---------- Define Settings ---------------------
|
||||
variable teng equal "c_eatoms"
|
||||
compute pot_energy all pe/atom
|
||||
compute stress all stress/atom NULL
|
||||
dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
run 1
|
||||
write_data data.msmeam
|
||||
|
||||
print "All done!"
|
||||
14
examples/meam/msmeam/library.msmeam
Normal file
14
examples/meam/msmeam/library.msmeam
Normal file
@ -0,0 +1,14 @@
|
||||
# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010
|
||||
# ms-meam data format May 2010
|
||||
# elt lat z ielement atwt
|
||||
# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub
|
||||
# - t0 t1 t2 t3 t1m t2m t3m rozero ibar
|
||||
# NOTE: leading character cannot be a space
|
||||
|
||||
'H' 'dim' 1.0 1 1.0079
|
||||
2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50
|
||||
1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0
|
||||
|
||||
'Ga4' 'fcc' 12.0 31 69.723
|
||||
4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97
|
||||
1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0
|
||||
107
examples/meam/msmeam/log.msmeam.bu
Normal file
107
examples/meam/msmeam/log.msmeam.bu
Normal file
@ -0,0 +1,107 @@
|
||||
# Test of MEAM potential for HGa
|
||||
|
||||
# ------------------------ INITIALIZATION ----------------------------
|
||||
units metal
|
||||
dimension 3
|
||||
boundary p p p
|
||||
atom_style atomic
|
||||
variable latparam equal 4.646
|
||||
variable ncell equal 3
|
||||
|
||||
# ----------------------- ATOM DEFINITION ----------------------------
|
||||
region box block -4 4 -4 4 -4 4
|
||||
create_box 2 box
|
||||
Created orthogonal box = (-4 -4 -4) to (4 4 4)
|
||||
1 by 1 by 1 MPI processor grid
|
||||
|
||||
#
|
||||
|
||||
include potential.mod
|
||||
# NOTE: This script can be modified for different pair styles
|
||||
# See in.elastic for more info.
|
||||
|
||||
variable Pu string H
|
||||
print "potential chosen ${Pu}"
|
||||
potential chosen H
|
||||
# Choose potential
|
||||
pair_style MSmeam
|
||||
print "we just executed"
|
||||
we just executed
|
||||
|
||||
pair_coeff * * library.MSmeam ${Pu} Ga4 HGaMS.meam ${Pu} Ga4
|
||||
pair_coeff * * library.MSmeam H Ga4 HGaMS.meam ${Pu} Ga4
|
||||
pair_coeff * * library.MSmeam H Ga4 HGaMS.meam H Ga4
|
||||
Reading potential file library.MSmeam with DATE: 2018-09-22
|
||||
# Setup neighbor style
|
||||
neighbor 1.0 nsq
|
||||
neigh_modify once no every 1 delay 0 check yes
|
||||
|
||||
# Setup minimization style
|
||||
variable dmax equal 1.0e-2
|
||||
min_style cg
|
||||
min_modify dmax ${dmax} line quadratic
|
||||
min_modify dmax 0.01 line quadratic
|
||||
compute eng all pe/atom
|
||||
compute eatoms all reduce sum c_eng
|
||||
|
||||
# Setup output
|
||||
thermo 100
|
||||
thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms
|
||||
thermo_modify norm yes
|
||||
create_atoms 1 single 0 0 0 units box
|
||||
Created 1 atoms
|
||||
create_atoms 2 single 2.2 0 0 units box
|
||||
Created 1 atoms
|
||||
create_atoms 2 single 0.3 2.3 0 units box
|
||||
Created 1 atoms
|
||||
# ---------- Define Settings ---------------------
|
||||
variable teng equal "c_eatoms"
|
||||
compute pot_energy all pe/atom
|
||||
compute stress all stress/atom NULL
|
||||
dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
run 1
|
||||
WARNING: No fixes defined, atoms won't move (../verlet.cpp:55)
|
||||
Neighbor list info ...
|
||||
2 neighbor list requests
|
||||
update every 1 steps, delay 0 steps, check yes
|
||||
max neighbors/atom: 2000, page size: 100000
|
||||
master list distance cutoff = 6.9
|
||||
ghost atom cutoff = 6.9
|
||||
Memory usage per processor = 12.9295 Mbytes
|
||||
Step Temp TotEng Press Pxx Pyy Pzz Pxy Pxz Pyz Lx Ly Lz Volume eatoms
|
||||
0 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079
|
||||
1 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079
|
||||
Loop time of 0.000172138 on 1 procs for 1 steps with 3 atoms
|
||||
|
||||
Performance: 501.922 ns/day, 0.048 hours/ns, 5809.285 timesteps/s
|
||||
81.3% CPU use with 1 MPI tasks x no OpenMP threads
|
||||
|
||||
MPI task timing breakdown:
|
||||
Section | min time | avg time | max time |%varavg| %total
|
||||
---------------------------------------------------------------
|
||||
Pair | 6.6996e-05 | 6.6996e-05 | 6.6996e-05 | 0.0 | 38.92
|
||||
Neigh | 0 | 0 | 0 | 0.0 | 0.00
|
||||
Comm | 1.9073e-06 | 1.9073e-06 | 1.9073e-06 | 0.0 | 1.11
|
||||
Output | 9.7036e-05 | 9.7036e-05 | 9.7036e-05 | 0.0 | 56.37
|
||||
Modify | 0 | 0 | 0 | 0.0 | 0.00
|
||||
Other | | 6.199e-06 | | | 3.60
|
||||
|
||||
Nlocal: 3 ave 3 max 3 min
|
||||
Histogram: 1 0 0 0 0 0 0 0 0 0
|
||||
Nghost: 78 ave 78 max 78 min
|
||||
Histogram: 1 0 0 0 0 0 0 0 0 0
|
||||
Neighs: 7 ave 7 max 7 min
|
||||
Histogram: 1 0 0 0 0 0 0 0 0 0
|
||||
FullNghs: 14 ave 14 max 14 min
|
||||
Histogram: 1 0 0 0 0 0 0 0 0 0
|
||||
|
||||
Total # of neighbors = 14
|
||||
Ave neighs/atom = 4.66667
|
||||
Neighbor list builds = 0
|
||||
Dangerous builds = 0
|
||||
write_data data.msmeam
|
||||
|
||||
print "All done!"
|
||||
All done!
|
||||
Total wall time: 0:00:00
|
||||
|
||||
24
examples/meam/msmeam/msmeam.dump.bu
Normal file
24
examples/meam/msmeam/msmeam.dump.bu
Normal file
@ -0,0 +1,24 @@
|
||||
ITEM: TIMESTEP
|
||||
0
|
||||
ITEM: NUMBER OF ATOMS
|
||||
3
|
||||
ITEM: BOX BOUNDS pp pp pp
|
||||
-4 4
|
||||
-4 4
|
||||
-4 4
|
||||
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
|
||||
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
|
||||
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
|
||||
ITEM: TIMESTEP
|
||||
1
|
||||
ITEM: NUMBER OF ATOMS
|
||||
3
|
||||
ITEM: BOX BOUNDS pp pp pp
|
||||
-4 4
|
||||
-4 4
|
||||
-4 4
|
||||
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
|
||||
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
|
||||
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
|
||||
25
examples/meam/msmeam/potential.mod
Normal file
25
examples/meam/msmeam/potential.mod
Normal file
@ -0,0 +1,25 @@
|
||||
# NOTE: This script can be modified for different pair styles
|
||||
# See in.elastic for more info.
|
||||
|
||||
variable Pu string H
|
||||
print "potential chosen ${Pu}"
|
||||
# Choose potential
|
||||
pair_style meam/ms
|
||||
print "we just executed"
|
||||
|
||||
pair_coeff * * library.msmeam ${Pu} Ga4 HGa.meam ${Pu} Ga4
|
||||
# Setup neighbor style
|
||||
neighbor 1.0 bin
|
||||
neigh_modify once no every 1 delay 0 check yes
|
||||
|
||||
# Setup minimization style
|
||||
variable dmax equal 1.0e-2
|
||||
min_style cg
|
||||
min_modify dmax ${dmax} line quadratic
|
||||
compute eng all pe/atom
|
||||
compute eatoms all reduce sum c_eng
|
||||
|
||||
# Setup output
|
||||
thermo 100
|
||||
thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms
|
||||
thermo_modify norm yes
|
||||
@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda
|
||||
endif
|
||||
|
||||
gpu_SYSINC =
|
||||
gpu_SYSLIB = -lcudart -lcuda
|
||||
gpu_SYSLIB = -lcudart -lcuda -lcufft
|
||||
gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs
|
||||
|
||||
|
||||
@ -1,8 +1,16 @@
|
||||
# Common headers for kernels
|
||||
PRE1_H = lal_preprocessor.h lal_aux_fun1.h
|
||||
|
||||
# Headers for Geryon
|
||||
UCL_H = $(wildcard ./geryon/ucl*.h)
|
||||
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \
|
||||
lal_pre_cuda_hip.h
|
||||
ALL_H = $(NVD_H) $(wildcard ./lal_*.h)
|
||||
|
||||
# Headers for Host files
|
||||
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
|
||||
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
|
||||
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
|
||||
lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H)
|
||||
|
||||
# Source files
|
||||
SRCS := $(wildcard ./lal_*.cpp)
|
||||
@ -54,13 +62,40 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
|
||||
$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
|
||||
$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
|
||||
|
||||
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
|
||||
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
|
||||
$(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu
|
||||
$(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@
|
||||
|
||||
# host code compilation
|
||||
|
||||
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
|
||||
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
#ifdef CUDPP_OPT
|
||||
@ -77,10 +112,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
|
||||
$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
|
||||
|
||||
$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
|
||||
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
|
||||
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini
|
||||
|
||||
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
|
||||
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
|
||||
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini
|
||||
#endif
|
||||
|
||||
# build libgpu.a
|
||||
|
||||
@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h)
|
||||
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h
|
||||
|
||||
# Headers for Host files
|
||||
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \
|
||||
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
|
||||
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
|
||||
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
|
||||
lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H)
|
||||
@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.
|
||||
$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h
|
||||
$(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h;
|
||||
|
||||
$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h
|
||||
$(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h;
|
||||
|
||||
$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
|
||||
$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;
|
||||
|
||||
|
||||
@ -26,6 +26,9 @@
|
||||
#ifdef UCL_DEBUG
|
||||
#define UCL_SYNC_DEBUG
|
||||
#define UCL_DESTRUCT_CHECK
|
||||
#define UCL_DEBUG_ARG(arg) arg
|
||||
#else
|
||||
#define UCL_DEBUG_ARG(arg)
|
||||
#endif
|
||||
|
||||
#ifndef UCL_NO_API_CHECK
|
||||
|
||||
@ -33,6 +33,9 @@
|
||||
#ifdef UCL_DEBUG
|
||||
#define UCL_SYNC_DEBUG
|
||||
#define UCL_DESTRUCT_CHECK
|
||||
#define UCL_DEBUG_ARG(arg) arg
|
||||
#else
|
||||
#define UCL_DEBUG_ARG(arg)
|
||||
#endif
|
||||
|
||||
#ifndef UCL_NO_API_CHECK
|
||||
|
||||
@ -309,15 +309,14 @@ class UCL_Device {
|
||||
/// Return the maximum memory pitch in bytes for current device
|
||||
inline size_t max_pitch() { return max_pitch(_device); }
|
||||
/// Return the maximum memory pitch in bytes
|
||||
inline size_t max_pitch(const int i) { return 0; }
|
||||
inline size_t max_pitch(const int) { return 0; }
|
||||
|
||||
/// Returns false if accelerator cannot be shared by multiple processes
|
||||
/** If it cannot be determined, true is returned **/
|
||||
inline bool sharing_supported() { return sharing_supported(_device); }
|
||||
/// Returns false if accelerator cannot be shared by multiple processes
|
||||
/** If it cannot be determined, true is returned **/
|
||||
inline bool sharing_supported(const int i)
|
||||
{ return true; }
|
||||
inline bool sharing_supported(const int) { return true; }
|
||||
|
||||
/// True if the device is a sub-device
|
||||
inline bool is_subdevice()
|
||||
|
||||
@ -33,6 +33,9 @@
|
||||
#ifdef UCL_DEBUG
|
||||
#define UCL_SYNC_DEBUG
|
||||
#define UCL_DESTRUCT_CHECK
|
||||
#define UCL_DEBUG_ARG(arg) arg
|
||||
#else
|
||||
#define UCL_DEBUG_ARG(arg)
|
||||
#endif
|
||||
|
||||
#ifndef UCL_NO_API_CHECK
|
||||
|
||||
@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
|
||||
|
||||
template <class mat_type>
|
||||
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){
|
||||
cl_mem_flags buffer_perm;
|
||||
cl_map_flags map_perm;
|
||||
if (kind==UCL_READ_ONLY) {
|
||||
@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> {
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
||||
cl_command_queue &cq, const cl_bool block,
|
||||
const size_t dst_offset, const size_t src_offset) {
|
||||
const size_t /*dst_offset*/, const size_t src_offset) {
|
||||
if (src.cbegin()==dst.cbegin()) {
|
||||
#ifdef UCL_DBG_MEM_TRACE
|
||||
std::cerr << "UCL_COPY 1S\n";
|
||||
@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> {
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
||||
cl_command_queue &cq, const cl_bool block,
|
||||
const size_t dst_offset, const size_t src_offset) {
|
||||
const size_t dst_offset, const size_t /*src_offset*/) {
|
||||
if (src.cbegin()==dst.cbegin()) {
|
||||
if (block) ucl_sync(cq);
|
||||
#ifdef UCL_DBG_MEM_TRACE
|
||||
|
||||
@ -35,19 +35,19 @@ class UCL_Texture {
|
||||
UCL_Texture() {}
|
||||
~UCL_Texture() {}
|
||||
/// Construct with a specified texture reference
|
||||
inline UCL_Texture(UCL_Program &prog, const char *texture_name) { }
|
||||
inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
|
||||
/// Set the texture reference for this object
|
||||
inline void get_texture(UCL_Program &prog, const char *texture_name) { }
|
||||
inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class mat_typ>
|
||||
inline void bind_float(mat_typ &vec, const unsigned numel) { }
|
||||
inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { }
|
||||
|
||||
/// Unbind the texture reference from the memory allocation
|
||||
inline void unbind() { }
|
||||
|
||||
/// Make a texture reference available to kernel
|
||||
inline void allow(UCL_Kernel &kernel) { }
|
||||
inline void allow(UCL_Kernel & /*kernel*/) { }
|
||||
|
||||
private:
|
||||
friend class UCL_Kernel;
|
||||
@ -62,7 +62,7 @@ class UCL_Const {
|
||||
inline UCL_Const(UCL_Program &prog, const char *global_name)
|
||||
{ get_global(prog,global_name); }
|
||||
/// Set the global reference for this object
|
||||
inline void get_global(UCL_Program &prog, const char *global_name) {
|
||||
inline void get_global(UCL_Program &prog, const char * /*global_name*/) {
|
||||
if (_active) {
|
||||
CL_DESTRUCT_CALL(clReleaseContext(_context));
|
||||
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
|
||||
|
||||
@ -71,7 +71,7 @@ class UCL_Timer {
|
||||
inline void init(UCL_Device &dev) { init(dev,dev.cq()); }
|
||||
|
||||
/// Initialize command queue for timing
|
||||
inline void init(UCL_Device &dev, command_queue &cq) {
|
||||
inline void init(UCL_Device & /*dev*/, command_queue &cq) {
|
||||
clear();
|
||||
_cq=cq;
|
||||
clRetainCommandQueue(_cq);
|
||||
|
||||
@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> {
|
||||
// Should never be here
|
||||
template <int host_t1, int host_t2> struct _host_host_copy {
|
||||
template <class mat1, class mat2>
|
||||
static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
|
||||
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2>
|
||||
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols) {
|
||||
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
};
|
||||
@ -470,24 +469,22 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
|
||||
// Neither on host or both on host
|
||||
template <> struct _ucl_cast_copy<1,1> {
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer, command_queue &cq) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
|
||||
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
|
||||
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
command_queue &cq) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
|
||||
const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
};
|
||||
@ -495,24 +492,22 @@ template <> struct _ucl_cast_copy<1,1> {
|
||||
// Neither on host or both on host
|
||||
template <> struct _ucl_cast_copy<0,0> {
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer, command_queue &cq) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
|
||||
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
|
||||
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
command_queue &cq) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
|
||||
const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
};
|
||||
|
||||
@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
|
||||
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
#endif
|
||||
@ -230,8 +230,8 @@ class UCL_D_Vec : public UCL_BaseMat {
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
const size_t cols) {
|
||||
inline void view_offset(const size_t offset,ucl_type &input,
|
||||
const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
#endif
|
||||
|
||||
@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
|
||||
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
#endif
|
||||
@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type *input, const size_t rows, const size_t cols,
|
||||
inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols,
|
||||
UCL_Device &dev) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows),
|
||||
const size_t cols) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
|
||||
@ -27,7 +27,7 @@ template <int st> struct _ucl_s_obj_help;
|
||||
// -- Can potentially use same memory if shared by accelerator
|
||||
template <> struct _ucl_s_obj_help<1> {
|
||||
template <class t1, class t2, class t3>
|
||||
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||
static inline int alloc(t1 &host, t2 &device, t3 & /*_buffer*/,
|
||||
const int cols, UCL_Device &acc,
|
||||
const enum UCL_MEMOPT kind1,
|
||||
const enum UCL_MEMOPT kind2) {
|
||||
@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> {
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
|
||||
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) {
|
||||
ucl_copy(dst,src,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
|
||||
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) {
|
||||
ucl_copy(dst,src,cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
const bool async) {
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) {
|
||||
ucl_copy(dst,src,cols,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
command_queue &cq) {
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) {
|
||||
ucl_copy(dst,src,cols,cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
t3 &buffer, const bool async) {
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) {
|
||||
ucl_copy(dst,src,rows,cols,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
t3 &buffer, command_queue &cq) {
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) {
|
||||
ucl_copy(dst,src,rows,cols,cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) {
|
||||
if (device.kind()==UCL_VIEW) {
|
||||
device.view(host);
|
||||
return UCL_SUCCESS;
|
||||
@ -353,7 +349,7 @@ template <int st> struct _ucl_s_obj_help {
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
|
||||
static inline int dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) {
|
||||
int err=buff.resize(cols);
|
||||
if (err!=UCL_SUCCESS)
|
||||
return err;
|
||||
|
||||
322
lib/gpu/lal_amoeba.cpp
Normal file
322
lib/gpu/lal_amoeba.cpp
Normal file
@ -0,0 +1,322 @@
|
||||
/***************************************************************************
|
||||
amoeba.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the amoeba pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "amoeba_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *amoeba=0;
|
||||
#else
|
||||
#include "amoeba_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_amoeba.h"
|
||||
#include <cassert>
|
||||
namespace LAMMPS_AL {
|
||||
#define AmoebaT Amoeba<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
AmoebaT::Amoeba() : BaseAmoeba<numtyp,acctyp>(),
|
||||
_allocated(false) {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
AmoebaT::~Amoeba() {
|
||||
clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_dirdamp, const int *host_amtype2class,
|
||||
const double *host_special_hal,
|
||||
const double * /*host_special_repel*/,
|
||||
const double * /*host_special_disp*/,
|
||||
const double *host_special_mpole,
|
||||
const double * /*host_special_polar_wscale*/,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const double *host_csix, const double *host_adisp,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15,
|
||||
const double cell_size, const double gpu_split, FILE *_screen,
|
||||
const double polar_dscale, const double polar_uscale) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||
cell_size,gpu_split,_screen,amoeba,
|
||||
"k_amoeba_multipole", "k_amoeba_udirect2b",
|
||||
"k_amoeba_umutual2b", "k_amoeba_polar",
|
||||
"k_amoeba_fphi_uind", "k_amoeba_fphi_mpole",
|
||||
"k_amoeba_short_nbor", "k_amoeba_special15");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
int lj_types=ntypes;
|
||||
shared_types=false;
|
||||
int max_shared_types=this->device->max_shared_types();
|
||||
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||
lj_types=max_shared_types;
|
||||
shared_types=true;
|
||||
}
|
||||
_lj_types=lj_types;
|
||||
|
||||
// Allocate a host write buffer for data initialization
|
||||
|
||||
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
for (int i = 0; i < max_amtype; i++) {
|
||||
host_write[i].x = host_pdamp[i];
|
||||
host_write[i].y = host_thole[i];
|
||||
host_write[i].z = host_dirdamp[i];
|
||||
host_write[i].w = host_amtype2class[i];
|
||||
}
|
||||
|
||||
coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_amtype,host_write,false);
|
||||
|
||||
UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
for (int i = 0; i < max_amclass; i++) {
|
||||
host_write2[i].x = host_csix[i];
|
||||
host_write2[i].y = host_adisp[i];
|
||||
host_write2[i].z = (numtyp)0;
|
||||
host_write2[i].w = (numtyp)0;
|
||||
}
|
||||
|
||||
coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_amclass,host_write2,false);
|
||||
|
||||
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<5; i++) {
|
||||
dview[i].x=host_special_hal[i];
|
||||
dview[i].y=host_special_polar_piscale[i];
|
||||
dview[i].z=host_special_polar_pscale[i];
|
||||
dview[i].w=host_special_mpole[i];
|
||||
}
|
||||
ucl_copy(sp_amoeba,dview,5,false);
|
||||
|
||||
_polar_dscale = polar_dscale;
|
||||
_polar_uscale = polar_uscale;
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes()
|
||||
+ sp_amoeba.row_bytes() + this->_tep.row_bytes()
|
||||
+ this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
|
||||
+ this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
|
||||
+ this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AmoebaT::clear() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
coeff_amtype.clear();
|
||||
coeff_amclass.clear();
|
||||
sp_amoeba.clear();
|
||||
|
||||
this->clear_atomic();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double AmoebaT::host_memory_usage() const {
|
||||
return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate the multipole real-space term, returning tep
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::multipole_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff off2_mpole,
|
||||
// at this point mpole is the first kernel in a time step for AMOEBA
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_mpole, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
this->k_multipole.set_size(GX,BX);
|
||||
this->k_multipole.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &sp_amoeba,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the real-space permanent field kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff _off2_polar, if not done yet
|
||||
// this is the first kernel in a time step where _off2_polar is used
|
||||
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_polar, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_udirect2b.set_size(GX,BX);
|
||||
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
|
||||
&_polar_dscale, &_polar_uscale);
|
||||
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the real-space induced field kernel, returning field and fieldp
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list if not done yet
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->dev_short_nbor,
|
||||
&this->_off2_polar, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_umutual2b.set_size(GX,BX);
|
||||
this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_aewald,
|
||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the polar real-space kernel, returning tep
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::polar_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
|
||||
const int BX=this->block_size();
|
||||
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
|
||||
/*
|
||||
const int cus = this->device->gpu->cus();
|
||||
while (GX < cus && GX > 1) {
|
||||
BX /= 2;
|
||||
GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
|
||||
}
|
||||
*/
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list if not done yet
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_polar, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_polar.set_size(GX,BX);
|
||||
this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
|
||||
// Signal that short nbor list is not avail for the next time step
|
||||
// do it here because polar_real() is the last kernel in a time step at this point
|
||||
|
||||
this->short_nbor_polar_avail = false;
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
template class Amoeba<PRECISION,ACC_PRECISION>;
|
||||
}
|
||||
2099
lib/gpu/lal_amoeba.cu
Normal file
2099
lib/gpu/lal_amoeba.cu
Normal file
File diff suppressed because it is too large
Load Diff
100
lib/gpu/lal_amoeba.h
Normal file
100
lib/gpu/lal_amoeba.h
Normal file
@ -0,0 +1,100 @@
|
||||
/***************************************************************************
|
||||
amoeba.h
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the amoeba pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_AMOEBA_H
|
||||
#define LAL_AMOEBA_H
|
||||
|
||||
#include "lal_base_amoeba.h"
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class Amoeba : public BaseAmoeba<numtyp, acctyp> {
|
||||
public:
|
||||
Amoeba();
|
||||
~Amoeba();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_dirdamp, const int *host_amtype2class,
|
||||
const double *host_special_mpole,
|
||||
const double *host_special_hal,
|
||||
const double *host_special_repel,
|
||||
const double *host_special_disp,
|
||||
const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const double *host_csix, const double *host_adisp,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||
const double gpu_split, FILE *_screen,
|
||||
const double polar_dscale, const double polar_uscale);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage() const;
|
||||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
|
||||
/// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
|
||||
UCL_D_Vec<numtyp4> coeff_amtype;
|
||||
/// csix = coeff_amclass.x; adisp = coeff_amclass.y;
|
||||
UCL_D_Vec<numtyp4> coeff_amclass;
|
||||
/// Special amoeba values [0-4]:
|
||||
/// sp_amoeba.x = special_hal
|
||||
/// sp_amoeba.y = special_polar_pscale,
|
||||
/// sp_amoeba.z = special_polar_piscale
|
||||
/// sp_amoeba.w = special_mpole
|
||||
UCL_D_Vec<numtyp4> sp_amoeba;
|
||||
|
||||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _polar_dscale, _polar_uscale;
|
||||
numtyp _qqrd2e;
|
||||
|
||||
protected:
|
||||
bool _allocated;
|
||||
int multipole_real(const int eflag, const int vflag);
|
||||
int udirect2b(const int eflag, const int vflag);
|
||||
int umutual2b(const int eflag, const int vflag);
|
||||
int polar_real(const int eflag, const int vflag);
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
213
lib/gpu/lal_amoeba_ext.cpp
Normal file
213
lib/gpu/lal_amoeba_ext.cpp
Normal file
@ -0,0 +1,213 @@
|
||||
/***************************************************************************
|
||||
amoeba_ext.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Functions for LAMMPS access to amoeba acceleration routines.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
#include "lal_amoeba.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace LAMMPS_AL;
|
||||
|
||||
static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_dirdamp, const int *host_amtype2class,
|
||||
const double *host_special_hal,
|
||||
const double *host_special_repel,
|
||||
const double *host_special_disp,
|
||||
const double *host_special_mpole,
|
||||
const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const double *host_csix, const double *host_adisp,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15,
|
||||
const double cell_size, int &gpu_mode, FILE *screen,
|
||||
const double polar_dscale, const double polar_uscale) {
|
||||
AMOEBAMF.clear();
|
||||
gpu_mode=AMOEBAMF.device->gpu_mode();
|
||||
double gpu_split=AMOEBAMF.device->particle_split();
|
||||
int first_gpu=AMOEBAMF.device->first_device();
|
||||
int last_gpu=AMOEBAMF.device->last_device();
|
||||
int world_me=AMOEBAMF.device->world_me();
|
||||
int gpu_rank=AMOEBAMF.device->gpu_rank();
|
||||
int procs_per_gpu=AMOEBAMF.device->procs_per_gpu();
|
||||
|
||||
AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu);
|
||||
|
||||
bool message=false;
|
||||
if (AMOEBAMF.device->replica_me()==0 && screen)
|
||||
message=true;
|
||||
|
||||
if (message) {
|
||||
fprintf(screen,"Initializing GPU and compiling on process 0...");
|
||||
fflush(screen);
|
||||
}
|
||||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
|
||||
host_pdamp, host_thole, host_dirdamp,
|
||||
host_amtype2class, host_special_hal,
|
||||
host_special_repel, host_special_disp,
|
||||
host_special_mpole, host_special_polar_wscale,
|
||||
host_special_polar_piscale, host_special_polar_pscale,
|
||||
host_csix, host_adisp, nlocal, nall, max_nbors,
|
||||
maxspecial, maxspecial15, cell_size, gpu_split,
|
||||
screen, polar_dscale, polar_uscale);
|
||||
|
||||
AMOEBAMF.device->world_barrier();
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
|
||||
for (int i=0; i<procs_per_gpu; i++) {
|
||||
if (message) {
|
||||
if (last_gpu-first_gpu==0)
|
||||
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||
else
|
||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||
last_gpu,i);
|
||||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
|
||||
host_pdamp, host_thole, host_dirdamp,
|
||||
host_amtype2class, host_special_hal,
|
||||
host_special_repel, host_special_disp,
|
||||
host_special_mpole, host_special_polar_wscale,
|
||||
host_special_polar_piscale, host_special_polar_pscale,
|
||||
host_csix, host_adisp, nlocal, nall, max_nbors,
|
||||
maxspecial, maxspecial15, cell_size, gpu_split,
|
||||
screen, polar_dscale, polar_uscale);
|
||||
|
||||
AMOEBAMF.device->gpu_barrier();
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
fprintf(screen,"\n");
|
||||
|
||||
if (init_ok==0)
|
||||
AMOEBAMF.estimate_gpu_overhead();
|
||||
return init_ok;
|
||||
}
|
||||
|
||||
void amoeba_gpu_clear() {
|
||||
AMOEBAMF.clear();
|
||||
}
|
||||
|
||||
int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole, double ** /*host_uind*/,
|
||||
double ** /*host_uinp*/, double * /*host_pval*/,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
int *nspecial15, tagint **special15,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, double *host_q, double *boxlo, double *prd) {
|
||||
return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole,
|
||||
nullptr, nullptr, nullptr, sublo, subhi, tag,
|
||||
nspecial, special, nspecial15, special15,
|
||||
eflag_in, vflag_in, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time,
|
||||
success, host_q, boxlo, prd);
|
||||
}
|
||||
|
||||
|
||||
void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, int *nspecial15, tagint** special15,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, const double aewald, const double felec, const double off2,
|
||||
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
|
||||
AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi,
|
||||
tag, nspecial, special, nspecial15, special15,
|
||||
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
||||
cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
const double aewald, const double off2, void **fieldp_ptr) {
|
||||
AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
|
||||
aewald, off2, fieldp_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
const double aewald, const double off2, void **fieldp_ptr) {
|
||||
AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
|
||||
aewald, off2, fieldp_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_update_fieldp(void **fieldp_ptr) {
|
||||
AMOEBAMF.update_fieldp(fieldp_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
const double aewald, const double felec, const double off2,
|
||||
void **tep_ptr) {
|
||||
AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
|
||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, int** igrid,
|
||||
const int nzlo_out, const int nzhi_out,
|
||||
const int nylo_out, const int nyhi_out,
|
||||
const int nxlo_out, const int nxhi_out) {
|
||||
AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid,
|
||||
nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
||||
}
|
||||
|
||||
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
|
||||
void **host_fdip_phi2, void **host_fdip_sum_phi) {
|
||||
AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1,
|
||||
host_fdip_phi2, host_fdip_sum_phi);
|
||||
}
|
||||
|
||||
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) {
|
||||
AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec);
|
||||
}
|
||||
|
||||
void amoeba_setup_fft(const int numel, const int element_type) {
|
||||
AMOEBAMF.setup_fft(numel, element_type);
|
||||
}
|
||||
|
||||
void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) {
|
||||
AMOEBAMF.compute_fft1d(in, out, numel, mode);
|
||||
}
|
||||
|
||||
double amoeba_gpu_bytes() {
|
||||
return AMOEBAMF.host_memory_usage();
|
||||
}
|
||||
@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const {
|
||||
bytes+=sizeof(numtyp);
|
||||
if (_vel)
|
||||
bytes+=4*sizeof(numtyp);
|
||||
if (_extra_fields>0)
|
||||
bytes+=_extra_fields*sizeof(numtyp4);
|
||||
return bytes;
|
||||
}
|
||||
|
||||
@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) {
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=v.device.row_bytes();
|
||||
}
|
||||
if (_extra_fields>0) {
|
||||
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=extra.device.row_bytes();
|
||||
}
|
||||
|
||||
if (_gpu_nbor>0) {
|
||||
if (_bonds) {
|
||||
@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
const int gpu_nbor, const bool bonds, const bool vel) {
|
||||
const int gpu_nbor, const bool bonds, const bool vel,
|
||||
const int extra_fields) {
|
||||
bool success=true;
|
||||
// Ignore host/device transfers?
|
||||
int gpu_bytes=0;
|
||||
@ -191,7 +199,17 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
}
|
||||
}
|
||||
|
||||
if (bonds && !_bonds) {
|
||||
if (extra_fields > 0 && _extra_fields==0) {
|
||||
_extra_fields=extra_fields;
|
||||
_other=true;
|
||||
if (_host_view==false) {
|
||||
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=extra.device.row_bytes();
|
||||
}
|
||||
}
|
||||
|
||||
if (bonds && _bonds==false) {
|
||||
_bonds=true;
|
||||
if (_bonds && _gpu_nbor>0) {
|
||||
success=success && (dev_tag.alloc(_max_atoms,*dev,
|
||||
@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool AtomT::init(const int nall, const bool charge, const bool rot,
|
||||
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
|
||||
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel,
|
||||
const int extra_fields) {
|
||||
clear();
|
||||
|
||||
bool success=true;
|
||||
@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
|
||||
_q_avail=false;
|
||||
_quat_avail=false;
|
||||
_v_avail=false;
|
||||
_extra_avail=false;
|
||||
_resized=false;
|
||||
_gpu_nbor=gpu_nbor;
|
||||
_bonds=bonds;
|
||||
_charge=charge;
|
||||
_rot=rot;
|
||||
_vel=vel;
|
||||
_other=_charge || _rot || _vel;
|
||||
_extra_fields=extra_fields;
|
||||
_other=_charge || _rot || _vel || (extra_fields>0);
|
||||
dev=&devi;
|
||||
_time_transfer=0;
|
||||
|
||||
@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
|
||||
time_q.init(*dev);
|
||||
time_quat.init(*dev);
|
||||
time_vel.init(*dev);
|
||||
time_extra.init(*dev);
|
||||
|
||||
time_pos.zero();
|
||||
time_q.zero();
|
||||
time_quat.zero();
|
||||
time_vel.zero();
|
||||
time_extra.zero();
|
||||
|
||||
_time_cast=0.0;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
@ -308,6 +333,8 @@ void AtomT::clear_resize() {
|
||||
quat.clear();
|
||||
if (_vel)
|
||||
v.clear();
|
||||
if (_extra_fields>0)
|
||||
extra.clear();
|
||||
|
||||
dev_cell_id.clear();
|
||||
dev_particle_id.clear();
|
||||
@ -350,6 +377,7 @@ void AtomT::clear() {
|
||||
time_q.clear();
|
||||
time_quat.clear();
|
||||
time_vel.clear();
|
||||
time_extra.clear();
|
||||
clear_resize();
|
||||
|
||||
#ifdef GPU_CAST
|
||||
@ -370,12 +398,19 @@ double AtomT::host_memory_usage() const {
|
||||
atom_bytes+=4;
|
||||
if (_vel)
|
||||
atom_bytes+=4;
|
||||
if (_extra_fields>0)
|
||||
atom_bytes+=_extra_fields;
|
||||
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
#ifdef USE_CUDPP
|
||||
#define USE_CUDPP_ARG(arg) arg
|
||||
#else
|
||||
#define USE_CUDPP_ARG(arg)
|
||||
#endif
|
||||
// Sort arrays for neighbor list calculation
|
||||
template <class numtyp, class acctyp>
|
||||
void AtomT::sort_neighbor(const int num_atoms) {
|
||||
void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) {
|
||||
#ifdef USE_CUDPP
|
||||
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
|
||||
(int *)dev_particle_id.begin(),
|
||||
|
||||
@ -76,7 +76,7 @@ class Atom {
|
||||
* gpu_nbor 2 if binning on host and neighboring on device **/
|
||||
bool init(const int nall, const bool charge, const bool rot,
|
||||
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
|
||||
const bool vel=false);
|
||||
const bool vel=false, const int extra_fields=0);
|
||||
|
||||
/// Check if we have enough device storage and realloc if not
|
||||
/** Returns true if resized with any call during this timestep **/
|
||||
@ -96,7 +96,7 @@ class Atom {
|
||||
* gpu_nbor 1 if neighboring will be performed on device
|
||||
* gpu_nbor 2 if binning on host and neighboring on device **/
|
||||
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
|
||||
const bool bonds, const bool vel=false);
|
||||
const bool bonds, const bool vel=false, const int extra_fields=0);
|
||||
|
||||
/// Returns true if GPU is using charges
|
||||
bool charge() { return _charge; }
|
||||
@ -107,6 +107,9 @@ class Atom {
|
||||
/// Returns true if GPU is using velocities
|
||||
bool velocity() { return _vel; }
|
||||
|
||||
/// Returns true if GPU is using extra fields
|
||||
bool using_extra() { return (_extra_fields>0); }
|
||||
|
||||
/// Only free matrices of length inum or nall for resizing
|
||||
void clear_resize();
|
||||
|
||||
@ -128,6 +131,8 @@ class Atom {
|
||||
time_quat.add_to_total();
|
||||
if (_vel)
|
||||
time_vel.add_to_total();
|
||||
if (_extra_fields>0)
|
||||
time_extra.add_to_total();
|
||||
}
|
||||
|
||||
/// Add copy times to timers
|
||||
@ -139,6 +144,8 @@ class Atom {
|
||||
time_quat.zero();
|
||||
if (_vel)
|
||||
time_vel.zero();
|
||||
if (_extra_fields>0)
|
||||
time_extra.zero();
|
||||
}
|
||||
|
||||
/// Return the total time for host/device data transfer
|
||||
@ -158,6 +165,10 @@ class Atom {
|
||||
total+=time_vel.total_seconds();
|
||||
time_vel.zero_total();
|
||||
}
|
||||
if (_extra_fields>0) {
|
||||
total+=time_extra.total_seconds();
|
||||
time_extra.zero_total();
|
||||
}
|
||||
|
||||
return total+_time_transfer/1000.0;
|
||||
}
|
||||
@ -281,7 +292,11 @@ class Atom {
|
||||
|
||||
/// Signal that we need to transfer atom data for next timestep
|
||||
inline void data_unavail()
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; }
|
||||
|
||||
/// Signal that we need to transfer atom extra data for next kernel call
|
||||
inline void extra_data_unavail()
|
||||
{ _extra_avail=false; }
|
||||
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { numtyp x,y,z,w; } vec4d_t;
|
||||
@ -312,7 +327,7 @@ class Atom {
|
||||
|
||||
/// Copy positions and types to device asynchronously
|
||||
/** Copies nall() elements **/
|
||||
inline void add_x_data(double **host_ptr, int *host_type) {
|
||||
inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) {
|
||||
time_pos.start();
|
||||
if (_x_avail==false) {
|
||||
#ifdef GPU_CAST
|
||||
@ -426,7 +441,7 @@ class Atom {
|
||||
|
||||
/// Copy velocities and tags to device asynchronously
|
||||
/** Copies nall() elements **/
|
||||
inline void add_v_data(double **host_ptr, tagint *host_tag) {
|
||||
inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
|
||||
time_vel.start();
|
||||
if (_v_avail==false) {
|
||||
#ifdef GPU_CAST
|
||||
@ -450,6 +465,33 @@ class Atom {
|
||||
add_v_data(host_ptr,host_tag);
|
||||
}
|
||||
|
||||
// Cast extras to write buffer
|
||||
template<class cpytyp>
|
||||
inline void cast_extra_data(cpytyp *host_ptr) {
|
||||
if (_extra_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp parallel for simd schedule(static)
|
||||
#elif (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd
|
||||
#endif
|
||||
for (int i=0; i<_nall*_extra_fields; i++)
|
||||
extra[i]=host_ptr[i];
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy extras to device
|
||||
/** Copies nall()*_extra elements **/
|
||||
inline void add_extra_data() {
|
||||
time_extra.start();
|
||||
if (_extra_avail==false) {
|
||||
extra.update_device(_nall*_extra_fields,true);
|
||||
_extra_avail=true;
|
||||
}
|
||||
time_extra.stop();
|
||||
}
|
||||
|
||||
/// Add in casting time from additional data (seconds)
|
||||
inline void add_cast_time(double t) { _time_cast+=t; }
|
||||
|
||||
@ -473,6 +515,8 @@ class Atom {
|
||||
UCL_Vector<numtyp,numtyp> quat;
|
||||
/// Velocities
|
||||
UCL_Vector<numtyp,numtyp> v;
|
||||
/// Extras
|
||||
UCL_Vector<numtyp4,numtyp4> extra;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
UCL_Vector<numtyp,numtyp> x_cast;
|
||||
@ -493,7 +537,7 @@ class Atom {
|
||||
UCL_H_Vec<int> host_particle_id;
|
||||
|
||||
/// Device timers
|
||||
UCL_Timer time_pos, time_q, time_quat, time_vel;
|
||||
UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra;
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *dev;
|
||||
@ -508,11 +552,12 @@ class Atom {
|
||||
bool _compiled;
|
||||
|
||||
// True if data has been copied to device already
|
||||
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
|
||||
bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized;
|
||||
|
||||
bool alloc(const int nall);
|
||||
|
||||
bool _allocated, _rot, _charge, _bonds, _vel, _other;
|
||||
int _extra_fields;
|
||||
int _max_atoms, _nall, _gpu_nbor;
|
||||
bool _host_view;
|
||||
double _time_cast, _time_transfer;
|
||||
|
||||
962
lib/gpu/lal_base_amoeba.cpp
Normal file
962
lib/gpu/lal_base_amoeba.cpp
Normal file
@ -0,0 +1,962 @@
|
||||
/***************************************************************************
|
||||
base_amoeba.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Base class for pair styles needing per-particle data for position,
|
||||
charge, and type.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#include "lal_base_amoeba.h"
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) {
|
||||
device=&global_device;
|
||||
ans=new Answer<numtyp,acctyp>();
|
||||
nbor=new Neighbor();
|
||||
pair_program=nullptr;
|
||||
ucl_device=nullptr;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseAmoebaT::~BaseAmoeba() {
|
||||
delete ans;
|
||||
delete nbor;
|
||||
k_multipole.clear();
|
||||
k_udirect2b.clear();
|
||||
k_umutual2b.clear();
|
||||
k_fphi_uind.clear();
|
||||
k_fphi_mpole.clear();
|
||||
k_polar.clear();
|
||||
k_special15.clear();
|
||||
k_short_nbor.clear();
|
||||
|
||||
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
if (fft_plan_created) cufftDestroy(plan);
|
||||
#endif
|
||||
|
||||
if (pair_program) delete pair_program;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const {
|
||||
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
|
||||
nbor->bytes_per_atom(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const int maxspecial15,
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_name_multipole,
|
||||
const char *k_name_udirect2b,
|
||||
const char *k_name_umutual2b,
|
||||
const char *k_name_polar,
|
||||
const char *k_name_fphi_uind,
|
||||
const char *k_name_fphi_mpole,
|
||||
const char *k_name_short_nbor,
|
||||
const char* k_name_special15) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
|
||||
gpu_nbor=1;
|
||||
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
|
||||
int _gpu_host=0;
|
||||
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
||||
if (host_nlocal>0)
|
||||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
|
||||
bool charge = true;
|
||||
bool rot = false;
|
||||
bool vel = false;
|
||||
_extra_fields = 24; // round up to accomodate quadruples of numtyp values
|
||||
// rpole 13; uind 3; uinp 3; amtype, amgroup; pval
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
if (ucl_device!=device->gpu) _compiled=false;
|
||||
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
_block_bio_size=device->block_bio_pair();
|
||||
compile_kernels(*ucl_device,pair_program,k_name_multipole,
|
||||
k_name_udirect2b, k_name_umutual2b,k_name_polar,
|
||||
k_name_fphi_uind, k_name_fphi_mpole,
|
||||
k_name_short_nbor, k_name_special15);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else {
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
}
|
||||
|
||||
bool alloc_packed=false;
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
|
||||
_gpu_host,max_nbors,cell_size,alloc_packed,
|
||||
_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
|
||||
// Initialize timers for the selected GPU
|
||||
time_pair.init(*ucl_device);
|
||||
time_pair.zero();
|
||||
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
q_tex.bind_float(atom->q,1);
|
||||
|
||||
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
_maxspecial=maxspecial;
|
||||
_maxspecial15=maxspecial15;
|
||||
|
||||
// allocate per-atom array tep
|
||||
|
||||
int ef_nall=nlocal; //nall;
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
|
||||
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
|
||||
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
|
||||
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
|
||||
_max_fieldp_size = _max_tep_size;
|
||||
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
|
||||
_max_thetai_size = 0;
|
||||
|
||||
_nmax = nall;
|
||||
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
fft_plan_created = false;
|
||||
#endif
|
||||
|
||||
#ifdef ASYNC_DEVICE_COPY
|
||||
_end_command_queue=ucl_device->num_queues();
|
||||
ucl_device->push_command_queue();
|
||||
#endif
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) {
|
||||
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::clear_atomic() {
|
||||
// Output any timing information
|
||||
acc_timers();
|
||||
double avg_split=hd_balancer.all_avg_split();
|
||||
_gpu_overhead*=hd_balancer.timestep();
|
||||
_driver_overhead*=hd_balancer.timestep();
|
||||
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
|
||||
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
|
||||
|
||||
time_pair.clear();
|
||||
hd_balancer.clear();
|
||||
|
||||
dev_short_nbor.clear();
|
||||
nbor->clear();
|
||||
ans->clear();
|
||||
|
||||
_tep.clear();
|
||||
_fieldp.clear();
|
||||
_thetai1.clear();
|
||||
_thetai2.clear();
|
||||
_thetai3.clear();
|
||||
_igrid.clear();
|
||||
_fdip_phi1.clear();
|
||||
_fdip_phi2.clear();
|
||||
_fdip_sum_phi.clear();
|
||||
_cgrid_brick.clear();
|
||||
|
||||
dev_nspecial15.clear();
|
||||
dev_special15.clear();
|
||||
dev_special15_t.clear();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy neighbor list from host
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
|
||||
int *numj, int **firstneigh, bool &success) {
|
||||
success=true;
|
||||
|
||||
int mn=nbor->max_nbor_loop(inum,numj,ilist);
|
||||
resize_atom(inum,nall,success);
|
||||
resize_local(inum,mn,success);
|
||||
if (!success)
|
||||
return nullptr;
|
||||
|
||||
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
|
||||
return ilist;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Build neighbor list on device
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
                                        const int nall, double **host_x,
                                        int *host_type, double *sublo,
                                        double *subhi, tagint *tag,
                                        int **nspecial, tagint **special,
                                        int *nspecial15, tagint **special15,
                                        bool &success) {
  // Build the neighbor list on the device, then augment it with AMOEBA's
  // 1-5 special neighbors.  Returns the maximum neighbor count (mn);
  // on allocation failure sets success=false and returns 0.
  success=true;
  resize_atom(inum,nall,success);
  resize_local(inum,host_inum,nbor->max_nbors(),success);
  if (!success)
    return 0;
  atom->cast_copy_x(host_x,host_type);

  int mn;
  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
                        tag, nspecial, special, success, mn, ans->error_flag);

  // add one-five neighbors

  if (_maxspecial15>0) {
    // Wrap the host 1-5 data in zero-copy views, push them to the device,
    // transpose to the layout the kernels expect, then tag 1-5 pairs in
    // the freshly built neighbor list.
    UCL_H_Vec<int> view_nspecial15;
    UCL_H_Vec<tagint> view_special15;
    view_nspecial15.view(nspecial15,nall,*ucl_device);
    view_special15.view(special15[0],nall*_maxspecial15,*ucl_device);
    ucl_copy(dev_nspecial15,view_nspecial15,nall,false);
    ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false);
    nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall);

    add_onefive_neighbors();
  }

  // Track the high-water mark of device bytes used by answers + neighbors.
  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;
  return mn;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Prepare for multiple kernel calls in a time step:
|
||||
// - reallocate per-atom arrays, if needed
|
||||
// - transfer extra data from host to device
|
||||
// - build the full neighbor lists for use by different kernels
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall,
                              double **host_x, int *host_type, int *host_amtype,
                              int *host_amgroup, double **host_rpole,
                              double **host_uind, double **host_uinp, double *host_pval,
                              double *sublo, double *subhi, tagint *tag,
                              int **nspecial, tagint **special,
                              int *nspecial15, tagint **special15,
                              const bool eflag_in, const bool vflag_in,
                              const bool eatom, const bool vatom, int &host_start,
                              int **&ilist, int **&jnum, const double cpu_time,
                              bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) {
  // Per-timestep setup: select kernels for the requested energy/virial
  // accumulation, (re)allocate per-atom device arrays, transfer positions,
  // charges and AMOEBA extra data, and rebuild neighbor lists when ago==0.
  // Returns a pointer into the host neighbor list (offset by host_start),
  // or nullptr when there is no work or allocation failed.
  acc_timers();
  // Encode accumulation mode: 2 = per-atom, 1 = global only, 0 = none.
  if (eatom) _eflag=2;
  else if (eflag_in) _eflag=1;
  else _eflag=0;
  if (vatom) _vflag=2;
  else if (vflag_in) _vflag=1;
  else _vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
  // Without block reduction support, any accumulation must be per-atom.
  if (_eflag) _eflag=2;
  if (_vflag) _vflag=2;
#endif

  set_kernel(_eflag,_vflag);

  // ------------------- Resize 1-5 neighbor arrays ------------------------

  if (nall>_nmax) {
    _nmax = nall;
    dev_nspecial15.clear();
    dev_special15.clear();
    dev_special15_t.clear();
    dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
    dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
    dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
  }

  if (inum_full==0) {
    host_start=0;
    // Make sure textures are correct if realloc by a different hybrid style
    resize_atom(0,nall,success);
    zero_timers();
    return nullptr;
  }

  // Split work between host and device according to the load balancer.
  hd_balancer.balance(cpu_time);
  int inum=hd_balancer.get_gpu_count(ago,inum_full);
  ans->inum(inum);
  host_start=inum;

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                                 sublo, subhi, tag, nspecial, special, nspecial15, special15,
                                 success);
    if (!success)
      return nullptr;
    atom->cast_q_data(host_q);
    hd_balancer.start_timer();
  } else {
    // Reuse the existing neighbor list; only refresh positions and charges.
    atom->cast_x_data(host_x,host_type);
    atom->cast_q_data(host_q);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }
  atom->add_q_data();
  // Pack and transfer the AMOEBA per-atom extra data (multipoles, dipoles, ...).
  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  *ilist=nbor->host_ilist.begin();
  *jnum=nbor->host_acc.begin();

  // re-allocate dev_short_nbor if necessary
  if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) {
    // NOTE(review): this local _nmax shadows the member _nmax assigned above;
    // looks intentional (10% growth headroom) but confirm.
    int _nmax=static_cast<int>(static_cast<double>(inum_full)*1.10);
    dev_short_nbor.resize((2+_max_nbors)*_nmax);
  }

  hd_balancer.stop_timer();

  return nbor->host_jlist.begin()-host_start;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute multipole real-space part
|
||||
// precompute() should be already invoked before mem (re)allocation
|
||||
// this is the first part in a time step done on the GPU for AMOEBA for now
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
                                         const int /*nall*/, double ** /*host_x*/,
                                         int * /*host_type*/, int * /*host_amtype*/,
                                         int * /*host_amgroup*/, double ** /*host_rpole*/,
                                         double */*host_pval*/, double * /*sublo*/,
                                         double * /*subhi*/, tagint * /*tag*/,
                                         int ** /*nspecial*/, tagint ** /*special*/,
                                         int * /*nspecial15*/, tagint ** /*special15*/,
                                         const bool /*eflag_in*/, const bool /*vflag_in*/,
                                         const bool /*eatom*/, const bool /*vatom*/,
                                         int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                                         const double /*cpu_time*/, bool & /*success*/,
                                         const double aewald, const double felec,
                                         const double off2_mpole, double * /*host_q*/,
                                         double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {
  // Run the real-space multipole kernel.  Most parameters are unused here
  // because precompute() has already staged all per-atom data on the device.
  // On return, *tep_ptr points at the host copy of the torque (tep) array.

  // ------------------- Resize _tep array ------------------------

  if (inum_full>_max_tep_size) {
    // Grow with 10% headroom; 4 components per atom.
    _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    _tep.resize(_max_tep_size*4);
  }
  *tep_ptr=_tep.host.begin();

  // Kernel parameters: cutoff squared, electrostatic prefactor, Ewald alpha.
  _off2_mpole = off2_mpole;
  _felec = felec;
  _aewald = aewald;
  multipole_real(_eflag,_vflag);

  // leave the answers (forces, energies and virial) on the device,
  // only copy them back in the last kernel (polar_real)
  //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  //device->add_ans_object(ans);

  // copy tep from device to host

  _tep.update_host(_max_tep_size*4,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute the direct real space part
|
||||
// of the permanent field
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                    double **host_uind, double **host_uinp, double *host_pval,
                                    const double aewald, const double off2_polar,
                                    void** fieldp_ptr) {
  // Real-space direct contribution to the permanent field (udirect2b kernel).
  // On return, *fieldp_ptr points at the host copy of the combined
  // field/fieldp array.

  // all the necessary data arrays are already copied from host to device

  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  *fieldp_ptr=_fieldp.host.begin();

  // specify the correct cutoff and alpha values
  _off2_polar = off2_polar;
  _aewald = aewald;
  udirect2b(_eflag,_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)

  _fieldp.update_host(_max_fieldp_size*8,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute the direct real space part
|
||||
// of the induced field
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/,
                                    double **host_uind, double **host_uinp, double * /*host_pval*/,
                                    const double aewald, const double off2_polar,
                                    void** /*fieldp_ptr*/) {
  // Real-space direct contribution to the induced field (umutual2b kernel).
  // Called repeatedly during the induced-dipole iteration, so only the
  // iteration-dependent arrays (uind/uinp) are re-transferred.

  // only copy the necessary data arrays that are updated over the iterations
  // use nullptr for the other arrays that are already copied from host to device
  cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr);
  atom->add_extra_data();

  // set the correct cutoff and alpha
  _off2_polar = off2_polar;
  _aewald = aewald;
  // launch the kernel
  umutual2b(_eflag,_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
  // NOTE: move this step to update_fieldp() to delay device-host transfer
  //   after umutual1 and self are done on the GPU
  // *fieldp_ptr=_fieldp.host.begin();
  // _fieldp.update_host(_max_fieldp_size*8,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Prepare for umutual1() after bspline_fill() is done on host
|
||||
// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed
|
||||
// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
|
||||
// host_igrid is allocated with nmax by 4
|
||||
// - transfer extra data from host to device
|
||||
// NOTE: can be re-used for fphi_mpole() but with a different bsorder value
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
                                    double ***host_thetai1, double ***host_thetai2,
                                    double ***host_thetai3, int** host_igrid,
                                    const int nzlo_out, const int nzhi_out,
                                    const int nylo_out, const int nyhi_out,
                                    const int nxlo_out, const int nxhi_out) {
  // Stage the k-space (PME) inputs on the device: B-spline coefficient
  // arrays thetai1/2/3 (inum_full x bsorder x 4), per-atom grid indices
  // igrid, and the grid extents used to size the _cgrid_brick buffer.

  // update bsorder with that of the kspace solver
  _bsorder = bsorder;

  // allocate or resize per-atom arrays
  // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
  // will be consolidated once all terms are ready

  if (_max_thetai_size == 0) {
    // First call: allocate with 10% headroom over inum_full.
    _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
    _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);

    // Output buffers for fphi_uind (10 components) and fphi_mpole/sum (20).
    _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
    _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
    _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE);
  } else {
    // NOTE(review): the trigger compares against the OLD _max_thetai_size
    // while the new size is derived from inum_full — confirm this grows
    // correctly when bsorder changes between calls.
    if ((int)_thetai1.cols()<_max_thetai_size*bsorder) {
      _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
      _thetai1.resize(_max_thetai_size*bsorder);
      _thetai2.resize(_max_thetai_size*bsorder);
      _thetai3.resize(_max_thetai_size*bsorder);
      _igrid.resize(_max_thetai_size*4);

      _fdip_phi1.resize(_max_thetai_size*10);
      _fdip_phi2.resize(_max_thetai_size*10);
      _fdip_sum_phi.resize(_max_thetai_size*20);
    }
  }

#ifdef ASYNC_DEVICE_COPY
  // Stage transfers on the end-of-step queue so they overlap with compute.
  _thetai1.cq(ucl_device->cq(_end_command_queue));
  _thetai2.cq(ucl_device->cq(_end_command_queue));
  _thetai3.cq(ucl_device->cq(_end_command_queue));
#endif

  // pack host data to device

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai1[i][j][0];
      v.y = host_thetai1[i][j][1];
      v.z = host_thetai1[i][j][2];
      v.w = host_thetai1[i][j][3];
      _thetai1[idx] = v;
    }
  _thetai1.update_device(true);

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai2[i][j][0];
      v.y = host_thetai2[i][j][1];
      v.z = host_thetai2[i][j][2];
      v.w = host_thetai2[i][j][3];
      _thetai2[idx] = v;
    }
  _thetai2.update_device(true);

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai3[i][j][0];
      v.y = host_thetai3[i][j][1];
      v.z = host_thetai3[i][j][2];
      v.w = host_thetai3[i][j][3];
      _thetai3[idx] = v;
    }
  _thetai3.update_device(true);

  // Only 3 of the 4 igrid slots are filled; the 4th is padding
  // (presumably for alignment — confirm against the kernel).
  for (int i = 0; i < inum_full; i++) {
    int idx = i*4;
    _igrid[idx+0] = host_igrid[i][0];
    _igrid[idx+1] = host_igrid[i][1];
    _igrid[idx+2] = host_igrid[i][2];
  }
  _igrid.update_device(true);

  // _cgrid_brick holds the grid-based potential

  _nzlo_out = nzlo_out;
  _nzhi_out = nzhi_out;
  _nylo_out = nylo_out;
  _nyhi_out = nyhi_out;
  _nxlo_out = nxlo_out;
  _nxhi_out = nxhi_out;
  _ngridz = nzhi_out - nzlo_out + 1;
  _ngridy = nyhi_out - nylo_out + 1;
  _ngridx = nxhi_out - nxlo_out + 1;
  _num_grid_points = _ngridx * _ngridy * _ngridz;

  int numel = _num_grid_points;
  if (_cgrid_brick.cols() == 0) {
    int nsize=(int)(((double)numel)*1.1);
    _cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
  } else if (numel > (int)_cgrid_brick.cols()) {
    _cgrid_brick.resize(numel);
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// fphi_uind = induced potential from grid
|
||||
// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
||||
// NOTE: host_grid_brick is from ic_kspace post_convolution()
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
                                    void **host_fdip_phi1,
                                    void **host_fdip_phi2,
                                    void **host_fdip_sum_phi)
{
  // Extract the induced-dipole potential from the PME grid: flatten the
  // host grid brick (complex values: [0]=real, [1]=imag) into _cgrid_brick,
  // run the fphi_uind kernel, and return host pointers to the results.
  int n = 0;
  for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
    for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
      for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
        numtyp2 v;
        v.x = host_grid_brick[iz][iy][ix][0];
        v.y = host_grid_brick[iz][iy][ix][1];
        _cgrid_brick[n] = v;
        n++;
      }
  _cgrid_brick.update_device(_num_grid_points, false);

#ifdef ASYNC_DEVICE_COPY
  // Ensure the asynchronously staged theta arrays have arrived.
  ucl_device->sync();
#endif

  // launch the kernel with its execution configuration (see below)
  fphi_uind();

  // copy data from device to host
  _fdip_phi1.update_host(_max_thetai_size*10, false);
  _fdip_phi2.update_host(_max_thetai_size*10, false);
  _fdip_sum_phi.update_host(_max_thetai_size*20, false);

  // return the pointers to the host-side arrays
  *host_fdip_phi1 = _fdip_phi1.host.begin();
  *host_fdip_phi2 = _fdip_phi2.host.begin();
  *host_fdip_sum_phi = _fdip_sum_phi.host.begin();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Interpolate the potential from the PME grid
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::fphi_uind() {
|
||||
int ainum=ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
|
||||
const int BX=block_size();
|
||||
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||
|
||||
time_pair.start();
|
||||
int ngridxy = _ngridx * _ngridy;
|
||||
k_fphi_uind.set_size(GX,BX);
|
||||
k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
|
||||
&_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum,
|
||||
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
|
||||
time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// fphi_mpole = multipole potential from grid (limited to polar_kspace for now)
|
||||
// fphi_mpole extracts the permanent multipole potential from
|
||||
// the particle mesh Ewald grid
|
||||
// NOTE: host_grid_brick is from p_kspace post_convolution()
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec)
{
  // Extract the permanent-multipole potential from the PME grid: flatten
  // the real-valued host grid brick into _cgrid_brick (imaginary part
  // zeroed), run the fphi_mpole kernel, and return a host pointer to fphi.
  int n = 0;
  for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
    for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
      for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
        numtyp2 v;
        v.x = host_grid_brick[iz][iy][ix];
        v.y = (numtyp)0;
        _cgrid_brick[n] = v;
        n++;
      }
  _cgrid_brick.update_device(_num_grid_points, false);

  // Electrostatic prefactor consumed by the kernel.
  _felec = felec;
  fphi_mpole();

  // Copy the 20-component potential per atom back to the host.
  _fdip_sum_phi.update_host(_max_thetai_size*20, false);

  *host_fphi = _fdip_sum_phi.host.begin();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Interpolate the potential from the PME grid
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::fphi_mpole() {
|
||||
int ainum=ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
|
||||
const int BX=block_size();
|
||||
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||
|
||||
time_pair.start();
|
||||
int ngridxy = _ngridx * _ngridy;
|
||||
k_fphi_mpole.set_size(GX,BX);
|
||||
k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
|
||||
&_fdip_sum_phi, &_bsorder, &ainum, &_felec,
|
||||
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
|
||||
time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute polar real-space
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
                                     double **host_rpole, double **host_uind,
                                     double **host_uinp, double *host_pval,
                                     const bool eflag_in, const bool vflag_in,
                                     const bool eatom, const bool vatom,
                                     const double aewald, const double felec,
                                     const double off2_polar, void **tep_ptr) {
  // Real-space polarization kernel; the final GPU kernel of the timestep,
  // so this is where accumulated forces/energies/virials are copied back.

  // cast necessary data arrays from host to device

  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  *tep_ptr=_tep.host.begin();

  // Kernel parameters: cutoff squared, electrostatic prefactor, Ewald alpha.
  _off2_polar = off2_polar;
  _felec = felec;
  _aewald = aewald;
  const int red_blocks=polar_real(_eflag,_vflag);

  // only copy answers (forces, energies and virial) back from the device
  // in the last kernel (which is polar_real here)
  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  device->add_ans_object(ans);

  // copy tep from device to host
  _tep.update_host(_max_tep_size*4,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Return the memory bytes allocated on the host and device
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double BaseAmoebaT::host_memory_usage_atomic() const {
|
||||
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
||||
4*sizeof(numtyp)+sizeof(BaseAmoeba<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Setup the FFT plan: only placeholder for now
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/)
{
  // Placeholder: FFT plan creation is not implemented yet.
  // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT)
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute FFT on the device: only placeholder for now
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/,
                                const int /*numel*/, const int /*mode*/)
{
  // Placeholder: device-side 1-D FFT is not wired up yet; the body below is
  // disabled (#if 0) cuFFT reference code kept for future work.
  // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT)
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
  // Create the Z2Z (double-complex) plan once; numel counts doubles,
  // so numel/2 complex elements.
  if (fft_plan_created == false) {
    int m = numel/2;
    cufftPlan1d(&plan, m, CUFFT_Z2Z, 1);
    fft_plan_created = true;
  }

  // n = number of double complex
  int n = numel/2;

  // copy the host array to the device (data)
  UCL_Vector<cufftDoubleComplex,cufftDoubleComplex> data;
  data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE);
  int m = 0;
  double* d_in = (double*)in;
  for (int i = 0; i < n; i++) {
    data[i].x = d_in[m];
    data[i].y = d_in[m+1];
    m += 2;
  }
  data.update_device(false);

  // perform the in-place forward FFT

  cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device,
                                    (cufftDoubleComplex*)&data.device, CUFFT_FORWARD);
  if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result);
  ucl_device->sync();
  data.update_host(false);

  // copy back the data to the host array

  m = 0;
  double* d_out = (double*)out;
  for (int i = 0; i < n; i++) {
    d_out[m] = data[i].x;
    d_out[m+1] = data[i].y;
    m += 2;
  }

  data.clear();
#endif
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy the extra data from host to device
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
                                  double** uind, double** uinp, double* pval) {
  // Pack the AMOEBA per-atom extra data into the atom->extra staging buffer
  // as consecutive numtyp4 sections of _nall entries each:
  //   [0] rpole 0-3, [1] rpole 4,5,6,8, [2] rpole 9,12 + amtype + amgroup,
  //   [3] uind, [4] uinp, [5] pval.
  // A nullptr argument means that section is already on the device and is
  // skipped (its offset is still advanced so later sections line up).

  // signal that we need to transfer extra data from the host

  atom->extra_data_unavail();

  int _nall=atom->nall();
  numtyp4 *pextra=reinterpret_cast<numtyp4*>(&(atom->extra[0]));

  int n = 0;
  int nstride = 1; //4;
  if (rpole) {
    // Section 0: charge + dipole components rpole[0..3].
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][0];
      pextra[idx].y = rpole[i][1];
      pextra[idx].z = rpole[i][2];
      pextra[idx].w = rpole[i][3];
    }

    // Section 1: rpole[4],[5],[6],[8] — indices 7,10,11 are skipped,
    // presumably because the quadrupole tensor is symmetric (confirm).
    n += nstride*_nall;
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][4];
      pextra[idx].y = rpole[i][5];
      pextra[idx].z = rpole[i][6];
      pextra[idx].w = rpole[i][8];
    }

    // Section 2: rpole[9],[12] plus amtype/amgroup encoded as numtyp.
    n += nstride*_nall;
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][9];
      pextra[idx].y = rpole[i][12];
      pextra[idx].z = (numtyp)amtype[i];
      pextra[idx].w = (numtyp)amgroup[i];
    }
  } else {
    // rpole unchanged on device: skip past sections 1 and 2.
    n += 2*nstride*_nall;
  }

  // Section 3: induced dipoles.
  n += nstride*_nall;
  if (uind) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = uind[i][0];
      pextra[idx].y = uind[i][1];
      pextra[idx].z = uind[i][2];
      pextra[idx].w = 0;
    }
  }

  // Section 4: induced dipoles (p variant).
  n += nstride*_nall;
  if (uinp) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = uinp[i][0];
      pextra[idx].y = uinp[i][1];
      pextra[idx].z = uinp[i][2];
      pextra[idx].w = 0;
    }
  }

  // Section 5: per-atom pval scalar.
  n += nstride*_nall;
  if (pval) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = pval[i];
      pextra[idx].y = 0;
      pextra[idx].z = 0;
      pextra[idx].w = 0;
    }
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compile (load) the kernel strings and set the kernels
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                  const char *kname_multipole,
                                  const char *kname_udirect2b,
                                  const char *kname_umutual2b,
                                  const char *kname_polar,
                                  const char *kname_fphi_uind,
                                  const char *kname_fphi_mpole,
                                  const char *kname_short_nbor,
                                  const char* kname_special15) {
  // Compile the device program once and bind all AMOEBA kernels and
  // textures by name.  Safe to call repeatedly; subsequent calls no-op.
  if (_compiled)
    return;

  if (pair_program) delete pair_program;
  pair_program=new UCL_Program(dev);
  // EVFLAG=1 enables energy/virial accumulation code paths in the kernels.
  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
  pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen);

  k_multipole.set_function(*pair_program, kname_multipole);
  k_udirect2b.set_function(*pair_program, kname_udirect2b);
  k_umutual2b.set_function(*pair_program, kname_umutual2b);
  k_polar.set_function(*pair_program, kname_polar);
  k_fphi_uind.set_function(*pair_program, kname_fphi_uind);
  k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole);
  k_short_nbor.set_function(*pair_program, kname_short_nbor);
  k_special15.set_function(*pair_program, kname_special15);
  pos_tex.get_texture(*pair_program, "pos_tex");
  q_tex.get_texture(*pair_program, "q_tex");

  _compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
  // Clamp threads-per-atom to the polar kernel's subgroup size on OpenCL.
  if (dev.has_subgroup_support()) {
    int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size);
    if (_threads_per_atom > mx_subgroup_sz)
      _threads_per_atom = mx_subgroup_sz;
    device->set_simd_size(mx_subgroup_sz);
  }
#endif

}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Specify 1-5 neighbors from the current neighbor list
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::add_onefive_neighbors() {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/
|
||||
(BX/_threads_per_atom)));
|
||||
|
||||
int _nall=atom->nall();
|
||||
int ainum=ans->inum();
|
||||
int nbor_pitch=nbor->nbor_pitch();
|
||||
|
||||
k_special15.set_size(GX,BX);
|
||||
k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(),
|
||||
&atom->dev_tag, &dev_nspecial15, &dev_special15,
|
||||
&ainum, &_nall, &nbor_pitch,
|
||||
&_threads_per_atom);
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// Explicit instantiation for the precision combination selected at build time.
template class BaseAmoeba<PRECISION,ACC_PRECISION>;
}
|
||||
325
lib/gpu/lal_base_amoeba.h
Normal file
325
lib/gpu/lal_base_amoeba.h
Normal file
@ -0,0 +1,325 @@
|
||||
/***************************************************************************
|
||||
base_amoeba.h
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Base class for pair styles needing per-particle data for position,
|
||||
charge, and type.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_BASE_AMOEBA_H
|
||||
#define LAL_BASE_AMOEBA_H
|
||||
|
||||
#include "lal_device.h"
|
||||
#include "lal_balance.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
|
||||
//#define ASYNC_DEVICE_COPY
|
||||
|
||||
#if !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
// temporary workaround for int2 also defined in cufft
|
||||
#ifdef int2
|
||||
#undef int2
|
||||
#endif
|
||||
#include "cufft.h"
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class BaseAmoeba {
|
||||
public:
|
||||
BaseAmoeba();
|
||||
virtual ~BaseAmoeba();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||
const double gpu_split, FILE *screen, const void *pair_program,
|
||||
const char *kname_multipole, const char *kname_udirect2b,
|
||||
const char *kname_umutual2b, const char *kname_polar,
|
||||
const char *kname_fphi_uind, const char *kname_fphi_mpole,
|
||||
const char *kname_short_nbor, const char* kname_special15);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead(const int add_kernels=0);
|
||||
|
||||
  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param inum number of local particles producing answers
    * \param nall total number of particles (local + ghost)
    * \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(nall, success)) {
      // Storage moved: rebind the position and charge textures.
      pos_tex.bind_float(atom->x,4);
      q_tex.bind_float(atom->q,1);
    }
    ans->resize(inum,success);
  }
|
||||
|
||||
  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param max_nbors current maximum number of neighbors per particle
    * \param success set to false if insufficient memory **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }
|
||||
|
||||
  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to copied to host
    * \param max_nbors current maximum number of neighbors per particle
    * \param success set to false if insufficient memory
    * \note host_inum is 0 if the host is performing neighboring
    * \note inum+host_inum=total number local particles **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear_atomic();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom_atomic(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage_atomic() const;
|
||||
|
||||
/// Accumulate timers
|
||||
inline void acc_timers() {
|
||||
if (device->time_device()) {
|
||||
nbor->acc_timers(screen);
|
||||
time_pair.add_to_total();
|
||||
atom->acc_timers();
|
||||
ans->acc_timers();
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero timers
|
||||
inline void zero_timers() {
|
||||
time_pair.zero();
|
||||
atom->zero_timers();
|
||||
ans->zero_timers();
|
||||
}
|
||||
|
||||
/// Copy neighbor list from host
|
||||
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
|
||||
int **firstneigh, bool &success);
|
||||
|
||||
/// Build neighbor list on device
|
||||
int build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, int *nspecial15, tagint **special15,
|
||||
bool &success);
|
||||
|
||||
/// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed
|
||||
virtual int** precompute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole, double **host_uind,
|
||||
double **host_uinp, double *host_pval, double *sublo, double *subhi,
|
||||
tagint *tag, int **nspecial, tagint **special,
|
||||
int *nspecial15, tagint **special15,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **&ilist, int **&numj, const double cpu_time, bool &success,
|
||||
double *charge, double *boxlo, double *prd);
|
||||
|
||||
/// Compute multipole real-space with device neighboring
|
||||
virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole, double *host_pval,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, int *nspecial15, tagint **special15,
|
||||
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **numj, const double cpu_time,
|
||||
bool &success, const double aewald, const double felec,
|
||||
const double off2_mpole, double *charge, double *boxlo,
|
||||
double *prd, void **tep_ptr);
|
||||
|
||||
/// Compute the real space part of the permanent field (udirect2b) with device neighboring
|
||||
virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp, double *host_pval,
|
||||
const double aewald, const double off2_polar, void **fieldp_ptr);
|
||||
|
||||
/// Compute the real space part of the induced field (umutual2b) with device neighboring
|
||||
virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp, double *host_pval,
|
||||
const double aewald, const double off2_polar, void **fieldp_ptr);
|
||||
|
||||
/// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
|
||||
virtual void precompute_kspace(const int inum_full, const int bsorder,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, int** igrid,
|
||||
const int nzlo_out, const int nzhi_out,
|
||||
const int nylo_out, const int nyhi_out,
|
||||
const int nxlo_out, const int nxhi_out);
|
||||
/// Interpolate the induced potential from the grid
|
||||
virtual void compute_fphi_uind(double ****host_grid_brick,
|
||||
void **host_fdip_phi1, void **host_fdip_phi2,
|
||||
void **host_fdip_sum_phi);
|
||||
|
||||
/// Interpolate the multipolar potential from the grid
|
||||
virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
|
||||
const double felec);
|
||||
|
||||
/// Compute polar real-space with device neighboring
|
||||
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp, double *host_pval,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
const double aewald, const double felec, const double off2_polar,
|
||||
void **tep_ptr);
|
||||
|
||||
// copy field and fieldp from device to host after umutual2b
|
||||
virtual void update_fieldp(void **fieldp_ptr) {
|
||||
*fieldp_ptr=_fieldp.host.begin();
|
||||
// _fieldp store both arrays, one after another
|
||||
_fieldp.update_host(_max_fieldp_size*8,false);
|
||||
}
|
||||
|
||||
/// setup a plan for FFT, where size is the number of elements
|
||||
|
||||
void setup_fft(const int size, const int element_type=0);
|
||||
|
||||
/// compute forward/backward FFT on the device
|
||||
|
||||
void compute_fft1d(void* in, void* out, const int numel, const int mode);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
Device<numtyp,acctyp> *device;
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *ucl_device;
|
||||
|
||||
/// Device Timers
|
||||
UCL_Timer time_pair;
|
||||
|
||||
/// Host device load balancer
|
||||
Balance<numtyp,acctyp> hd_balancer;
|
||||
|
||||
/// LAMMPS pointer for screen output
|
||||
FILE *screen;
|
||||
|
||||
// --------------------------- ATOM DATA --------------------------
|
||||
|
||||
/// Atom Data
|
||||
Atom<numtyp,acctyp> *atom;
|
||||
|
||||
UCL_Vector<numtyp,numtyp> polar1, polar2, polar3, polar4, polar5;
|
||||
|
||||
/// cast host arrays into a single array for atom->extra
|
||||
void cast_extra_data(int* amtype, int* amgroup, double** rpole,
|
||||
double** uind, double** uinp, double* pval=nullptr);
|
||||
|
||||
/// Per-atom arrays
|
||||
UCL_Vector<acctyp,acctyp> _tep, _fieldp;
|
||||
int _nmax, _max_tep_size, _max_fieldp_size;
|
||||
|
||||
int _bsorder;
|
||||
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
|
||||
UCL_Vector<int,int> _igrid;
|
||||
UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
|
||||
UCL_Vector<acctyp,acctyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
|
||||
int _max_thetai_size;
|
||||
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
|
||||
int _ngridx, _ngridy, _ngridz, _num_grid_points;
|
||||
|
||||
int _end_command_queue;
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
|
||||
Answer<numtyp,acctyp> *ans;
|
||||
|
||||
// --------------------------- NBOR DATA ----------------------------
|
||||
|
||||
/// Neighbor data
|
||||
Neighbor *nbor;
|
||||
/// Device storage for 1-5 special neighbor counts
|
||||
UCL_D_Vec<int> dev_nspecial15;
|
||||
/// Device storage for special neighbors
|
||||
UCL_D_Vec<tagint> dev_special15, dev_special15_t;
|
||||
|
||||
int add_onefive_neighbors();
|
||||
|
||||
UCL_D_Vec<int> dev_short_nbor;
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
|
||||
UCL_Kernel k_fphi_uind, k_fphi_mpole;
|
||||
UCL_Kernel k_special15, k_short_nbor;
|
||||
inline int block_size() { return _block_size; }
|
||||
inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {}
|
||||
|
||||
// --------------------------- TEXTURES -----------------------------
|
||||
UCL_Texture pos_tex;
|
||||
UCL_Texture q_tex;
|
||||
|
||||
protected:
|
||||
bool _compiled;
|
||||
int _block_size, _block_bio_size, _threads_per_atom;
|
||||
int _extra_fields;
|
||||
double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
bool short_nbor_polar_avail;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
numtyp _aewald,_felec;
|
||||
numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar;
|
||||
|
||||
int _eflag, _vflag;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||
const char *kname_multipole, const char *kname_udirect2b,
|
||||
const char *kname_umutual2b, const char *kname_polar,
|
||||
const char *kname_fphi_uind, const char *kname_fphi_mpole,
|
||||
const char *kname_short_nbor, const char* kname_special15);
|
||||
|
||||
virtual int multipole_real(const int eflag, const int vflag) = 0;
|
||||
virtual int udirect2b(const int eflag, const int vflag) = 0;
|
||||
virtual int umutual2b(const int eflag, const int vflag) = 0;
|
||||
virtual int fphi_uind();
|
||||
virtual int fphi_mpole();
|
||||
virtual int polar_real(const int eflag, const int vflag) = 0;
|
||||
|
||||
|
||||
#if !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
cufftHandle plan;
|
||||
#endif
|
||||
bool fft_plan_created;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
bool charge = false;
|
||||
bool rot = false;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
|
||||
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
|
||||
bool charge = true;
|
||||
bool rot = false;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
|
||||
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
|
||||
bool charge = true;
|
||||
bool rot = true;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
|
||||
bool charge = false;
|
||||
bool rot = false;
|
||||
bool vel = true;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -193,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const double cpu_time, bool &success, tagint *tag,
|
||||
double **host_v, const double dtinvsqrt,
|
||||
const int seed, const int timestep,
|
||||
const int nlocal, double *boxlo, double *prd) {
|
||||
const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
@ -258,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_v, const double dtinvsqrt,
|
||||
const int seed, const int timestep,
|
||||
double *boxlo, double *prd) {
|
||||
double * /*boxlo*/, double * /*prd*/) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
|
||||
@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
||||
else
|
||||
_threads_per_atom=device->threads_per_three();
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
bool charge = false;
|
||||
bool rot = false;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -44,18 +44,14 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CHARMMLongT::init(const int ntypes,
|
||||
double host_cut_bothsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *_screen,
|
||||
int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||
double ** /*host_offset*/, double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split, FILE *_screen,
|
||||
double host_cut_ljsq, const double host_cut_coulsq,
|
||||
double *host_special_coul, const double qqrd2e,
|
||||
const double g_ewald, const double cut_lj_innersq,
|
||||
const double denom_lj, double **epsilon,
|
||||
double *host_special_coul, const double qqrd2e, const double g_ewald,
|
||||
const double cut_lj_innersq, const double denom_lj, double **epsilon,
|
||||
double **sigma, const bool mix_arithmetic) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
|
||||
@ -52,7 +52,7 @@ DeviceT::~Device() {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
|
||||
int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu,
|
||||
const int first_gpu_id, const int gpu_mode,
|
||||
const double p_split, const int t_per_atom,
|
||||
const double user_cell_size, char *ocl_args,
|
||||
@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
|
||||
}
|
||||
|
||||
_ocl_compile_string="-cl-mad-enable ";
|
||||
#ifdef CL_VERSION_2_0
|
||||
_ocl_compile_string+="-cl-std=CL2.0 ";
|
||||
#endif
|
||||
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
|
||||
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
|
||||
std::string(OCL_PRECISION_COMPILE);
|
||||
@ -438,7 +441,7 @@ template <class numtyp, class acctyp>
|
||||
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
const bool rot, const int nlocal,
|
||||
const int nall, const int maxspecial,
|
||||
const bool vel) {
|
||||
const bool vel, const int extra_fields) {
|
||||
if (!_device_init)
|
||||
return -1;
|
||||
if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision())
|
||||
@ -467,7 +470,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
|
||||
if (_init_count==0) {
|
||||
// Initialize atom and nbor data
|
||||
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
|
||||
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields))
|
||||
return -3;
|
||||
|
||||
_data_in_estimate++;
|
||||
@ -477,6 +480,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
_data_in_estimate++;
|
||||
if (vel)
|
||||
_data_in_estimate++;
|
||||
if (extra_fields>0)
|
||||
_data_in_estimate++;
|
||||
|
||||
} else {
|
||||
if (!atom.charge() && charge)
|
||||
_data_in_estimate++;
|
||||
@ -484,7 +490,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
_data_in_estimate++;
|
||||
if (!atom.velocity() && vel)
|
||||
_data_in_estimate++;
|
||||
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
|
||||
if (atom.using_extra() && extra_fields>0)
|
||||
_data_in_estimate++;
|
||||
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields))
|
||||
return -3;
|
||||
}
|
||||
|
||||
@ -520,7 +528,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
|
||||
const int host_nlocal, const int nall,
|
||||
const int host_nlocal, const int /*nall*/,
|
||||
const int maxspecial, const int gpu_host,
|
||||
const int max_nbors, const double cutoff,
|
||||
const bool pre_cut, const int threads_per_atom,
|
||||
|
||||
@ -61,6 +61,7 @@ class Device {
|
||||
* \param nall Total number of local+ghost particles
|
||||
* \param maxspecial Maximum mumber of special bonded atoms per atom
|
||||
* \param vel True if velocities need to be stored
|
||||
* \param extra_fields Nonzero if extra fields need to be stored
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
@ -70,7 +71,7 @@ class Device {
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
|
||||
const int nlocal, const int nall, const int maxspecial,
|
||||
const bool vel=false);
|
||||
const bool vel=false, const int extra_fields=0);
|
||||
|
||||
/// Initialize the device for Atom storage only
|
||||
/** \param nlocal Total number of local particles to allocate memory for
|
||||
|
||||
@ -30,7 +30,7 @@ static DPD<PRECISION,ACC_PRECISION> DPDTMF;
|
||||
int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0,
|
||||
double **host_gamma, double **host_sigma, double **host_cut,
|
||||
double *special_lj, const int inum,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
const int nall, const int /*max_nbors*/, const int maxspecial,
|
||||
const double cell_size, int &gpu_mode, FILE *screen) {
|
||||
DPDTMF.clear();
|
||||
gpu_mode=DPDTMF.device->gpu_mode();
|
||||
|
||||
@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
const bool /*eatom*/, const bool /*vatom*/,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, void **fp_ptr) {
|
||||
this->acc_timers();
|
||||
@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag_in,
|
||||
const bool vflag_in, const bool eatom,
|
||||
const bool vatom, int &host_start, int **ilist, int **jnum,
|
||||
const bool vflag_in, const bool /*eatom*/,
|
||||
const bool /*vatom*/, int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success, int &inum,
|
||||
void **fp_ptr) {
|
||||
this->acc_timers();
|
||||
|
||||
641
lib/gpu/lal_hippo.cpp
Normal file
641
lib/gpu/lal_hippo.cpp
Normal file
@ -0,0 +1,641 @@
|
||||
/***************************************************************************
|
||||
hippo.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the hippo pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "hippo_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *hippo=0;
|
||||
#else
|
||||
#include "hippo_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_hippo.h"
|
||||
#include <cassert>
|
||||
namespace LAMMPS_AL {
|
||||
#define HippoT Hippo<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
HippoT::Hippo() : BaseAmoeba<numtyp,acctyp>(),
|
||||
_allocated(false) {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
HippoT::~Hippo() {
|
||||
clear();
|
||||
k_repulsion.clear();
|
||||
k_dispersion.clear();
|
||||
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_dirdamp, const int *host_amtype2class,
|
||||
const double *host_special_repel, const double *host_special_disp,
|
||||
const double *host_special_mpole,
|
||||
const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
|
||||
const double *host_csix, const double *host_adisp,
|
||||
const double *host_pcore, const double *host_palpha,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15,
|
||||
const double cell_size, const double gpu_split, FILE *_screen,
|
||||
const double polar_dscale, const double polar_uscale) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||
cell_size,gpu_split,_screen,hippo,
|
||||
"k_hippo_multipole", "k_hippo_udirect2b",
|
||||
"k_hippo_umutual2b", "k_hippo_polar",
|
||||
"k_hippo_fphi_uind", "k_hippo_fphi_mpole",
|
||||
"k_hippo_short_nbor", "k_hippo_special15");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// specific to HIPPO
|
||||
k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion");
|
||||
k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion");
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
int lj_types=ntypes;
|
||||
shared_types=false;
|
||||
int max_shared_types=this->device->max_shared_types();
|
||||
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||
lj_types=max_shared_types;
|
||||
shared_types=true;
|
||||
}
|
||||
_lj_types=lj_types;
|
||||
|
||||
// Allocate a host write buffer for data initialization
|
||||
|
||||
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
for (int i = 0; i < max_amtype; i++) {
|
||||
host_write[i].x = host_pdamp[i];
|
||||
host_write[i].y = host_thole[i];
|
||||
host_write[i].z = host_dirdamp[i];
|
||||
host_write[i].w = host_amtype2class[i];
|
||||
}
|
||||
|
||||
coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_amtype,host_write,false);
|
||||
|
||||
for (int i = 0; i < max_amtype; i++) {
|
||||
host_write[i].x = host_sizpr[i];
|
||||
host_write[i].y = host_dmppr[i];
|
||||
host_write[i].z = host_elepr[i];
|
||||
host_write[i].w = (numtyp)0;
|
||||
}
|
||||
|
||||
coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_rep,host_write,false);
|
||||
|
||||
UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
for (int i = 0; i < max_amclass; i++) {
|
||||
host_write2[i].x = host_csix[i];
|
||||
host_write2[i].y = host_adisp[i];
|
||||
host_write2[i].z = host_pcore[i];
|
||||
host_write2[i].w = host_palpha[i];
|
||||
}
|
||||
|
||||
coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_amclass,host_write2,false);
|
||||
|
||||
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<5; i++) {
|
||||
dview[i].x=host_special_polar_wscale[i];
|
||||
dview[i].y=host_special_polar_piscale[i];
|
||||
dview[i].z=host_special_polar_pscale[i];
|
||||
dview[i].w=host_special_mpole[i];
|
||||
}
|
||||
ucl_copy(sp_polar,dview,5,false);
|
||||
|
||||
sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<5; i++) {
|
||||
dview[i].x=host_special_repel[i];
|
||||
dview[i].y=host_special_disp[i];
|
||||
dview[i].z=(numtyp)0;
|
||||
dview[i].w=(numtyp)0;
|
||||
}
|
||||
ucl_copy(sp_nonpolar,dview,5,false);
|
||||
|
||||
_polar_dscale = polar_dscale;
|
||||
_polar_uscale = polar_uscale;
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes()
|
||||
+ coeff_amclass.row_bytes() + sp_polar.row_bytes()
|
||||
+ sp_nonpolar.row_bytes() + this->_tep.row_bytes()
|
||||
+ this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
|
||||
+ this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
|
||||
+ this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void HippoT::clear() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
coeff_amtype.clear();
|
||||
coeff_rep.clear();
|
||||
coeff_amclass.clear();
|
||||
sp_polar.clear();
|
||||
sp_nonpolar.clear();
|
||||
|
||||
this->clear_atomic();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double HippoT::host_memory_usage() const {
|
||||
return this->host_memory_usage_atomic()+sizeof(Hippo<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute the repulsion term, returning tep
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void HippoT::compute_repulsion(const int /*ago*/, const int inum_full,
|
||||
const int /*nall*/, double ** /*host_x*/,
|
||||
int * /*host_type*/, int * /*host_amtype*/,
|
||||
int * /*host_amgroup*/, double ** /*host_rpole*/,
|
||||
double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/,
|
||||
int ** /*nspecial*/, tagint ** /*special*/,
|
||||
int * /*nspecial15*/, tagint ** /*special15*/,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
|
||||
const double /*cpu_time*/, bool & /*success*/,
|
||||
const double /*aewald*/, const double off2_repulse,
|
||||
double * /*host_q*/, double * /*boxlo*/, double * /*prd*/,
|
||||
double cut2, double c0, double c1, double c2,
|
||||
double c3, double c4, double c5, void **tep_ptr) {
|
||||
this->acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
this->set_kernel(eflag,vflag);
|
||||
|
||||
// ------------------- Resize _tep array ------------------------
|
||||
|
||||
if (inum_full>this->_max_tep_size) {
|
||||
this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
this->_tep.resize(this->_max_tep_size*4);
|
||||
}
|
||||
*tep_ptr=this->_tep.host.begin();
|
||||
|
||||
this->_off2_repulse = off2_repulse;
|
||||
_cut2 = cut2;
|
||||
_c0 = c0;
|
||||
_c1 = c1;
|
||||
_c2 = c2;
|
||||
_c3 = c3;
|
||||
_c4 = c4;
|
||||
_c5 = c5;
|
||||
repulsion(this->_eflag,this->_vflag);
|
||||
|
||||
// copy tep from device to host
|
||||
this->_tep.update_host(this->_max_tep_size*4,false);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the repulsion kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::repulsion(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff off2_disp,
|
||||
// at this point repuslion is the first kernel in a time step for HIPPO
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_repulse, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
k_repulsion.set_size(GX,BX);
|
||||
k_repulsion.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_rep, &sp_nonpolar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald,
|
||||
&this->_off2_repulse, &_cut2,
|
||||
&_c0, &_c1, &_c2, &_c3, &_c4, &_c5);
|
||||
this->time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute dispersion real-space
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup,
|
||||
double **host_rpole, const double aewald,
|
||||
const double off2_disp) {
|
||||
|
||||
// cast necessary data arrays from host to device
|
||||
|
||||
this->cast_extra_data(host_amtype, host_amgroup, host_rpole,
|
||||
nullptr, nullptr, nullptr);
|
||||
this->atom->add_extra_data();
|
||||
|
||||
this->_off2_disp = off2_disp;
|
||||
this->_aewald = aewald;
|
||||
dispersion_real(this->_eflag,this->_vflag);
|
||||
|
||||
// only copy them back if this is the last kernel
|
||||
// otherwise, commenting out these two lines to leave the answers
|
||||
// (forces, energies and virial) on the device until the last kernel
|
||||
//this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||
//this->device->add_ans_object(this->ans);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the dispersion real-space kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::dispersion_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff off2_disp,
|
||||
// at this point dispersion is the first kernel in a time step
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_disp, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
k_dispersion.set_size(GX,BX);
|
||||
k_dispersion.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_nonpolar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald,
|
||||
&this->_off2_disp);
|
||||
this->time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Compute the multipole real-space term, returning tep
// (host-side driver: transfers pval, sizes the tep buffer, launches the
//  kernel via multipole_real(), then copies tep back to the host)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full,
                                    const int /*nall*/, double ** /*host_x*/,
                                    int * /*host_type*/, int * /*host_amtype*/,
                                    int * /*host_amgroup*/, double ** /*host_rpole*/,
                                    double* host_pval, double * /*sublo*/,
                                    double * /*subhi*/, tagint * /*tag*/,
                                    int ** /*nspecial*/, tagint ** /*special*/,
                                    int * /*nspecial15*/, tagint ** /*special15*/,
                                    const bool /*eflag_in*/, const bool /*vflag_in*/,
                                    const bool /*eatom*/, const bool /*vatom*/,
                                    int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                                    const double /*cpu_time*/, bool & /*success*/,
                                    const double aewald, const double felec,
                                    const double off2_mpole, double * /*host_q*/,
                                    double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {

  // cast necessary data arrays from host to device:
  // only host_pval is refreshed here; the other extra-data slots are nullptr
  this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval);
  this->atom->add_extra_data();

  // ------------------- Resize _tep array ------------------------
  // grow with a 10% cushion (factor 1.10) to limit reallocation frequency;
  // _tep holds 4 entries per atom
  if (inum_full>this->_max_tep_size) {
    this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    this->_tep.resize(this->_max_tep_size*4);
  }
  // expose the host-side tep buffer to the caller
  *tep_ptr=this->_tep.host.begin();

  // stash per-call parameters in members read by the kernel launcher
  this->_off2_mpole = off2_mpole;
  this->_felec = felec;
  this->_aewald = aewald;
  multipole_real(this->_eflag,this->_vflag);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the multipole real-space kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::multipole_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff off2_mpole
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_mpole, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
this->k_multipole.set_size(GX,BX);
|
||||
this->k_multipole.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Compute the direct real space part of the permanent field
// returning field and fieldp
// (host-side driver: refreshes uind/uinp/pval on the device, launches
//  udirect2b(), then copies the packed field/fieldp buffer back)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double **host_uind, double **host_uinp, double* host_pval,
                               const double aewald, const double off2_polar,
                               void** fieldp_ptr) {

  // all the necessary data arrays are already copied from host to device;
  // only uind/uinp/pval are refreshed here (other slots are nullptr)
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval);
  this->atom->add_extra_data();

  // expose the host-side field/fieldp buffer to the caller
  *fieldp_ptr=this->_fieldp.host.begin();

  // stash per-call parameters in members read by the kernel launcher
  this->_off2_polar = off2_polar;
  this->_aewald = aewald;
  udirect2b(this->_eflag,this->_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another;
  // hence 8 = 2 arrays x 4 entries per atom)
  this->_fieldp.update_host(this->_max_fieldp_size*8,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the real-space permanent field kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff _off2_polar, if not done yet
|
||||
// this is the first kernel in a time step where _off2_polar is used
|
||||
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_polar, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_udirect2b.set_size(GX,BX);
|
||||
this->k_udirect2b.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
|
||||
&_polar_dscale, &_polar_uscale);
|
||||
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Compute the direct real space term of the induced field
// returning field and fieldp
// (host-side driver: refreshes uind/uinp on the device and launches
//  umutual2b(); the device-to-host transfer is deliberately deferred)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double **host_uind, double **host_uinp, double * /*host_pval*/,
                               const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) {

  // cast necessary data arrays from host to device:
  // only uind/uinp are refreshed here (other slots are nullptr)
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
  this->atom->add_extra_data();

  // stash per-call parameters in members read by the kernel launcher
  this->_off2_polar = off2_polar;
  this->_aewald = aewald;
  umutual2b(this->_eflag,this->_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
  // NOTE: move this step to update_fieldp() to delay device-host transfer
  // *fieldp_ptr=this->_fieldp.host.begin();
  // this->_fieldp.update_host(this->_max_fieldp_size*8,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the real-space induced field kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list if not done yet
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->dev_short_nbor,
|
||||
&this->_off2_polar, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_umutual2b.set_size(GX,BX);
|
||||
this->k_umutual2b.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_aewald,
|
||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// (host-side driver: refreshes uind/uinp, launches polar_real(), queues the
//  force/energy/virial copy-back, and copies tep back to the host)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                                double **host_uind, double **host_uinp, double * /*host_pval*/,
                                const bool eflag_in, const bool vflag_in,
                                const bool eatom, const bool vatom,
                                const double aewald, const double felec,
                                const double off2_polar, void **tep_ptr) {
  // cast necessary data arrays from host to device:
  // only uind/uinp are refreshed here (other slots are nullptr)
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
  this->atom->add_extra_data();

  // expose the host-side tep buffer to the caller
  *tep_ptr=this->_tep.host.begin();

  // stash per-call parameters in members read by the kernel launcher
  this->_off2_polar = off2_polar;
  this->_felec = felec;
  this->_aewald = aewald;
  const int red_blocks=polar_real(this->_eflag,this->_vflag);

  // only copy answers (forces, energies and virial) back from the device
  // in the last kernel in a timestep (which is polar_real here)
  this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  this->device->add_ans_object(this->ans);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the polar real-space kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::polar_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
|
||||
const int BX=this->block_size();
|
||||
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
|
||||
/*
|
||||
const int cus = this->device->gpu->cus();
|
||||
while (GX < cus && GX > 1) {
|
||||
BX /= 2;
|
||||
GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
|
||||
}
|
||||
*/
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list if not done yet
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_polar, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_polar.set_size(GX,BX);
|
||||
this->k_polar.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
|
||||
// Signal that short nbor list is not avail for the next time step
|
||||
// do it here because polar_real() is the last kernel in a time step at this point
|
||||
|
||||
this->short_nbor_polar_avail = false;
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
template class Hippo<PRECISION,ACC_PRECISION>;
|
||||
}
|
||||
2519
lib/gpu/lal_hippo.cu
Normal file
2519
lib/gpu/lal_hippo.cu
Normal file
File diff suppressed because it is too large
Load Diff
166
lib/gpu/lal_hippo.h
Normal file
166
lib/gpu/lal_hippo.h
Normal file
@ -0,0 +1,166 @@
|
||||
/***************************************************************************
|
||||
hippo.h
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the hippo pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_HIPPO_H
#define LAL_HIPPO_H

#include "lal_base_amoeba.h"

namespace LAMMPS_AL {

// GPU acceleration class for the HIPPO pair style.  Derives the neighbor
// handling, data transfer and answer bookkeeping from BaseAmoeba and adds
// the HIPPO-specific kernels (repulsion, dispersion, multipole, induced
// field and polar real-space terms).
template <class numtyp, class acctyp>
class Hippo : public BaseAmoeba<numtyp, acctyp> {
 public:
  Hippo();
  ~Hippo();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
   * \param cell_size cutoff + skin
   * \param gpu_split fraction of particles handled by device
   *
   * Returns:
   * - 0 if successful
   * - -1 if fix gpu not found
   * - -3 if there is an out of memory error
   * - -4 if the GPU library was not compiled for GPU
   * - -5 Double precision is not supported on card **/
  int init(const int ntypes, const int max_amtype, const int max_amclass,
           const double *host_pdamp, const double *host_thole,
           const double *host_dirdamp, const int *host_amtype2class,
           const double *host_special_mpole,
           const double *host_special_repel,
           const double *host_special_disp,
           const double *host_special_polar_wscale,
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
           const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
           const double *host_csix, const double *host_adisp,
           const double *host_pcore, const double *host_palpha,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const int maxspecial15, const double cell_size,
           const double gpu_split, FILE *_screen,
           const double polar_dscale, const double polar_uscale);

  /// Compute repulsion with device neighboring
  virtual void compute_repulsion(const int ago, const int inum_full,
                                 const int nall, double **host_x,
                                 int *host_type, int *host_amtype,
                                 int *host_amgroup, double **host_rpole,
                                 double *sublo, double *subhi, tagint *tag,
                                 int **nspecial, tagint **special,
                                 int *nspecial15, tagint **special15,
                                 const bool eflag_in, const bool vflag_in,
                                 const bool eatom, const bool vatom,
                                 int &host_start, int **ilist, int **jnum,
                                 const double cpu_time, bool &success,
                                 const double aewald, const double off2_repulse,
                                 double *host_q, double *boxlo, double *prd,
                                 double cut2, double c0, double c1, double c2,
                                 double c3, double c4, double c5,void** tep_ptr);

  /// Compute dispersion real-space with device neighboring
  virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                       double **host_rpole, const double aewald,
                                       const double off2_disp);

  /// Compute multipole real-space with device neighboring
  virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
                                      double **host_x, int *host_type, int *host_amtype,
                                      int *host_amgroup, double **host_rpole, double *host_pval,
                                      double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special,
                                      int *nspecial15, tagint **special15,
                                      const bool eflag, const bool vflag,
                                      const bool eatom, const bool vatom, int &host_start,
                                      int **ilist, int **numj, const double cpu_time, bool &success,
                                      const double aewald, const double felec, const double off2_mpole, double *charge,
                                      double *boxlo, double *prd, void **tep_ptr);

  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
  virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double* host_pval,
                                 const double aewald, const double off2_polar, void** fieldp_ptr);

  /// Compute the real space part of the induced field (umutual2b) with device neighboring
  virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2_polar,
                                 void** fieldp_ptr);

  /// Compute polar real-space with device neighboring
  virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag_in, const bool vflag_in,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2_polar,
                                  void **tep_ptr);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// Per-amtype coefficients packed into a vec4:
  /// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
  /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
  UCL_D_Vec<numtyp4> coeff_amtype;
  /// Per-amclass coefficients:
  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
  UCL_D_Vec<numtyp4> coeff_amclass;
  /// Pauli repulsion coefficients:
  /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z;
  UCL_D_Vec<numtyp4> coeff_rep;
  /// Special polar values [0-4]:
  /// sp_polar.x = special_polar_wscale
  /// sp_polar.y = special_polar_pscale
  /// sp_polar.z = special_polar_piscale
  /// sp_polar.w = special_mpole
  UCL_D_Vec<numtyp4> sp_polar;
  /// Special nonpolar values [0-4]:
  /// sp_nonpolar.x = special_hal
  /// sp_nonpolar.y = special_repel
  /// sp_nonpolar.z = special_disp
  UCL_D_Vec<numtyp4> sp_nonpolar;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

  // repulsion cutoff and polynomial coefficients (set in init)
  numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5;
  // polar damping scale factors (set in init)
  numtyp _polar_dscale, _polar_uscale;
  // conversion factor for electrostatic energy/force units
  numtyp _qqrd2e;

  // HIPPO-specific kernels (the polar-family kernels live in the base class)
  UCL_Kernel k_repulsion, k_dispersion;

 protected:
  // true once device buffers have been allocated by init()
  bool _allocated;
  // kernel launchers; each returns the number of blocks launched
  int repulsion(const int eflag, const int vflag);
  int dispersion_real(const int eflag, const int vflag);
  int multipole_real(const int eflag, const int vflag);
  int udirect2b(const int eflag, const int vflag);
  int umutual2b(const int eflag, const int vflag);
  int polar_real(const int eflag, const int vflag);

};

}

#endif
|
||||
231
lib/gpu/lal_hippo_ext.cpp
Normal file
231
lib/gpu/lal_hippo_ext.cpp
Normal file
@ -0,0 +1,231 @@
|
||||
/***************************************************************************
|
||||
hippo_ext.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Functions for LAMMPS access to hippo acceleration routines.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
#include "lal_hippo.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace LAMMPS_AL;
|
||||
|
||||
static Hippo<PRECISION,ACC_PRECISION> HIPPOMF;
|
||||
|
||||
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
//
// Initializes the file-scope HIPPOMF instance.  Process 0 of the world
// communicator initializes (and compiles kernels) first; the remaining
// ranks then initialize one-per-GPU-rank inside the barrier-ordered loop
// below.  Returns the status code from Hippo::init() (0 on success).
// NOTE(review): the ordering is enforced by world_barrier()/gpu_barrier();
// presumably so later ranks can reuse the kernels compiled by process 0.
// ---------------------------------------------------------------------------
int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                   const double *host_pdamp, const double *host_thole,
                   const double *host_dirdamp, const int *host_amtype2class,
                   const double *host_special_repel,
                   const double *host_special_disp,
                   const double *host_special_mpole,
                   const double *host_special_polar_wscale,
                   const double *host_special_polar_piscale,
                   const double *host_special_polar_pscale,
                   const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
                   const double *host_csix, const double *host_adisp,
                   const double *host_pcore, const double *host_palpha,
                   const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const int maxspecial15,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   const double polar_dscale, const double polar_uscale) {
  HIPPOMF.clear();
  gpu_mode=HIPPOMF.device->gpu_mode();
  double gpu_split=HIPPOMF.device->particle_split();
  int first_gpu=HIPPOMF.device->first_device();
  int last_gpu=HIPPOMF.device->last_device();
  int world_me=HIPPOMF.device->world_me();
  int gpu_rank=HIPPOMF.device->gpu_rank();
  int procs_per_gpu=HIPPOMF.device->procs_per_gpu();

  HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu);

  // only replica rank 0 with a valid screen prints progress messages
  bool message=false;
  if (HIPPOMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // world rank 0 initializes first (kernel compilation happens here)
  int init_ok=0;
  if (world_me==0)
    init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
                         host_pdamp, host_thole, host_dirdamp,
                         host_amtype2class, host_special_repel, host_special_disp,
                         host_special_mpole, host_special_polar_wscale,
                         host_special_polar_piscale, host_special_polar_pscale,
                         host_sizpr, host_dmppr, host_elepr,
                         host_csix, host_adisp, host_pcore, host_palpha,
                         nlocal, nall, max_nbors,
                         maxspecial, maxspecial15, cell_size, gpu_split,
                         screen, polar_dscale, polar_uscale);

  HIPPOMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // remaining ranks initialize in gpu_rank order, one wave per iteration,
  // separated by gpu_barrier() so initialization is serialized per GPU
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
                           host_pdamp, host_thole, host_dirdamp,
                           host_amtype2class, host_special_repel, host_special_disp,
                           host_special_mpole, host_special_polar_wscale,
                           host_special_polar_piscale, host_special_polar_pscale,
                           host_sizpr, host_dmppr, host_elepr,
                           host_csix, host_adisp, host_pcore, host_palpha,
                           nlocal, nall, max_nbors,
                           maxspecial, maxspecial15, cell_size, gpu_split,
                           screen, polar_dscale, polar_uscale);

    HIPPOMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    HIPPOMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||
|
||||
// Release all host and device data held by the HIPPO instance.
void hippo_gpu_clear() {
  HIPPOMF.clear();
}
|
||||
|
||||
// Forward to Hippo::precompute (neighbor/data setup for the time step).
// The uind/uinp/pval slots are passed as nullptr here; those arrays are
// transferred later by the individual compute calls.
int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
                           int *host_amgroup, double **host_rpole,
                           double ** /*host_uind*/, double ** /*host_uinp*/, double * /*host_pval*/,
                           double *sublo, double *subhi, tagint *tag,
                           int **nspecial, tagint **special,
                           int *nspecial15, tagint **special15,
                           const bool eflag_in, const bool vflag_in,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, double *host_q, double *boxlo, double *prd) {
  return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type,
                            host_amtype, host_amgroup, host_rpole,
                            nullptr, nullptr, nullptr, sublo, subhi, tag,
                            nspecial, special, nspecial15, special15,
                            eflag_in, vflag_in, eatom, vatom,
                            host_start, ilist, jnum, cpu_time,
                            success, host_q, boxlo, prd);
}
|
||||
|
||||
// Forward the Pauli repulsion computation (with device neighboring) to the
// HIPPO instance; see Hippo::compute_repulsion for parameter semantics.
void hippo_gpu_compute_repulsion(const int ago, const int inum_full,
                                 const int nall, double **host_x, int *host_type,
                                 int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double *sublo, double *subhi, tagint *tag, int **nspecial,
                                 tagint **special, int *nspecial15, tagint** special15,
                                 const bool eflag, const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
                                 int **ilist, int **jnum, const double cpu_time,
                                 bool &success, const double aewald, const double off2,
                                 double *host_q, double *boxlo, double *prd,
                                 double cut2, double c0, double c1, double c2,
                                 double c3, double c4, double c5, void **tep_ptr) {
  HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type,
                            host_amtype, host_amgroup, host_rpole, sublo, subhi,
                            tag, nspecial, special, nspecial15, special15,
                            eflag, vflag, eatom, vatom, host_start, ilist, jnum,
                            cpu_time, success, aewald, off2, host_q, boxlo, prd,
                            cut2, c0, c1, c2, c3, c4, c5, tep_ptr);
}
|
||||
|
||||
// Forward the real-space dispersion computation to the HIPPO instance.
void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                       double **host_rpole, const double aewald,
                                       const double off2) {
  HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole,
                                  aewald, off2);
}
|
||||
|
||||
// Forward the real-space multipole computation to the HIPPO instance;
// tep_ptr receives the host-side tep buffer on return.
void hippo_gpu_compute_multipole_real(const int ago, const int inum_full,
                                      const int nall, double **host_x, int *host_type,
                                      int *host_amtype, int *host_amgroup, double **host_rpole,
                                      double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial,
                                      tagint **special, int *nspecial15, tagint** special15,
                                      const bool eflag, const bool vflag, const bool eatom,
                                      const bool vatom, int &host_start,
                                      int **ilist, int **jnum, const double cpu_time,
                                      bool &success, const double aewald, const double felec, const double off2,
                                      double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
                                 host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi,
                                 tag, nspecial, special, nspecial15, special15,
                                 eflag, vflag, eatom, vatom, host_start, ilist, jnum,
                                 cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
}
|
||||
|
||||
// Forward the permanent-field (udirect2b) computation to the HIPPO
// instance; fieldp_ptr receives the packed field/fieldp host buffer.
void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2, void **fieldp_ptr) {
  HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole,
                            host_uind, host_uinp, host_pval,
                            aewald, off2, fieldp_ptr);
}
|
||||
|
||||
// Forward the induced-field (umutual2b) computation to the HIPPO instance.
// The device-to-host transfer of the result is deferred; retrieve it later
// via hippo_gpu_update_fieldp().
void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2, void **fieldp_ptr) {
  HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
                            aewald, off2, fieldp_ptr);
}
|
||||
|
||||
// Copy the packed field/fieldp buffer from device to host and hand the
// host pointer back through fieldp_ptr (see Hippo/BaseAmoeba::update_fieldp).
void hippo_gpu_update_fieldp(void **fieldp_ptr) {
  HIPPOMF.update_fieldp(fieldp_ptr);
}
|
||||
|
||||
// Forward the polar real-space computation (the last per-step kernel) to
// the HIPPO instance; tep_ptr receives the host-side tep buffer on return.
void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag_in, const bool vflag_in,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2,
                                  void **tep_ptr) {
  HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
                             eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
|
||||
|
||||
// Forward the k-space precompute (B-spline theta coefficients, grid
// indices and the brick-grid bounds) to the HIPPO instance.
void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
                                 const int nzlo_out, const int nzhi_out,
                                 const int nylo_out, const int nyhi_out,
                                 const int nxlo_out, const int nxhi_out) {
  HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2,
                            host_thetai3, igrid, nzlo_out, nzhi_out,
                            nylo_out, nyhi_out, nxlo_out, nxhi_out);
}
|
||||
|
||||
// Forward the fphi_uind (induced-dipole potential interpolation) step to
// the HIPPO instance.
void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
                         void **host_fdip_phi2, void **host_fdip_sum_phi) {
  HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi);
}
|
||||
|
||||
// Report host memory (in bytes) used by the HIPPO acceleration library.
double hippo_gpu_bytes() {
  return HIPPOMF.host_memory_usage();
}
|
||||
431
lib/gpu/lal_hippo_extra.h
Normal file
431
lib/gpu/lal_hippo_extra.h
Normal file
@ -0,0 +1,431 @@
|
||||
/// **************************************************************************
|
||||
// hippo_extra.h
|
||||
// -------------------
|
||||
// Trung Dac Nguyen
|
||||
//
|
||||
// Device code for hippo math routines
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// email : ndactrung@gmail.com
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifndef LAL_HIPPO_EXTRA_H
|
||||
#define LAL_HIPPO_EXTRA_H
|
||||
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#define MY_PI2 (numtyp)1.57079632679489661923
|
||||
#define MY_PI4 (numtyp)0.78539816339744830962
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
damprep generates coefficients for the Pauli repulsion
|
||||
damping function for powers of the interatomic distance
|
||||
|
||||
literature reference:
|
||||
|
||||
J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An
|
||||
Anisotropic, Atomic Multipole Model", Journal of Chemical Physics,
|
||||
150, 084104 (2019)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
|
||||
const numtyp rr3, const numtyp rr5, const numtyp rr7,
|
||||
const numtyp rr9, const numtyp rr11, const int rorder,
|
||||
const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11])
|
||||
{
|
||||
numtyp r3,r4;
|
||||
numtyp r5,r6,r7,r8;
|
||||
numtyp s,ds,d2s;
|
||||
numtyp d3s,d4s,d5s;
|
||||
numtyp dmpi2,dmpk2;
|
||||
numtyp dmpi22,dmpi23;
|
||||
numtyp dmpi24,dmpi25;
|
||||
numtyp dmpi26,dmpi27;
|
||||
numtyp dmpk22,dmpk23;
|
||||
numtyp dmpk24,dmpk25;
|
||||
numtyp dmpk26;
|
||||
numtyp eps,diff;
|
||||
numtyp expi,expk;
|
||||
numtyp dampi,dampk;
|
||||
numtyp pre,term,tmp;
|
||||
|
||||
// compute tolerance value for damping exponents
|
||||
|
||||
eps = (numtyp)0.001;
|
||||
diff = dmpi-dmpk; // fabs(dmpi-dmpk)
|
||||
if (diff < (numtyp)0) diff = -diff;
|
||||
|
||||
// treat the case where alpha damping exponents are equal
|
||||
|
||||
if (diff < eps) {
|
||||
r3 = r2 * r;
|
||||
r4 = r3 * r;
|
||||
r5 = r4 * r;
|
||||
r6 = r5 * r;
|
||||
r7 = r6 * r;
|
||||
dmpi2 = (numtyp)0.5 * dmpi;
|
||||
dampi = dmpi2 * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
dmpi22 = dmpi2 * dmpi2;
|
||||
dmpi23 = dmpi22 * dmpi2;
|
||||
dmpi24 = dmpi23 * dmpi2;
|
||||
dmpi25 = dmpi24 * dmpi2;
|
||||
dmpi26 = dmpi25 * dmpi2;
|
||||
pre = (numtyp)128.0;
|
||||
s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi;
|
||||
|
||||
ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0;
|
||||
d2s = dmpi24 * expi * r5 / (numtyp)9.0;
|
||||
d3s = dmpi25 * expi * r6 / (numtyp)45.0;
|
||||
d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0;
|
||||
if (rorder >= 11) {
|
||||
r8 = r7 * r;
|
||||
dmpi27 = dmpi2 * dmpi26;
|
||||
d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0;
|
||||
}
|
||||
|
||||
// treat the case where alpha damping exponents are unequal
|
||||
|
||||
} else {
|
||||
r3 = r2 * r;
|
||||
r4 = r3 * r;
|
||||
r5 = r4 * r;
|
||||
dmpi2 = (numtyp)0.5 * dmpi;
|
||||
dmpk2 = (numtyp)0.5 * dmpk;
|
||||
dampi = dmpi2 * r;
|
||||
dampk = dmpk2 * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
expk = ucl_exp(-dampk);
|
||||
dmpi22 = dmpi2 * dmpi2;
|
||||
dmpi23 = dmpi22 * dmpi2;
|
||||
dmpi24 = dmpi23 * dmpi2;
|
||||
dmpi25 = dmpi24 * dmpi2;
|
||||
dmpk22 = dmpk2 * dmpk2;
|
||||
dmpk23 = dmpk22 * dmpk2;
|
||||
dmpk24 = dmpk23 * dmpk2;
|
||||
dmpk25 = dmpk24 * dmpk2;
|
||||
term = dmpi22 - dmpk22;
|
||||
pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0);
|
||||
tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term;
|
||||
s = (dampi-tmp)*expk + (dampk+tmp)*expi;
|
||||
|
||||
ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term -
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
|
||||
(dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
|
||||
d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 -
|
||||
((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
|
||||
(dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 +
|
||||
((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expi;
|
||||
d3s = (dmpi2*dmpk23*r4/(numtyp)15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 -
|
||||
((numtyp)4.0/(numtyp)15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term -
|
||||
(numtyp)4.0*dmpi2*dmpk22*r/term - (numtyp)4.0/term*dmpi2*dmpk2) * expk +
|
||||
(dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 +
|
||||
((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term +
|
||||
(numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi;
|
||||
d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 +
|
||||
dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 -
|
||||
((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term -
|
||||
((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
|
||||
(dmpi24*dmpk2*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 +
|
||||
dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 +
|
||||
((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term +
|
||||
((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expi;
|
||||
|
||||
if (rorder >= 11) {
|
||||
r6 = r5 * r;
|
||||
dmpi26 = dmpi25 * dmpi2;
|
||||
dmpk26 = dmpk25 * dmpk2;
|
||||
d5s = (dmpi2*dmpk25*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi2*dmpk24*r5 +
|
||||
dmpi2*dmpk23*r4/(numtyp)21.0 + dmpi2*dmpk22*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 -
|
||||
((numtyp)4.0/(numtyp)945.0)*dmpi2*dmpk26*r5/term -
|
||||
((numtyp)4.0/(numtyp)63.0)*dmpi2*dmpk25*r4/term - ((numtyp)4.0/(numtyp)9.0)*dmpi2*dmpk24*r3/term -
|
||||
((numtyp)16.0/(numtyp)9.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
|
||||
(dmpi25*dmpk2*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi24*dmpk2*r5 +
|
||||
dmpi23*dmpk2*r4/(numtyp)21.0 + dmpi22*dmpk2*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 +
|
||||
((numtyp)4.0/(numtyp)945.0)*dmpi26*dmpk2*r5/term + ((numtyp)4.0/(numtyp)63.0)*dmpi25*dmpk2*r4/term +
|
||||
((numtyp)4.0/(numtyp)9.0)*dmpi24*dmpk2*r3/term + ((numtyp)16.0/(numtyp)9.0)*dmpi23*dmpk2*r2/term +
|
||||
(numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
|
||||
}
|
||||
}
|
||||
|
||||
// convert partial derivatives into full derivatives
|
||||
|
||||
s = s * rr1;
|
||||
ds = ds * rr3;
|
||||
d2s = d2s * rr5;
|
||||
d3s = d3s * rr7;
|
||||
d4s = d4s * rr9;
|
||||
d5s = d5s * rr11;
|
||||
dmpik[0] = (numtyp)0.5 * pre * s * s;
|
||||
dmpik[2] = pre * s * ds;
|
||||
dmpik[4] = pre * (s*d2s + ds*ds);
|
||||
dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s);
|
||||
dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s);
|
||||
|
||||
if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
damppole generates coefficients for the charge penetration
|
||||
damping function for powers of the interatomic distance
|
||||
|
||||
literature references:
|
||||
|
||||
L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the
|
||||
Effective Fragment Potential Method: Theory and Application to
|
||||
the Benzene Dimer", Journal of Computational Chemistry, 28,
|
||||
276-291 (2007) [Gordon f1 and f2 models]
|
||||
|
||||
J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and
|
||||
J. W. Ponder, "An Optimized Charge Penetration Model for Use with
|
||||
the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19,
|
||||
276-291 (2017)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
ucl_inline void damppole(const numtyp r, const int rorder,
|
||||
const numtyp alphai, const numtyp alphak,
|
||||
numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11])
|
||||
{
|
||||
numtyp termi,termk;
|
||||
numtyp termi2,termk2;
|
||||
numtyp alphai2,alphak2;
|
||||
numtyp eps,diff;
|
||||
numtyp expi,expk;
|
||||
numtyp dampi,dampk;
|
||||
numtyp dampi2,dampi3;
|
||||
numtyp dampi4,dampi5;
|
||||
numtyp dampi6,dampi7;
|
||||
numtyp dampi8;
|
||||
numtyp dampk2,dampk3;
|
||||
numtyp dampk4,dampk5;
|
||||
numtyp dampk6;
|
||||
|
||||
// compute tolerance and exponential damping factors
|
||||
|
||||
eps = (numtyp)0.001;
|
||||
diff = alphai-alphak;
|
||||
if (diff < (numtyp)0) diff = -diff;
|
||||
dampi = alphai * r;
|
||||
dampk = alphak * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
expk = ucl_exp(-dampk);
|
||||
|
||||
// core-valence charge penetration damping for Gordon f1
|
||||
|
||||
dampi2 = dampi * dampi;
|
||||
dampi3 = dampi * dampi2;
|
||||
dampi4 = dampi2 * dampi2;
|
||||
dampi5 = dampi2 * dampi3;
|
||||
dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi;
|
||||
dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
|
||||
dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
|
||||
dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
|
||||
dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
(numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi;
|
||||
if (diff < eps) {
|
||||
dmpk[0] = dmpi[0];
|
||||
dmpk[2] = dmpi[2];
|
||||
dmpk[4] = dmpi[4];
|
||||
dmpk[6] = dmpi[6];
|
||||
dmpk[8] = dmpi[8];
|
||||
} else {
|
||||
dampk2 = dampk * dampk;
|
||||
dampk3 = dampk * dampk2;
|
||||
dampk4 = dampk2 * dampk2;
|
||||
dampk5 = dampk2 * dampk3;
|
||||
dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk;
|
||||
dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
|
||||
dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
|
||||
dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk;
|
||||
dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
|
||||
(numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk;
|
||||
}
|
||||
|
||||
// valence-valence charge penetration damping for Gordon f1
|
||||
|
||||
if (diff < eps) {
|
||||
dampi6 = dampi3 * dampi3;
|
||||
dampi7 = dampi3 * dampi4;
|
||||
dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 +
|
||||
dampi3/(numtyp)48.0)*expi;
|
||||
dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
|
||||
(numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi;
|
||||
dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
|
||||
dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi;
|
||||
dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
|
||||
dampi7/(numtyp)5040.0)*expi;
|
||||
if (rorder >= 11) {
|
||||
dampi8 = dampi4 * dampi4;
|
||||
dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
|
||||
dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi;
|
||||
}
|
||||
|
||||
} else {
|
||||
alphai2 = alphai * alphai;
|
||||
alphak2 = alphak * alphak;
|
||||
termi = alphak2 / (alphak2-alphai2);
|
||||
termk = alphai2 / (alphai2-alphak2);
|
||||
termi2 = termi * termi;
|
||||
termk2 = termk * termk;
|
||||
dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi -
|
||||
termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk;
|
||||
dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
|
||||
termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
|
||||
dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
|
||||
termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk;
|
||||
dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
|
||||
dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi -
|
||||
termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk;
|
||||
dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
(numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi -
|
||||
termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
|
||||
(numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 +
|
||||
(numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 +
|
||||
(numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk;
|
||||
|
||||
if (rorder >= 11) {
|
||||
dampi6 = dampi3 * dampi3;
|
||||
dampk6 = dampk3 * dampk3;
|
||||
dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
(numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 +
|
||||
dampi6/(numtyp)1890.0)*expi -
|
||||
termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 +
|
||||
(numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 +
|
||||
dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 +
|
||||
dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
dampdir = direct field damping coefficents
|
||||
dampdir generates coefficients for the direct field damping
|
||||
function for powers of the interatomic distance
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk)
|
||||
{
|
||||
numtyp eps,diff;
|
||||
numtyp expi,expk;
|
||||
numtyp dampi,dampk;
|
||||
numtyp dampi2,dampk2;
|
||||
numtyp dampi3,dampk3;
|
||||
numtyp dampi4,dampk4;
|
||||
|
||||
// compute tolerance and exponential damping factors
|
||||
|
||||
eps = (numtyp)0.001;
|
||||
diff = alphai-alphak; // fabs(alphai-alphak);
|
||||
if (diff < (numtyp)0) diff = -diff;
|
||||
dampi = alphai * r;
|
||||
dampk = alphak * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
expk = ucl_exp(-dampk);
|
||||
|
||||
// core-valence charge penetration damping for Gordon f1 (HIPPO)
|
||||
|
||||
dampi2 = dampi * dampi;
|
||||
dampi3 = dampi * dampi2;
|
||||
dampi4 = dampi2 * dampi2;
|
||||
dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
|
||||
dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
|
||||
dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
|
||||
if (diff < eps) {
|
||||
dmpk[2] = dmpi[2];
|
||||
dmpk[4] = dmpi[4];
|
||||
dmpk[6] = dmpi[6];
|
||||
} else {
|
||||
dampk2 = dampk * dampk;
|
||||
dampk3 = dampk * dampk2;
|
||||
dampk4 = dampk2 * dampk2;
|
||||
dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
|
||||
dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
|
||||
dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/30.0)*expk;
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
dampmut = mutual field damping coefficents
|
||||
dampmut generates coefficients for the mutual field damping
|
||||
function for powers of the interatomic distance
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5])
|
||||
{
|
||||
numtyp termi,termk;
|
||||
numtyp termi2,termk2;
|
||||
numtyp alphai2,alphak2;
|
||||
numtyp eps,diff;
|
||||
numtyp expi,expk;
|
||||
numtyp dampi,dampk;
|
||||
numtyp dampi2,dampi3;
|
||||
numtyp dampi4,dampi5;
|
||||
numtyp dampk2,dampk3;
|
||||
|
||||
// compute tolerance and exponential damping factors
|
||||
|
||||
eps = (numtyp)0.001;
|
||||
diff = alphai-alphak; // fabs(alphai-alphak);
|
||||
if (diff < (numtyp)0) diff = -diff;
|
||||
dampi = alphai * r;
|
||||
dampk = alphak * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
expk = ucl_exp(-dampk);
|
||||
|
||||
// valence-valence charge penetration damping for Gordon f1 (HIPPO)
|
||||
|
||||
dampi2 = dampi * dampi;
|
||||
dampi3 = dampi * dampi2;
|
||||
if (diff < eps) {
|
||||
dampi4 = dampi2 * dampi2;
|
||||
dampi5 = dampi2 * dampi3;
|
||||
dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
|
||||
7.0*dampi3/(numtyp)48.0 + dampi4/48.0)*expi;
|
||||
dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
|
||||
} else {
|
||||
dampk2 = dampk * dampk;
|
||||
dampk3 = dampk * dampk2;
|
||||
alphai2 = alphai * alphai;
|
||||
alphak2 = alphak * alphak;
|
||||
termi = alphak2 / (alphak2-alphai2);
|
||||
termk = alphai2 / (alphai2-alphak2);
|
||||
termi2 = termi * termi;
|
||||
termk2 = termk * termk;
|
||||
dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
|
||||
termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
|
||||
dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
|
||||
termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk -
|
||||
(numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi -
|
||||
(numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -576,6 +576,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
time_nbor.stop();
|
||||
if (_time_device)
|
||||
time_nbor.add_to_total();
|
||||
|
||||
// on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial)
|
||||
// on the device, transpose the matrix (1-d array) for coalesced reads
|
||||
// dev_special[i][j] = the special i neighbor of atom j
|
||||
|
||||
time_transpose.start();
|
||||
const int b2x=_block_cell_2d;
|
||||
const int b2y=_block_cell_2d;
|
||||
@ -679,6 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
if (_cutoff < _cell_size) vadjust*=1.46;
|
||||
mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
|
||||
if (mn<33) mn+=3;
|
||||
|
||||
resize_max_neighbors<numtyp,acctyp>(mn,success);
|
||||
set_nbor_block_size(mn/2);
|
||||
if (!success)
|
||||
@ -831,6 +837,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
time_nbor.stop();
|
||||
}
|
||||
|
||||
// Launch the device transpose kernel so that "out" holds the transpose of
// the matrix stored in "in" (per the surrounding code, this reorders the
// special-neighbor data for coalesced device reads).
void Neighbor::transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
                         const int columns_in, const int rows_in)
{
  // square 2-d thread block; grid dimensions round up so every element
  // of the matrix is covered
  const int block_x = _block_cell_2d;
  const int block_y = _block_cell_2d;
  const int grid_x =
    static_cast<int>(ceil(static_cast<double>(columns_in)/block_x));
  const int grid_y =
    static_cast<int>(ceil(static_cast<double>(rows_in)/block_y));
  _shared->k_transpose.set_size(grid_x,grid_y,block_x,block_y);
  _shared->k_transpose.run(&out, &in, &columns_in, &rows_in);
}
|
||||
|
||||
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
|
||||
(double **x, const int inum, const int host_inum, const int nall,
|
||||
Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
|
||||
|
||||
@ -33,7 +33,7 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(USE_HIP)
|
||||
#if defined(USE_HIP) || defined(__APPLE__)
|
||||
#define LAL_USE_OLD_NEIGHBOR
|
||||
#endif
|
||||
|
||||
@ -259,6 +259,10 @@ class Neighbor {
|
||||
return o.str();
|
||||
}
|
||||
|
||||
/// Helper function
|
||||
void transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
|
||||
const int columns_in, const int rows_in);
|
||||
|
||||
private:
|
||||
NeighborShared *_shared;
|
||||
UCL_Device *dev;
|
||||
@ -289,15 +293,17 @@ class Neighbor {
|
||||
#endif
|
||||
|
||||
int _simd_size;
|
||||
inline void set_nbor_block_size(const int mn) {
|
||||
#ifdef LAL_USE_OLD_NEIGHBOR
|
||||
inline void set_nbor_block_size(const int mn) {
|
||||
int desired=mn/(2*_simd_size);
|
||||
desired*=_simd_size;
|
||||
if (desired<_simd_size) desired=_simd_size;
|
||||
else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build;
|
||||
_block_nbor_build=desired;
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
inline void set_nbor_block_size(const int) {}
|
||||
#endif
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -48,6 +48,19 @@ _texture_2d( pos_tex,int4);
|
||||
#define LAL_USE_OLD_NEIGHBOR
|
||||
#endif
|
||||
|
||||
/*
|
||||
compute the id of the cell where the atoms belong to
|
||||
x: atom coordinates
|
||||
cell_id: cell ids
|
||||
particle_id:
|
||||
boxlo[0-2]: the lower left corner of the local box
|
||||
ncell[xyz]: the number of cells in xyz dims
|
||||
i_cell_size is the inverse cell size
|
||||
inum = the number of the local atoms that are ported to the device
|
||||
nall = the number of the local+ghost atoms that are ported to the device
|
||||
cells_in_cutoff = the number of cells that are within the cutoff
|
||||
*/
|
||||
|
||||
__kernel void calc_cell_id(const numtyp4 *restrict x_,
|
||||
unsigned *restrict cell_id,
|
||||
int *restrict particle_id,
|
||||
@ -90,6 +103,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
|
||||
}
|
||||
}
|
||||
|
||||
// compute the number of atoms in each cell
|
||||
|
||||
__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
|
||||
int *restrict cell_counts,
|
||||
int nall, int ncell) {
|
||||
|
||||
@ -182,12 +182,15 @@
|
||||
#define ucl_cbrt cbrt
|
||||
#define ucl_ceil ceil
|
||||
#define ucl_abs fabs
|
||||
#define ucl_recip(x) ((numtyp)1.0/(x))
|
||||
#define ucl_rsqrt rsqrt
|
||||
#define ucl_sqrt sqrt
|
||||
#define ucl_recip(x) ((numtyp)1.0/(x))
|
||||
#define ucl_erfc erfc
|
||||
|
||||
#else
|
||||
|
||||
#define ucl_exp expf
|
||||
#define ucl_powr powf
|
||||
#define ucl_atan atanf
|
||||
#define ucl_cbrt cbrtf
|
||||
#define ucl_ceil ceilf
|
||||
@ -195,8 +198,7 @@
|
||||
#define ucl_recip(x) ((numtyp)1.0/(x))
|
||||
#define ucl_rsqrt rsqrtf
|
||||
#define ucl_sqrt sqrtf
|
||||
#define ucl_exp expf
|
||||
#define ucl_powr powf
|
||||
#define ucl_erfc erfcf
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -166,6 +166,7 @@
|
||||
#define ucl_cbrt cbrt
|
||||
#define ucl_ceil ceil
|
||||
#define ucl_abs fabs
|
||||
#define ucl_erfc erfc
|
||||
|
||||
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
|
||||
|
||||
@ -330,6 +331,10 @@
|
||||
#define NEIGHMASK 0x3FFFFFFF
|
||||
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
|
||||
|
||||
#define SBBITS15 29
|
||||
#define NEIGHMASK15 0x1FFFFFFF
|
||||
ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; };
|
||||
|
||||
// default to 32-bit smallint and other ints, 64-bit bigint:
|
||||
// same as defined in src/lmptype.h
|
||||
#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \
|
||||
|
||||
@ -150,7 +150,7 @@ double SWT::host_memory_usage() const {
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int SWT::loop(const int eflag, const int vflag, const int evatom,
|
||||
bool &success) {
|
||||
bool & /*success*/) {
|
||||
const int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// build the short neighbor list
|
||||
|
||||
@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4);
|
||||
} \
|
||||
}
|
||||
|
||||
// (SHUFFLE_AVAIL == 1)
|
||||
#else
|
||||
|
||||
#define local_allocate_acc_zeta()
|
||||
@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4);
|
||||
} \
|
||||
}
|
||||
|
||||
// EVFLAG == 0
|
||||
#else
|
||||
|
||||
#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \
|
||||
@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4);
|
||||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif // EVFLAG
|
||||
#endif // SHUFFLE_AVAIL
|
||||
|
||||
#ifdef LAL_SIMD_IP_SYNC
|
||||
#define t_per_atom t_per_atom_in
|
||||
|
||||
@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
|
||||
const double* costheta, const double* bigb,
|
||||
const double* big2b, const double* bigc)
|
||||
{
|
||||
int success;
|
||||
int success=0;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
|
||||
"k_vashishta_three_end","k_vashishta_short_nbor");
|
||||
@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const {
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int VashishtaT::loop(const int eflag, const int vflag, const int evatom,
|
||||
bool &success) {
|
||||
bool & /*success*/) {
|
||||
const int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// build the short neighbor list
|
||||
|
||||
@ -1,199 +0,0 @@
|
||||
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
|
||||
index 22af411f32..530510a0d1 100644
|
||||
--- a/lib/kokkos/Makefile.kokkos
|
||||
+++ b/lib/kokkos/Makefile.kokkos
|
||||
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Threads"
|
||||
# Options:
|
||||
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
|
||||
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
|
||||
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
|
||||
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
|
||||
# IBM: BGQ,Power7,Power8,Power9
|
||||
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
|
||||
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
|
||||
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
|
||||
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
|
||||
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
|
||||
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
|
||||
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
|
||||
|
||||
#SEK: This seems like a bug to me
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
|
||||
endif
|
||||
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
|
||||
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
|
||||
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
|
||||
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
|
||||
+ endif
|
||||
|
||||
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
|
||||
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
|
||||
index 88ddc48378..b83ced9243 100644
|
||||
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
|
||||
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
|
||||
@@ -102,6 +102,7 @@
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE80
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE86
|
||||
+#cmakedefine KOKKOS_ARCH_HOPPER90
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
|
||||
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
|
||||
index f56cef1651..2585a6a64c 100644
|
||||
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
|
||||
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
|
||||
@@ -74,6 +74,7 @@ int main() {
|
||||
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
|
||||
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
|
||||
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
|
||||
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
|
||||
default:
|
||||
std::cout << "Compute capability " << compute_capability
|
||||
<< " is not supported" << std::endl;
|
||||
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
|
||||
index ef16aad047..c1d76cceeb 100644
|
||||
--- a/lib/kokkos/cmake/kokkos_arch.cmake
|
||||
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
|
||||
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
|
||||
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
|
||||
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
|
||||
SET(KOKKOS_SHOW_HIP_ARCHS ON)
|
||||
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
|
||||
CHECK_CUDA_ARCH(TURING75 sm_75)
|
||||
CHECK_CUDA_ARCH(AMPERE80 sm_80)
|
||||
CHECK_CUDA_ARCH(AMPERE86 sm_86)
|
||||
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
|
||||
|
||||
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
|
||||
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
|
||||
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
|
||||
SET(KOKKOS_ARCH_AMPERE ON)
|
||||
ENDIF()
|
||||
|
||||
+IF (KOKKOS_ARCH_HOPPER90)
|
||||
+ SET(KOKKOS_ARCH_HOPPER ON)
|
||||
+ENDIF()
|
||||
+
|
||||
#Regardless of version, make sure we define the general architecture name
|
||||
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
|
||||
SET(KOKKOS_ARCH_VEGA ON)
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index 56f9117844..fcd4773dbc 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
|
||||
case 61: return 96;
|
||||
case 70:
|
||||
case 80:
|
||||
- case 86: return 8;
|
||||
+ case 86:
|
||||
+ case 90: return 8;
|
||||
case 75: return 32;
|
||||
default:
|
||||
Kokkos::Impl::throw_runtime_exception(
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
|
||||
index 40a263561f..8c40ebd60d 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
|
||||
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
|
||||
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
|
||||
|
||||
#if CUDA_VERSION >= 11010 && \
|
||||
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
|
||||
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
|
||||
index f9451ecfe6..2ce1efb98c 100644
|
||||
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
|
||||
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
|
||||
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
|
||||
|
||||
struct OpenACC_Traits {
|
||||
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_AMPERE)
|
||||
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
|
||||
static constexpr acc_device_t dev_type = acc_device_nvidia;
|
||||
static constexpr bool may_fallback_to_host = false;
|
||||
#else
|
||||
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
|
||||
index a9bc085912..27ee1d4232 100644
|
||||
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
|
||||
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
|
||||
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
|
||||
|
||||
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
|
||||
// from Pascal and upwards.
|
||||
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
|
||||
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
|
||||
+ defined(KOKKOS_ARCH_HOPPER)
|
||||
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
|
||||
omp_set_num_teams(512);
|
||||
#endif
|
||||
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
|
||||
index 840db4327c..7e5addbc5b 100644
|
||||
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
|
||||
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
|
||||
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
|
||||
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
|
||||
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
|
||||
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
|
||||
- !defined(KOKKOS_ARCH_AMPERE)
|
||||
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
|
||||
if (!settings.has_device_id() && gpu_devices.empty()) {
|
||||
Impl::SYCLInternal::singleton().initialize(sycl::device());
|
||||
return;
|
||||
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
|
||||
index 5ac7d8af30..ba101f699e 100644
|
||||
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
|
||||
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
|
||||
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
|
||||
return std::min({
|
||||
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
|
||||
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
|
||||
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
|
||||
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
|
||||
+ defined(KOKKOS_ARCH_HOPPER)
|
||||
256,
|
||||
#endif
|
||||
max_threads_for_memory
|
||||
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
|
||||
return std::min<int>({
|
||||
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
|
||||
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
|
||||
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
|
||||
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
|
||||
+ defined(KOKKOS_ARCH_HOPPER)
|
||||
256,
|
||||
#endif
|
||||
max_threads_for_memory
|
||||
@ -1,523 +0,0 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index fcd4773dbc..30b6958a67 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
|
||||
// NOTE these number can be obtained several ways:
|
||||
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
|
||||
// "Compute Capability" first and check what is the smallest "Shared Memory
|
||||
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
|
||||
return 0;
|
||||
}() * 1024;
|
||||
}
|
||||
+
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
index 5811498e01..e22eb3b842 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
|
||||
}
|
||||
#endif
|
||||
|
||||
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
|
||||
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
|
||||
-#else
|
||||
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
|
||||
-#endif
|
||||
-
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
index b7a80ad84f..5c4c3a7d39 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
@@ -93,10 +93,6 @@ namespace Impl {
|
||||
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
|
||||
// function qualifier which could be used to improve performance.
|
||||
//----------------------------------------------------------------------------
|
||||
-// Maximize L1 cache and minimize shared memory:
|
||||
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
|
||||
-// For 2.0 capability: 48 KB L1 and 16 KB shared
|
||||
-//----------------------------------------------------------------------------
|
||||
|
||||
template <class DriverType>
|
||||
__global__ static void cuda_parallel_launch_constant_memory() {
|
||||
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
|
||||
}
|
||||
}
|
||||
|
||||
-// This function needs to be template on DriverType and LaunchBounds
|
||||
+// These functions needs to be template on DriverType and LaunchBounds
|
||||
// so that the static bool is unique for each type combo
|
||||
// KernelFuncPtr does not necessarily contain that type information.
|
||||
+
|
||||
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
|
||||
-inline void configure_shmem_preference(KernelFuncPtr const& func,
|
||||
- bool prefer_shmem) {
|
||||
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
|
||||
+ const KernelFuncPtr& func) {
|
||||
+ // Only call cudaFuncGetAttributes once for each unique kernel
|
||||
+ // by leveraging static variable initialization rules
|
||||
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
|
||||
+ cudaFuncAttributes attr;
|
||||
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
|
||||
+ return attr;
|
||||
+ };
|
||||
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
|
||||
+ return func_attr;
|
||||
+}
|
||||
+
|
||||
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
|
||||
+inline void configure_shmem_preference(const KernelFuncPtr& func,
|
||||
+ const cudaDeviceProp& device_props,
|
||||
+ const size_t block_size, int& shmem,
|
||||
+ const size_t occupancy) {
|
||||
#ifndef KOKKOS_ARCH_KEPLER
|
||||
- // On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
+
|
||||
+ const auto& func_attr =
|
||||
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
|
||||
+
|
||||
+ // Compute limits for number of blocks due to registers/SM
|
||||
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
|
||||
+ const size_t regs_per_thread = func_attr.numRegs;
|
||||
+ // The granularity of register allocation is chunks of 256 registers per warp
|
||||
+ // -> 8 registers per thread
|
||||
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||
+ const size_t max_blocks_regs =
|
||||
+ regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
+
|
||||
+ // Compute how many threads per sm we actually want
|
||||
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
|
||||
+ // only allocate multiples of warp size
|
||||
+ const size_t num_threads_desired =
|
||||
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
|
||||
+ // Get close to the desired occupancy,
|
||||
+ // don't undershoot by much but also don't allocate a whole new block just
|
||||
+ // because one is a few threads over otherwise.
|
||||
+ size_t num_blocks_desired =
|
||||
+ (num_threads_desired + block_size * 0.8) / block_size;
|
||||
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
|
||||
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
|
||||
+
|
||||
+ // Calculate how much shared memory we need per block
|
||||
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
|
||||
+
|
||||
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
|
||||
+ // If we want to lower occupancy we have to make sure we request at least that
|
||||
+ // much in aggregate over all blocks, so that shared memory actually becomes a
|
||||
+ // limiting factor for occupancy
|
||||
+ constexpr size_t min_shmem_size_per_sm = 8192;
|
||||
+ if ((occupancy < 100) &&
|
||||
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
|
||||
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
|
||||
+ // Need to set the caller's shmem variable so that the
|
||||
+ // kernel launch uses the correct dynamic shared memory request
|
||||
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
|
||||
+ }
|
||||
+
|
||||
+ // Compute the carveout fraction we need based on occupancy
|
||||
+ // Use multiples of 8kB
|
||||
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
|
||||
+ size_t carveout = shmem_per_block == 0
|
||||
+ ? 0
|
||||
+ : 100 *
|
||||
+ (((num_blocks_desired * shmem_per_block +
|
||||
+ min_shmem_size_per_sm - 1) /
|
||||
+ min_shmem_size_per_sm) *
|
||||
+ min_shmem_size_per_sm) /
|
||||
+ max_shmem_per_sm;
|
||||
+ if (carveout > 100) carveout = 100;
|
||||
+
|
||||
+ // Set the carveout, but only call it once per kernel or when it changes
|
||||
auto set_cache_config = [&] {
|
||||
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
- func,
|
||||
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
|
||||
- return prefer_shmem;
|
||||
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
|
||||
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
|
||||
+ return carveout;
|
||||
};
|
||||
- static bool cache_config_preference_cached = set_cache_config();
|
||||
- if (cache_config_preference_cached != prefer_shmem) {
|
||||
+ // Store the value in a static variable so we only reset if needed
|
||||
+ static size_t cache_config_preference_cached = set_cache_config();
|
||||
+ if (cache_config_preference_cached != carveout) {
|
||||
cache_config_preference_cached = set_cache_config();
|
||||
}
|
||||
#else
|
||||
// Use the parameters so we don't get a warning
|
||||
(void)func;
|
||||
- (void)prefer_shmem;
|
||||
+ (void)device_props;
|
||||
+ (void)block_size;
|
||||
+ (void)occupancy;
|
||||
#endif
|
||||
}
|
||||
|
||||
-template <class Policy>
|
||||
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
|
||||
-modify_launch_configuration_if_desired_occupancy_is_specified(
|
||||
- Policy const& policy, cudaDeviceProp const& properties,
|
||||
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
|
||||
- bool& prefer_shmem) {
|
||||
- int const block_size = block.x * block.y * block.z;
|
||||
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
|
||||
-
|
||||
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
|
||||
- size_t const static_shmem = attributes.sharedSizeBytes;
|
||||
-
|
||||
- // round to nearest integer and avoid division by zero
|
||||
- int active_blocks = std::max(
|
||||
- 1, static_cast<int>(std::round(
|
||||
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
|
||||
- block_size * desired_occupancy / 100)));
|
||||
- int const dynamic_shmem =
|
||||
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
|
||||
-
|
||||
- if (dynamic_shmem > shmem) {
|
||||
- shmem = dynamic_shmem;
|
||||
- prefer_shmem = false;
|
||||
- }
|
||||
-}
|
||||
-
|
||||
-template <class Policy>
|
||||
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
|
||||
-modify_launch_configuration_if_desired_occupancy_is_specified(
|
||||
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
|
||||
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
|
||||
-
|
||||
// </editor-fold> end Some helper functions for launch code readability }}}1
|
||||
//==============================================================================
|
||||
|
||||
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
|
||||
inline static void create_parallel_launch_graph_node(
|
||||
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
|
||||
- CudaInternal const* cuda_instance, bool prefer_shmem) {
|
||||
+ CudaInternal const* cuda_instance) {
|
||||
//----------------------------------------
|
||||
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
|
||||
KOKKOS_EXPECTS(bool(graph));
|
||||
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
- base_t::get_kernel_func(), prefer_shmem);
|
||||
+ if (DriverType::Policy::
|
||||
+ experimental_contains_desired_occupancy) {
|
||||
+ int desired_occupancy =
|
||||
+ driver.get_policy().impl_get_desired_occupancy().value();
|
||||
+ size_t block_size = block.x * block.y * block.z;
|
||||
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
+ shmem, desired_occupancy);
|
||||
+ }
|
||||
|
||||
void const* args[] = {&driver};
|
||||
|
||||
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
|
||||
inline static void create_parallel_launch_graph_node(
|
||||
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
|
||||
- CudaInternal const* cuda_instance, bool prefer_shmem) {
|
||||
+ CudaInternal const* cuda_instance) {
|
||||
//----------------------------------------
|
||||
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
|
||||
KOKKOS_EXPECTS(bool(graph));
|
||||
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
- base_t::get_kernel_func(), prefer_shmem);
|
||||
+ if constexpr (DriverType::Policy::
|
||||
+ experimental_contains_desired_occupancy) {
|
||||
+ int desired_occupancy =
|
||||
+ driver.get_policy().impl_get_desired_occupancy().value();
|
||||
+ size_t block_size = block.x * block.y * block.z;
|
||||
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
+ shmem, desired_occupancy);
|
||||
+ }
|
||||
|
||||
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
|
||||
|
||||
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
|
||||
inline static void create_parallel_launch_graph_node(
|
||||
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
|
||||
- CudaInternal const* cuda_instance, bool prefer_shmem) {
|
||||
+ CudaInternal const* cuda_instance) {
|
||||
// Just use global memory; coordinating through events to share constant
|
||||
// memory with the non-graph interface is not really reasonable since
|
||||
// events don't work with Graphs directly, and this would anyway require
|
||||
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
DriverType, LaunchBounds,
|
||||
Experimental::CudaLaunchMechanism::GlobalMemory>;
|
||||
global_launch_impl_t::create_parallel_launch_graph_node(
|
||||
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
|
||||
+ driver, grid, block, shmem, cuda_instance);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
|
||||
|
||||
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
|
||||
const dim3& block, int shmem,
|
||||
- const CudaInternal* cuda_instance,
|
||||
- bool prefer_shmem) {
|
||||
+ const CudaInternal* cuda_instance) {
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
// Prevent multiple threads to simultaneously set the cache configuration
|
||||
// preference and launch the same kernel
|
||||
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
|
||||
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
|
||||
- // If a desired occupancy is specified, we compute how much shared memory
|
||||
- // to ask for to achieve that occupancy, assuming that the cache
|
||||
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
|
||||
- // shared memory computed is actually smaller than `shmem` we overwrite
|
||||
- // `shmem` and set `prefer_shmem` to `false`.
|
||||
- modify_launch_configuration_if_desired_occupancy_is_specified(
|
||||
- driver.get_policy(), cuda_instance->m_deviceProp,
|
||||
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
|
||||
-
|
||||
- Impl::configure_shmem_preference<
|
||||
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
- base_t::get_kernel_func(), prefer_shmem);
|
||||
+ if (DriverType::Policy::
|
||||
+ experimental_contains_desired_occupancy) {
|
||||
+ int desired_occupancy =
|
||||
+ driver.get_policy().impl_get_desired_occupancy().value();
|
||||
+ size_t block_size = block.x * block.y * block.z;
|
||||
+ Impl::configure_shmem_preference<
|
||||
+ DriverType,
|
||||
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
+ shmem, desired_occupancy);
|
||||
+ }
|
||||
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
|
||||
}
|
||||
|
||||
static cudaFuncAttributes get_cuda_func_attributes() {
|
||||
- // Race condition inside of cudaFuncGetAttributes if the same address is
|
||||
- // given requires using a local variable as input instead of a static Rely
|
||||
- // on static variable initialization to make sure only one thread executes
|
||||
- // the code and the result is visible.
|
||||
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
|
||||
- cudaFuncAttributes attr_tmp;
|
||||
- KOKKOS_IMPL_CUDA_SAFE_CALL(
|
||||
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
|
||||
- return attr_tmp;
|
||||
- };
|
||||
- static cudaFuncAttributes attr = wrap_get_attributes();
|
||||
- return attr;
|
||||
+ return get_cuda_kernel_func_attributes<
|
||||
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
+ base_t::get_kernel_func());
|
||||
}
|
||||
};
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
index e586bb4cc6..0e348c092a 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
maxblocks[1]),
|
||||
1);
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 3) {
|
||||
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
|
||||
KOKKOS_ASSERT(block.x > 0);
|
||||
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 4) {
|
||||
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
|
||||
// threadIdx.z
|
||||
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 5) {
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
|
||||
// threadIdx.z
|
||||
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 6) {
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
|
||||
// threadIdx.z
|
||||
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else {
|
||||
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
|
||||
}
|
||||
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
||||
|
||||
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
if (!m_result_ptr_device_accessible) {
|
||||
if (m_result_ptr) {
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
|
||||
index ac160f8fe2..d1031751c2 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
|
||||
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
#endif
|
||||
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
|
||||
}
|
||||
|
||||
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
|
||||
|
||||
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
if (!m_result_ptr_device_accessible) {
|
||||
if (m_result_ptr) {
|
||||
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
m_final = false;
|
||||
CudaParallelLaunch<ParallelScan, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
}
|
||||
#endif
|
||||
m_final = true;
|
||||
CudaParallelLaunch<ParallelScan, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
|
||||
m_final = false;
|
||||
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
}
|
||||
#endif
|
||||
m_final = true;
|
||||
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
const int size = Analysis::value_size(m_functor);
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
|
||||
index cdd16085b3..ea9430b812 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
|
||||
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
*this, grid, block, shmem_size_total,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- true); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
}
|
||||
|
||||
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
|
||||
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
|
||||
*this, grid, block, shmem_size_total,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- true); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
if (!m_result_ptr_device_accessible) {
|
||||
m_policy.space().fence(
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
|
||||
index 34d4bef9fd..178012431c 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
|
||||
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
|
||||
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
|
||||
// function qualifier which could be used to improve performance.
|
||||
//----------------------------------------------------------------------------
|
||||
-// Maximize shared memory and minimize L1 cache:
|
||||
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
|
||||
-// For 2.0 capability: 48 KB shared and 16 KB L1
|
||||
-//----------------------------------------------------------------------------
|
||||
-//----------------------------------------------------------------------------
|
||||
/*
|
||||
* Algorithmic constraints:
|
||||
* (a) blockDim.y <= 1024
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
|
||||
index fb3a6b138f..a12378a891 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
|
||||
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
|
||||
const int shared = 0;
|
||||
|
||||
Kokkos::Impl::CudaParallelLaunch<Self>(
|
||||
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
|
||||
}
|
||||
|
||||
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
@ -1,46 +0,0 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index 30b6958a67..b94f053272 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
-// NOTE these number can be obtained several ways:
|
||||
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
|
||||
-// "Compute Capability" first and check what is the smallest "Shared Memory
|
||||
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
|
||||
-// bytes is then to be found below in the summary.
|
||||
-// * Another option would be to look for the information in the "Tuning
|
||||
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
|
||||
-// the "Shared Memory" section (more tedious)
|
||||
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
|
||||
- int const compute_capability = properties.major * 10 + properties.minor;
|
||||
- return [compute_capability]() {
|
||||
- switch (compute_capability) {
|
||||
- case 30:
|
||||
- case 32:
|
||||
- case 35: return 16;
|
||||
- case 37: return 80;
|
||||
- case 50:
|
||||
- case 53:
|
||||
- case 60:
|
||||
- case 62: return 64;
|
||||
- case 52:
|
||||
- case 61: return 96;
|
||||
- case 70:
|
||||
- case 80:
|
||||
- case 86:
|
||||
- case 90: return 8;
|
||||
- case 75: return 32;
|
||||
- default:
|
||||
- Kokkos::Impl::throw_runtime_exception(
|
||||
- "Unknown device in cuda block size deduction");
|
||||
- }
|
||||
- return 0;
|
||||
- }() * 1024;
|
||||
-}
|
||||
-
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
@ -1,204 +0,0 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index b94f053272..252c13c524 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -53,17 +53,69 @@
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
+inline int cuda_warp_per_sm_allocation_granularity(
|
||||
+ cudaDeviceProp const& properties) {
|
||||
+ // Allocation granularity of warps in each sm
|
||||
+ switch (properties.major) {
|
||||
+ case 3:
|
||||
+ case 5:
|
||||
+ case 7:
|
||||
+ case 8:
|
||||
+ case 9: return 4;
|
||||
+ case 6: return (properties.minor == 0 ? 2 : 4);
|
||||
+ default:
|
||||
+ throw_runtime_exception(
|
||||
+ "Unknown device in cuda warp per sm allocation granularity");
|
||||
+ return 0;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+inline int cuda_max_warps_per_sm_registers(
|
||||
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
|
||||
+ // Maximum number of warps per sm as a function of register counts,
|
||||
+ // subject to the constraint that warps are allocated with a fixed granularity
|
||||
+ int const max_regs_per_block = properties.regsPerBlock;
|
||||
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
|
||||
+ int const warp_granularity =
|
||||
+ cuda_warp_per_sm_allocation_granularity(properties);
|
||||
+ // The granularity of register allocation is chunks of 256 registers per warp,
|
||||
+ // which implies a need to over-allocate, so we round up
|
||||
+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
|
||||
+
|
||||
+ // The maximum number of warps per SM is constrained from above by register
|
||||
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
|
||||
+ // finite granularity, we need to round down.
|
||||
+ int const max_warps_per_sm =
|
||||
+ warp_granularity *
|
||||
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
|
||||
+
|
||||
+ return max_warps_per_sm;
|
||||
+}
|
||||
+
|
||||
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
|
||||
cudaFuncAttributes const& attributes,
|
||||
int block_size, size_t dynamic_shmem) {
|
||||
- // Limits due do registers/SM
|
||||
+ // Limits due to registers/SM
|
||||
int const regs_per_sm = properties.regsPerMultiprocessor;
|
||||
int const regs_per_thread = attributes.numRegs;
|
||||
// The granularity of register allocation is chunks of 256 registers per warp
|
||||
// -> 8 registers per thread
|
||||
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||
- int const max_blocks_regs =
|
||||
- regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
+
|
||||
+ // Compute the maximum number of warps as a function of the number of
|
||||
+ // registers
|
||||
+ int const max_warps_per_sm_registers =
|
||||
+ cuda_max_warps_per_sm_registers(properties, attributes);
|
||||
+
|
||||
+ // Constrain the number of blocks to respect the maximum number of warps per
|
||||
+ // SM On face value this should be an equality, but due to the warp
|
||||
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
|
||||
+ // left-hand-side of this comparison can overshoot what the hardware allows
|
||||
+ // based on register counts alone
|
||||
+ while ((max_blocks_regs * block_size / properties.warpSize) >
|
||||
+ max_warps_per_sm_registers)
|
||||
+ max_blocks_regs--;
|
||||
|
||||
// Limits due to shared memory/SM
|
||||
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
|
||||
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
+template <class LaunchBounds>
|
||||
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
|
||||
+ LaunchBounds) {
|
||||
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
|
||||
+
|
||||
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
|
||||
+ // memory
|
||||
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
|
||||
+
|
||||
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
|
||||
+ LaunchBounds{});
|
||||
+}
|
||||
+
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
index 5c4c3a7d39..170183ca0a 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
|
||||
// The granularity of register allocation is chunks of 256 registers per warp
|
||||
// -> 8 registers per thread
|
||||
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||
- const size_t max_blocks_regs =
|
||||
+ size_t max_blocks_regs =
|
||||
regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
|
||||
+ // Compute the maximum number of warps as a function of the number of
|
||||
+ // registers
|
||||
+ const size_t max_warps_per_sm_registers =
|
||||
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
|
||||
+
|
||||
+ // Constrain the number of blocks to respect the maximum number of warps per
|
||||
+ // SM On face value this should be an equality, but due to the warp
|
||||
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
|
||||
+ // left-hand-side of this comparison can overshoot what the hardware allows
|
||||
+ // based on register counts alone
|
||||
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
|
||||
+ max_warps_per_sm_registers)
|
||||
+ max_blocks_regs--;
|
||||
+
|
||||
// Compute how many threads per sm we actually want
|
||||
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
|
||||
// only allocate multiples of warp size
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
index 0e348c092a..7e4f62f12e 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
@@ -67,6 +67,34 @@
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
+template <typename ParallelType, typename Policy, typename LaunchBounds>
|
||||
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
|
||||
+ cudaFuncAttributes attr =
|
||||
+ CudaParallelLaunch<ParallelType,
|
||||
+ LaunchBounds>::get_cuda_func_attributes();
|
||||
+ auto const& prop = pol.space().cuda_device_prop();
|
||||
+
|
||||
+ // Limits due to registers/SM, MDRange doesn't have
|
||||
+ // shared memory constraints
|
||||
+ int const optimal_block_size =
|
||||
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
|
||||
+
|
||||
+ // Compute how many blocks of this size we can launch, based on warp
|
||||
+ // constraints
|
||||
+ int const max_warps_per_sm_registers =
|
||||
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
|
||||
+ int const max_num_threads_from_warps =
|
||||
+ max_warps_per_sm_registers * prop.warpSize;
|
||||
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
|
||||
+
|
||||
+ // Compute the total number of threads
|
||||
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
|
||||
+
|
||||
+ return std::min(
|
||||
+ max_threads_per_sm,
|
||||
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
|
||||
+}
|
||||
+
|
||||
template <class FunctorType, class... Traits>
|
||||
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
public:
|
||||
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
public:
|
||||
template <typename Policy, typename Functor>
|
||||
static int max_tile_size_product(const Policy& pol, const Functor&) {
|
||||
- cudaFuncAttributes attr =
|
||||
- CudaParallelLaunch<ParallelFor,
|
||||
- LaunchBounds>::get_cuda_func_attributes();
|
||||
- auto const& prop = pol.space().cuda_device_prop();
|
||||
- // Limits due to registers/SM, MDRange doesn't have
|
||||
- // shared memory constraints
|
||||
- int const regs_per_sm = prop.regsPerMultiprocessor;
|
||||
- int const regs_per_thread = attr.numRegs;
|
||||
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
|
||||
- return std::min(
|
||||
- max_threads_per_sm,
|
||||
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
|
||||
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
|
||||
}
|
||||
Policy const& get_policy() const { return m_rp; }
|
||||
inline __device__ void operator()() const {
|
||||
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
||||
public:
|
||||
template <typename Policy, typename Functor>
|
||||
static int max_tile_size_product(const Policy& pol, const Functor&) {
|
||||
- cudaFuncAttributes attr =
|
||||
- CudaParallelLaunch<ParallelReduce,
|
||||
- LaunchBounds>::get_cuda_func_attributes();
|
||||
- auto const& prop = pol.space().cuda_device_prop();
|
||||
- // Limits due do registers/SM
|
||||
- int const regs_per_sm = prop.regsPerMultiprocessor;
|
||||
- int const regs_per_thread = attr.numRegs;
|
||||
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
|
||||
- return std::min(
|
||||
- max_threads_per_sm,
|
||||
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
|
||||
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
|
||||
}
|
||||
Policy const& get_policy() const { return m_policy; }
|
||||
inline __device__ void exec_range(reference_type update) const {
|
||||
@ -1,63 +0,0 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
index 170183ca0a..ba43e362bb 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
if (DriverType::Policy::
|
||||
experimental_contains_desired_occupancy) {
|
||||
+ /*
|
||||
int desired_occupancy =
|
||||
driver.get_policy().impl_get_desired_occupancy().value();
|
||||
size_t block_size = block.x * block.y * block.z;
|
||||
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
- shmem, desired_occupancy);
|
||||
+ shmem, desired_occupancy);*/
|
||||
+ Kokkos::Impl::throw_runtime_exception(
|
||||
+ std::string("Cuda graph node creation FAILED:"
|
||||
+ " occupancy requests are currently broken."));
|
||||
}
|
||||
|
||||
void const* args[] = {&driver};
|
||||
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
- if constexpr (DriverType::Policy::
|
||||
+ if (DriverType::Policy::
|
||||
experimental_contains_desired_occupancy) {
|
||||
- int desired_occupancy =
|
||||
+ /*int desired_occupancy =
|
||||
driver.get_policy().impl_get_desired_occupancy().value();
|
||||
size_t block_size = block.x * block.y * block.z;
|
||||
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
- shmem, desired_occupancy);
|
||||
+ shmem, desired_occupancy);*/
|
||||
+ Kokkos::Impl::throw_runtime_exception(
|
||||
+ std::string("Cuda graph node creation FAILED:"
|
||||
+ " occupancy requests are currently broken."));
|
||||
}
|
||||
|
||||
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
|
||||
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
|
||||
|
||||
if (DriverType::Policy::
|
||||
experimental_contains_desired_occupancy) {
|
||||
- int desired_occupancy =
|
||||
+ /*int desired_occupancy =
|
||||
driver.get_policy().impl_get_desired_occupancy().value();
|
||||
size_t block_size = block.x * block.y * block.z;
|
||||
Impl::configure_shmem_preference<
|
||||
DriverType,
|
||||
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
- shmem, desired_occupancy);
|
||||
+ shmem, desired_occupancy);*/
|
||||
+ Kokkos::Impl::throw_runtime_exception(
|
||||
+ std::string("Cuda graph node creation FAILED:"
|
||||
+ " occupancy requests are currently broken."));
|
||||
}
|
||||
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
30
potentials/HGa.msmeam
Normal file
30
potentials/HGa.msmeam
Normal file
@ -0,0 +1,30 @@
|
||||
bkgd_dyn = 1
|
||||
emb_lin_neg = 1
|
||||
augt1=0
|
||||
ialloy=1
|
||||
rc = 5.9
|
||||
#H
|
||||
attrac(1,1)=0.460
|
||||
repuls(1,1)=0.460
|
||||
Cmin(1,1,1)=1.3 # PuMS
|
||||
Cmax(1,1,1)= 2.80
|
||||
nn2(1,1)=1
|
||||
#Ga
|
||||
rho0(2) = 0.6
|
||||
attrac(2,2)=0.097
|
||||
repuls(2,2)=0.097
|
||||
nn2(2,2)=1
|
||||
#HGa
|
||||
attrac(1,2)=0.300
|
||||
repuls(1,2)=0.300
|
||||
lattce(1,2)=l12
|
||||
re(1,2)=3.19
|
||||
delta(1,2)=-0.48
|
||||
alpha(1,2)=6.6
|
||||
Cmin(1,1,2)=2.0
|
||||
Cmin(2,1,2)= 2.0
|
||||
Cmin(1,2,1)=2.0
|
||||
Cmin(2,2,1) = 1.4
|
||||
Cmin(1,2,2) = 1.4
|
||||
Cmin(1,1,2) = 1.4
|
||||
nn2(1,2)=1
|
||||
14
potentials/library.msmeam
Normal file
14
potentials/library.msmeam
Normal file
@ -0,0 +1,14 @@
|
||||
# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010
|
||||
# ms-meam data format May 2010
|
||||
# elt lat z ielement atwt
|
||||
# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub
|
||||
# - t0 t1 t2 t3 t1m t2m t3m rozero ibar
|
||||
# NOTE: leading character cannot be a space
|
||||
|
||||
'H' 'dim' 1.0 1 1.0079
|
||||
2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50
|
||||
1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0
|
||||
|
||||
'Ga4' 'fcc' 12.0 31 69.723
|
||||
4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97
|
||||
1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0
|
||||
@ -32,7 +32,7 @@ if not pylib.Py_IsInitialized():
|
||||
else:
|
||||
from .loader import load_model, load_unified, activate_mliappy
|
||||
try:
|
||||
from .loader import load_model_kokkos, activate_mliappy_kokkos
|
||||
from .loader import load_model_kokkos, load_unified_kokkos, activate_mliappy_kokkos
|
||||
except Exception as ee:
|
||||
# ignore import error, it means that the KOKKOS package was not included in LAMMPS
|
||||
pass
|
||||
|
||||
@ -75,7 +75,7 @@ def activate_mliappy(lmp):
|
||||
def activate_mliappy_kokkos(lmp):
|
||||
try:
|
||||
library = lmp.lib
|
||||
module_names = ["mliap_model_python_couple_kokkos"]
|
||||
module_names = ["mliap_model_python_couple_kokkos", "mliap_unified_couple_kokkos"]
|
||||
api_version = library.lammps_python_api_version()
|
||||
|
||||
for module_name in module_names:
|
||||
@ -118,3 +118,12 @@ def load_unified(model):
|
||||
) from ie
|
||||
mliap_unified_couple.load_from_python(model)
|
||||
|
||||
def load_unified_kokkos(model):
|
||||
try:
|
||||
import mliap_unified_couple_kokkos
|
||||
except ImportError as ie:
|
||||
raise ImportError("ML-IAP python module must be activated before loading\n"
|
||||
"the pair style. Call lammps.mliap.activate_mliappy(lmp)."
|
||||
) from ie
|
||||
mliap_unified_couple_kokkos.load_from_python(model)
|
||||
|
||||
|
||||
@ -22,6 +22,7 @@
|
||||
#include "memory.h"
|
||||
#include "neighbor.h"
|
||||
#include "remap_wrap.h"
|
||||
#include "timer.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
@ -326,15 +327,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
|
||||
cfft[n++] = ZEROF;
|
||||
}
|
||||
|
||||
double time0,time1;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// perform forward FFT
|
||||
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
time1 = platform::walltime();
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
FFT_SCALAR scale = 1.0/nfft_global;
|
||||
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
|
||||
}
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"PRE Convo / POST FFT");
|
||||
debug_file(CFFT1,"pre.convo.post.fft");
|
||||
@ -382,15 +391,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
|
||||
debug_scalar(FFT,"PRE Convo / POST Remap");
|
||||
debug_file(FFT,"pre.convo.post.remap");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// perform forward FFT
|
||||
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
time1 = platform::walltime();
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
FFT_SCALAR scale = 1.0/nfft_global;
|
||||
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
|
||||
}
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"PRE Convo / POST FFT");
|
||||
debug_file(CFFT1,"pre.convo.post.fft");
|
||||
@ -423,7 +441,16 @@ void *AmoebaConvolution::post_convolution_3d()
|
||||
debug_scalar(CFFT1,"POST Convo / PRE FFT");
|
||||
debug_file(CFFT1,"post.convo.pre.fft");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
|
||||
time1 = platform::walltime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT2,"POST Convo / POST FFT");
|
||||
@ -465,8 +492,18 @@ void *AmoebaConvolution::post_convolution_4d()
|
||||
debug_scalar(CFFT1,"POST Convo / PRE FFT");
|
||||
debug_file(CFFT1,"post.convo.pre.fft");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
|
||||
|
||||
time1 = platform::walltime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT2,"POST Convo / POST FFT");
|
||||
debug_file(CFFT2,"post.convo.post.fft");
|
||||
|
||||
@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers {
|
||||
int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out;
|
||||
int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft;
|
||||
bigint nfft_global; // nx * ny * nz
|
||||
double *grid_brick_start; // lower left corner of (c)grid_brick data
|
||||
FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data
|
||||
|
||||
AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int);
|
||||
~AmoebaConvolution();
|
||||
@ -47,7 +47,9 @@ class AmoebaConvolution : protected Pointers {
|
||||
FFT_SCALAR *pre_convolution();
|
||||
void *post_convolution();
|
||||
|
||||
private:
|
||||
double time_fft;
|
||||
|
||||
protected:
|
||||
int which; // caller name for convolution being performed
|
||||
int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick
|
||||
int nbrick_owned; // owned grid points in brick decomp
|
||||
@ -59,23 +61,23 @@ class AmoebaConvolution : protected Pointers {
|
||||
class Grid3d *gc;
|
||||
class Remap *remap;
|
||||
|
||||
double ***grid_brick; // 3d real brick grid with ghosts
|
||||
double ****cgrid_brick; // 4d complex brick grid with ghosts
|
||||
FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts
|
||||
FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts
|
||||
|
||||
FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector
|
||||
FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector
|
||||
|
||||
double *gc_buf1, *gc_buf2; // buffers for GridComm
|
||||
double *remap_buf; // buffer for Remap
|
||||
FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm
|
||||
FFT_SCALAR *remap_buf; // buffer for Remap
|
||||
|
||||
void allocate_grid();
|
||||
void deallocate_grid();
|
||||
void *zero_3d();
|
||||
void *zero_4d();
|
||||
FFT_SCALAR *pre_convolution_3d();
|
||||
FFT_SCALAR *pre_convolution_4d();
|
||||
virtual FFT_SCALAR *pre_convolution_4d();
|
||||
void *post_convolution_3d();
|
||||
void *post_convolution_4d();
|
||||
virtual void *post_convolution_4d();
|
||||
void procs2grid2d(int, int, int, int &, int &);
|
||||
|
||||
// DEBUG
|
||||
|
||||
@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace()
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by zero()
|
||||
|
||||
double ***gridpre = (double ***) d_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -294,7 +294,7 @@ void PairAmoeba::dispersion_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomposition
|
||||
|
||||
double *gridfft = d_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = d_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
|
||||
@ -24,6 +24,7 @@
|
||||
#include "math_special.h"
|
||||
#include "my_page.h"
|
||||
#include "neigh_list.h"
|
||||
#include "timer.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
@ -381,8 +382,6 @@ void PairAmoeba::induce()
|
||||
}
|
||||
}
|
||||
|
||||
// if (comm->me == 0) printf("CG iteration count = %d\n",iter);
|
||||
|
||||
// terminate the calculation if dipoles failed to converge
|
||||
// NOTE: could make this an error
|
||||
|
||||
@ -546,13 +545,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp)
|
||||
}
|
||||
}
|
||||
|
||||
// get the reciprocal space part of the mutual field
|
||||
|
||||
if (polar_kspace_flag) umutual1(field,fieldp);
|
||||
double time0, time1, time2;
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// get the real space portion of the mutual field
|
||||
|
||||
if (polar_rspace_flag) umutual2b(field,fieldp);
|
||||
time1 = platform::walltime();
|
||||
|
||||
// get the reciprocal space part of the mutual field
|
||||
|
||||
if (polar_kspace_flag) umutual1(field,fieldp);
|
||||
time2 = platform::walltime();
|
||||
|
||||
// add the self-energy portion of the mutual field
|
||||
|
||||
@ -563,6 +568,11 @@ void PairAmoeba::ufield0c(double **field, double **fieldp)
|
||||
fieldp[i][j] += term*uinp[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// accumulate timing information
|
||||
|
||||
time_mutual_rspace += time1 - time0;
|
||||
time_mutual_kspace += time2 - time1;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -785,7 +795,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
|
||||
|
||||
// get the reciprocal space part of the permanent field
|
||||
|
||||
double time0, time1, time2;
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
if (polar_kspace_flag) udirect1(field);
|
||||
time1 = platform::walltime();
|
||||
|
||||
for (i = 0; i < nlocal; i++) {
|
||||
for (j = 0; j < 3; j++) {
|
||||
@ -796,6 +811,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
|
||||
// get the real space portion of the permanent field
|
||||
|
||||
if (polar_rspace_flag) udirect2b(field,fieldp);
|
||||
time2 = platform::walltime();
|
||||
|
||||
// get the self-energy portion of the permanent field
|
||||
|
||||
@ -806,6 +822,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
|
||||
fieldp[i][j] += term*rpole[i][j+1];
|
||||
}
|
||||
}
|
||||
|
||||
// accumulate timing information
|
||||
|
||||
time_direct_kspace += time1 - time0;
|
||||
time_direct_rspace += time2 - time1;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -842,18 +863,26 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
|
||||
}
|
||||
}
|
||||
|
||||
double time0, time1;
|
||||
|
||||
// gridpre = my portion of 4d grid in brick decomp w/ ghost values
|
||||
|
||||
double ****gridpre = (double ****) ic_kspace->zero();
|
||||
FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero();
|
||||
|
||||
// map 2 values to grid
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
grid_uind(fuind,fuinp,gridpre);
|
||||
|
||||
time1 = platform::walltime();
|
||||
time_grid_uind += (time1 - time0);
|
||||
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomposition
|
||||
|
||||
double *gridfft = ic_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = ic_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -883,12 +912,18 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
|
||||
|
||||
double ****gridpost = (double ****) ic_kspace->post_convolution();
|
||||
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
|
||||
|
||||
time1 = platform::walltime();
|
||||
time_fphi_uind += (time1 - time0);
|
||||
|
||||
// store fractional reciprocal potentials for OPT method
|
||||
|
||||
if (poltyp == OPT) {
|
||||
@ -1055,7 +1090,7 @@ void PairAmoeba::udirect1(double **field)
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by setup()
|
||||
|
||||
double ***gridpre = (double ***) i_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero();
|
||||
|
||||
// map multipole moments to grid
|
||||
|
||||
@ -1064,7 +1099,7 @@ void PairAmoeba::udirect1(double **field)
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my 1d portion of complex 3d grid in FFT decomp
|
||||
|
||||
double *gridfft = i_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = i_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -1109,7 +1144,7 @@ void PairAmoeba::udirect1(double **field)
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpost = (double ***) i_kspace->post_convolution();
|
||||
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
|
||||
@ -68,25 +68,23 @@ void PairAmoeba::moduli()
|
||||
int maxfft = MAX(nfft1,nfft2);
|
||||
maxfft = MAX(maxfft,nfft3);
|
||||
|
||||
double *array = new double[bsorder];
|
||||
double *bsarray = new double[maxfft];
|
||||
if (maxfft > _nfft_max) {
|
||||
memory->destroy(_moduli_bsarray);
|
||||
_nfft_max = maxfft;
|
||||
memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray");
|
||||
}
|
||||
|
||||
// compute and load the moduli values
|
||||
|
||||
double x = 0.0;
|
||||
bspline(x,bsorder,array);
|
||||
bspline(x,bsorder,_moduli_array);
|
||||
|
||||
for (i = 0; i < maxfft; i++) bsarray[i] = 0.0;
|
||||
for (i = 0; i < bsorder; i++) bsarray[i+1] = array[i];
|
||||
for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0;
|
||||
for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i];
|
||||
|
||||
dftmod(bsmod1,bsarray,nfft1,bsorder);
|
||||
dftmod(bsmod2,bsarray,nfft2,bsorder);
|
||||
dftmod(bsmod3,bsarray,nfft3,bsorder);
|
||||
|
||||
// perform deallocation of local arrays
|
||||
|
||||
delete[] array;
|
||||
delete[] bsarray;
|
||||
dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder);
|
||||
dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder);
|
||||
dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -525,7 +523,7 @@ void PairAmoeba::frac_to_cart()
|
||||
grid_mpole maps fractional atomic multipoles to PME grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::grid_mpole(double **fmp, double ***grid)
|
||||
void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb;
|
||||
double v0,u0,t0;
|
||||
@ -598,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid)
|
||||
the particle mesh Ewald grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
|
||||
void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb;
|
||||
double v0,v1,v2,v3;
|
||||
@ -742,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
|
||||
grid_uind maps fractional induced dipoles to the PME grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
|
||||
void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb;
|
||||
double v0,u0,t0;
|
||||
@ -793,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
|
||||
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
|
||||
void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1,
|
||||
double **fdip_phi2, double **fdip_sum_phi)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb;
|
||||
@ -1042,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
|
||||
grid_disp maps dispersion coefficients to PME grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::grid_disp(double ***grid)
|
||||
void PairAmoeba::grid_disp(FFT_SCALAR ***grid)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb,itype,iclass;
|
||||
double v0,u0,t0;
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#include "math_const.h"
|
||||
#include "math_special.h"
|
||||
#include "neigh_list.h"
|
||||
#include "timer.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
@ -55,6 +56,8 @@ void PairAmoeba::multipole()
|
||||
double qixx,qixy,qixz,qiyy,qiyz,qizz;
|
||||
double cii,dii,qii;
|
||||
|
||||
double time0,time1,time2;
|
||||
|
||||
// set cutoffs, taper coeffs, and PME params
|
||||
|
||||
if (use_ewald) choose(MPOLE_LONG);
|
||||
@ -78,13 +81,18 @@ void PairAmoeba::multipole()
|
||||
|
||||
felec = electric / am_dielectric;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// compute the real space part of the Ewald summation
|
||||
|
||||
if (mpole_rspace_flag) multipole_real();
|
||||
time1 = platform::walltime();
|
||||
|
||||
// compute the reciprocal space part of the Ewald summation
|
||||
|
||||
if (mpole_kspace_flag) multipole_kspace();
|
||||
time2 = platform::walltime();
|
||||
|
||||
// compute the Ewald self-energy term over all the atoms
|
||||
|
||||
@ -109,6 +117,11 @@ void PairAmoeba::multipole()
|
||||
e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0));
|
||||
empole += e;
|
||||
}
|
||||
|
||||
// accumulate timing information
|
||||
|
||||
time_mpole_rspace += time1 - time0;
|
||||
time_mpole_kspace += time2 - time1;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -361,6 +374,9 @@ void PairAmoeba::multipole_real()
|
||||
bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2;
|
||||
}
|
||||
for (k = 0; k < 6; k++) bn[k] *= felec;
|
||||
//if (i == 0 && j < 10) {
|
||||
// printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]);
|
||||
//}
|
||||
|
||||
// find damped multipole intermediates and energy value
|
||||
|
||||
@ -404,6 +420,8 @@ void PairAmoeba::multipole_real()
|
||||
term2i*rr3i + term2k*rr3k + term2ik*rr3ik +
|
||||
term3i*rr5i + term3k*rr5k + term3ik*rr5ik;
|
||||
|
||||
|
||||
|
||||
// find damped multipole intermediates for force and torque
|
||||
|
||||
de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik +
|
||||
@ -444,6 +462,7 @@ void PairAmoeba::multipole_real()
|
||||
term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
|
||||
term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9);
|
||||
term6 = 4.0 * rr7;
|
||||
|
||||
}
|
||||
|
||||
empole += e;
|
||||
@ -482,6 +501,7 @@ void PairAmoeba::multipole_real()
|
||||
tq[i][2] += ttmi[2];
|
||||
|
||||
// increment force-based gradient and torque on second site
|
||||
// commenting out j parts for DEBUGGING
|
||||
|
||||
f[j][0] += frcx;
|
||||
f[j][1] += frcy;
|
||||
@ -638,7 +658,7 @@ void PairAmoeba::multipole_kspace()
|
||||
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpre = (double ***) m_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -647,7 +667,7 @@ void PairAmoeba::multipole_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
|
||||
|
||||
double *gridfft = m_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = m_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -718,7 +738,7 @@ void PairAmoeba::multipole_kspace()
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpost = (double ***) m_kspace->post_convolution();
|
||||
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#include "math_const.h"
|
||||
#include "math_special.h"
|
||||
#include "neigh_list.h"
|
||||
#include "timer.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
@ -55,6 +56,8 @@ void PairAmoeba::polar()
|
||||
double fix[3],fiy[3],fiz[3];
|
||||
double tep[3];
|
||||
|
||||
double time0,time1,time2;
|
||||
|
||||
// set cutoffs, taper coeffs, and PME params
|
||||
|
||||
if (use_ewald) choose(POLAR_LONG);
|
||||
@ -76,11 +79,16 @@ void PairAmoeba::polar()
|
||||
|
||||
// compute the real space part of the dipole interactions
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
if (polar_rspace_flag) polar_real();
|
||||
time1 = platform::walltime();
|
||||
|
||||
// compute the reciprocal space part of dipole interactions
|
||||
|
||||
if (polar_kspace_flag) polar_kspace();
|
||||
time2 = platform::walltime();
|
||||
|
||||
// compute the Ewald self-energy torque and virial terms
|
||||
|
||||
@ -133,6 +141,11 @@ void PairAmoeba::polar()
|
||||
virpolar[4] -= vxz;
|
||||
virpolar[5] -= vyz;
|
||||
}
|
||||
|
||||
// accumulate timing information
|
||||
|
||||
time_polar_rspace += time1 - time0;
|
||||
time_polar_kspace += time2 - time1;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -382,7 +395,7 @@ void PairAmoeba::polar_real()
|
||||
factor_uscale = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
//if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale);
|
||||
r = sqrt(r2);
|
||||
ck = rpole[j][0];
|
||||
dkx = rpole[j][1];
|
||||
@ -597,7 +610,6 @@ void PairAmoeba::polar_real()
|
||||
dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir;
|
||||
dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir;
|
||||
dufld[i][5] += zr*tiz5 + zr*zr*tuir;
|
||||
|
||||
dufld[j][0] -= xr*tkx5 + xr*xr*tukr;
|
||||
dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr;
|
||||
dufld[j][2] -= yr*tky5 + yr*yr*tukr;
|
||||
@ -855,6 +867,7 @@ void PairAmoeba::polar_real()
|
||||
frcx = -2.0 * depx;
|
||||
frcy = -2.0 * depy;
|
||||
frcz = -2.0 * depz;
|
||||
|
||||
}
|
||||
|
||||
// get the dtau/dr terms used for mutual polarization force
|
||||
@ -1327,7 +1340,7 @@ void PairAmoeba::polar_kspace()
|
||||
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpre = (double ***) p_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1336,7 +1349,7 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
|
||||
|
||||
double *gridfft = p_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -1386,7 +1399,7 @@ void PairAmoeba::polar_kspace()
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpost = (double ***) p_kspace->post_convolution();
|
||||
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
@ -1419,7 +1432,7 @@ void PairAmoeba::polar_kspace()
|
||||
|
||||
// gridpre2 = my portion of 4d grid in brick decomp w/ ghost values
|
||||
|
||||
double ****gridpre2 = (double ****) pc_kspace->zero();
|
||||
FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero();
|
||||
|
||||
// map 2 values to grid
|
||||
|
||||
@ -1428,7 +1441,7 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomposition
|
||||
|
||||
double *gridfft = pc_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = pc_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -1451,7 +1464,7 @@ void PairAmoeba::polar_kspace()
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
|
||||
|
||||
double ****gridpost = (double ****) pc_kspace->post_convolution();
|
||||
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
@ -1857,7 +1870,7 @@ void PairAmoeba::polar_kspace()
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by zero()
|
||||
|
||||
double ***gridpre = (double ***) p_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1887,7 +1900,7 @@ void PairAmoeba::polar_kspace()
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by zero()
|
||||
|
||||
gridpre = (double ***) p_kspace->zero();
|
||||
gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1896,7 +1909,7 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors
|
||||
|
||||
double *gridfft2 = p_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -1953,7 +1966,7 @@ void PairAmoeba::polar_kspace()
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by zero()
|
||||
|
||||
double ***gridpre = (double ***) p_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1962,12 +1975,12 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
|
||||
|
||||
double *gridfft = p_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
|
||||
|
||||
// gridfft1 = copy of first FFT
|
||||
|
||||
int nfft_owned = p_kspace->nfft_owned;
|
||||
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double));
|
||||
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR));
|
||||
|
||||
// assign ??? to the PME grid
|
||||
|
||||
@ -1982,7 +1995,7 @@ void PairAmoeba::polar_kspace()
|
||||
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
gridpre = (double ***) p_kspace->zero();
|
||||
gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1991,7 +2004,7 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
|
||||
|
||||
double *gridfft2 = p_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
|
||||
@ -194,8 +194,8 @@ void FixAmoebaBiTorsion::init()
|
||||
// error check that PairAmoeba or PairHiippo exist
|
||||
|
||||
pair = nullptr;
|
||||
pair = force->pair_match("amoeba",1,0);
|
||||
if (!pair) pair = force->pair_match("hippo",1,0);
|
||||
pair = force->pair_match("^amoeba",0,0);
|
||||
if (!pair) pair = force->pair_match("^hippo",0,0);
|
||||
|
||||
if (!pair)
|
||||
error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo");
|
||||
|
||||
@ -285,8 +285,9 @@ void ImproperAmoeba::init_style()
|
||||
// check if PairAmoeba disabled improper terms
|
||||
|
||||
Pair *pair = nullptr;
|
||||
pair = force->pair_match("amoeba",1,0);
|
||||
if (!pair) pair = force->pair_match("hippo",1,0);
|
||||
pair = force->pair_match("^amoeba",0,0);
|
||||
if (!pair) pair = force->pair_match("^hippo",0,0);
|
||||
|
||||
if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo");
|
||||
|
||||
int tmp;
|
||||
|
||||
@ -29,6 +29,7 @@
|
||||
#include "my_page.h"
|
||||
#include "neigh_list.h"
|
||||
#include "neighbor.h"
|
||||
#include "timer.h"
|
||||
#include "update.h"
|
||||
|
||||
#include <cmath>
|
||||
@ -47,6 +48,7 @@ enum{MUTUAL,OPT,TCG,DIRECT};
|
||||
enum{GEAR,ASPC,LSQR};
|
||||
|
||||
#define DELTASTACK 16
|
||||
#define DEBUG_AMOEBA 0
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
@ -85,6 +87,10 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp)
|
||||
cmp = fmp = nullptr;
|
||||
cphi = fphi = nullptr;
|
||||
|
||||
_moduli_array = nullptr;
|
||||
_moduli_bsarray = nullptr;
|
||||
_nfft_max = 0;
|
||||
|
||||
poli = nullptr;
|
||||
conj = conjp = nullptr;
|
||||
vec = vecp = nullptr;
|
||||
@ -227,6 +233,9 @@ PairAmoeba::~PairAmoeba()
|
||||
memory->destroy(fphidp);
|
||||
memory->destroy(cphidp);
|
||||
|
||||
memory->destroy(_moduli_array);
|
||||
memory->destroy(_moduli_bsarray);
|
||||
|
||||
memory->destroy(thetai1);
|
||||
memory->destroy(thetai2);
|
||||
memory->destroy(thetai3);
|
||||
@ -349,12 +358,22 @@ void PairAmoeba::compute(int eflag, int vflag)
|
||||
if (update->ntimestep <= update->beginstep+1) {
|
||||
time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0;
|
||||
time_induce = time_polar = time_qxfer = 0.0;
|
||||
|
||||
time_mpole_rspace = time_mpole_kspace = 0.0;
|
||||
time_direct_rspace = time_direct_kspace = 0.0;
|
||||
time_mutual_rspace = time_mutual_kspace = 0.0;
|
||||
time_polar_rspace = time_polar_kspace = 0.0;
|
||||
|
||||
time_grid_uind = time_fphi_uind = 0.0;
|
||||
if (ic_kspace) {
|
||||
ic_kspace->time_fft = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
double time0,time1,time2,time3,time4,time5,time6,time7,time8;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = MPI_Wtime();
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// if reneighboring step:
|
||||
// augment neighbor list to include 1-5 neighbor flags
|
||||
@ -410,8 +429,7 @@ void PairAmoeba::compute(int eflag, int vflag)
|
||||
comm->forward_comm(this);
|
||||
|
||||
if (amoeba) pbc_xred();
|
||||
|
||||
time1 = MPI_Wtime();
|
||||
time1 = platform::walltime();
|
||||
|
||||
// ----------------------------------------
|
||||
// compute components of force field
|
||||
@ -420,22 +438,22 @@ void PairAmoeba::compute(int eflag, int vflag)
|
||||
// buffered 14-7 Vdwl, pairwise
|
||||
|
||||
if (amoeba && hal_flag) hal();
|
||||
time2 = MPI_Wtime();
|
||||
time2 = platform::walltime();
|
||||
|
||||
// Pauli repulsion, pairwise
|
||||
|
||||
if (!amoeba && repulse_flag) repulsion();
|
||||
time3 = MPI_Wtime();
|
||||
time3 = platform::walltime();
|
||||
|
||||
// Ewald dispersion, pairwise and long range
|
||||
|
||||
if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion();
|
||||
time4 = MPI_Wtime();
|
||||
time4 = platform::walltime();
|
||||
|
||||
// multipole, pairwise and long range
|
||||
|
||||
if (mpole_rspace_flag || mpole_kspace_flag) multipole();
|
||||
time5 = MPI_Wtime();
|
||||
time5 = platform::walltime();
|
||||
|
||||
// induced dipoles, interative CG relaxation
|
||||
// communicate induce() output values needed by ghost atoms
|
||||
@ -445,17 +463,17 @@ void PairAmoeba::compute(int eflag, int vflag)
|
||||
cfstyle = INDUCE;
|
||||
comm->forward_comm(this);
|
||||
}
|
||||
time6 = MPI_Wtime();
|
||||
time6 = platform::walltime();
|
||||
|
||||
// dipoles, pairwise and long range
|
||||
|
||||
if (polar_rspace_flag || polar_kspace_flag) polar();
|
||||
time7 = MPI_Wtime();
|
||||
time7 = platform::walltime();
|
||||
|
||||
// charge transfer, pairwise
|
||||
|
||||
if (!amoeba && qxfer_flag) charge_transfer();
|
||||
time8 = MPI_Wtime();
|
||||
time8 = platform::walltime();
|
||||
|
||||
// store energy components for output by compute pair command
|
||||
|
||||
@ -518,6 +536,44 @@ void PairAmoeba::finish()
|
||||
MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_qxfer = ave/comm->nprocs;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
// real-space/kspace breakdown
|
||||
MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mpole_rspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mpole_kspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_direct_rspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_direct_kspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mutual_rspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mutual_kspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_polar_rspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_polar_kspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_grid_uind = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_fphi_uind = ave/comm->nprocs;
|
||||
|
||||
double time_mutual_fft = 0;
|
||||
if (ic_kspace) time_mutual_fft = ic_kspace->time_fft;
|
||||
MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mutual_fft = ave/comm->nprocs;
|
||||
#endif // DEBUG_AMOEBA
|
||||
|
||||
double time_total = (time_init + time_hal + time_repulse + time_disp +
|
||||
time_mpole + time_induce + time_polar + time_qxfer) / 100.0;
|
||||
|
||||
@ -534,8 +590,27 @@ void PairAmoeba::finish()
|
||||
utils::logmesg(lmp," Induce time: {:<12.6g} {:6.2f}%\n", time_induce, time_induce/time_total);
|
||||
utils::logmesg(lmp," Polar time: {:<12.6g} {:6.2f}%\n", time_polar, time_polar/time_total);
|
||||
if (!amoeba)
|
||||
utils::logmesg(lmp," Qxfer time: {:<12.6g} {:6.2f}%\n", time_qxfer, time_qxfer/time_total);
|
||||
utils::logmesg(lmp," Total time: {:<12.6g}\n",time_total * 100.0);
|
||||
utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total);
|
||||
utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0);
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace;
|
||||
double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace;
|
||||
|
||||
utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total);
|
||||
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total);
|
||||
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total);
|
||||
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total);
|
||||
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total);
|
||||
utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total);
|
||||
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total);
|
||||
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total);
|
||||
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total);
|
||||
utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total);
|
||||
utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total);
|
||||
utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total);
|
||||
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -2320,6 +2395,8 @@ void PairAmoeba::grow_local()
|
||||
firstneigh_pcpc = (double **)
|
||||
memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc");
|
||||
}
|
||||
|
||||
memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array");
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
|
||||
@ -82,6 +82,12 @@ class PairAmoeba : public Pair {
|
||||
double time_init, time_hal, time_repulse, time_disp;
|
||||
double time_mpole, time_induce, time_polar, time_qxfer;
|
||||
|
||||
double time_mpole_rspace, time_mpole_kspace;
|
||||
double time_direct_rspace, time_direct_kspace;
|
||||
double time_mutual_rspace, time_mutual_kspace;
|
||||
double time_polar_rspace, time_polar_kspace;
|
||||
double time_grid_uind, time_fphi_uind;
|
||||
|
||||
// energy/virial components
|
||||
|
||||
double ehal, erepulse, edisp, epolar, empole, eqxfer;
|
||||
@ -327,6 +333,10 @@ class PairAmoeba : public Pair {
|
||||
double **cmp,**fmp; // Cartesian and fractional multipoles
|
||||
double **cphi,**fphi;
|
||||
|
||||
double *_moduli_array; // buffers for moduli
|
||||
double *_moduli_bsarray;
|
||||
int _nfft_max;
|
||||
|
||||
// params for current KSpace solve and FFT being worked on
|
||||
|
||||
int nfft1, nfft2, nfft3; // size of FFT
|
||||
@ -335,8 +345,12 @@ class PairAmoeba : public Pair {
|
||||
double ctf[10][10]; // indices NOT flipped vs Fortran
|
||||
double ftc[10][10]; // indices NOT flipped vs Fortran
|
||||
|
||||
class AmoebaConvolution *m_kspace, *p_kspace, *pc_kspace, *d_kspace;
|
||||
class AmoebaConvolution *i_kspace, *ic_kspace;
|
||||
class AmoebaConvolution *m_kspace; // multipole KSpace
|
||||
class AmoebaConvolution *p_kspace; // polar KSpace
|
||||
class AmoebaConvolution *pc_kspace;
|
||||
class AmoebaConvolution *d_kspace; // dispersion KSpace
|
||||
class AmoebaConvolution *i_kspace; // induce KSpace
|
||||
class AmoebaConvolution *ic_kspace;
|
||||
|
||||
// FFT grid size factors
|
||||
|
||||
@ -347,33 +361,33 @@ class PairAmoeba : public Pair {
|
||||
|
||||
void hal();
|
||||
|
||||
void repulsion();
|
||||
void damprep(double, double, double, double, double, double, double, double, int, double, double,
|
||||
double *);
|
||||
virtual void repulsion();
|
||||
void damprep(double, double, double, double, double, double, double, double,
|
||||
int, double, double, double *);
|
||||
|
||||
void dispersion();
|
||||
void dispersion_real();
|
||||
virtual void dispersion_real();
|
||||
void dispersion_kspace();
|
||||
|
||||
void multipole();
|
||||
void multipole_real();
|
||||
virtual void multipole_real();
|
||||
void multipole_kspace();
|
||||
|
||||
void polar();
|
||||
void polar_energy();
|
||||
void polar_real();
|
||||
void polar_kspace();
|
||||
virtual void polar_real();
|
||||
virtual void polar_kspace();
|
||||
void damppole(double, int, double, double, double *, double *, double *);
|
||||
|
||||
void induce();
|
||||
virtual void induce();
|
||||
void ulspred();
|
||||
void ufield0c(double **, double **);
|
||||
virtual void ufield0c(double **, double **);
|
||||
void uscale0b(int, double **, double **, double **, double **);
|
||||
void dfield0c(double **, double **);
|
||||
void umutual1(double **, double **);
|
||||
void umutual2b(double **, double **);
|
||||
virtual void umutual1(double **, double **);
|
||||
virtual void umutual2b(double **, double **);
|
||||
void udirect1(double **);
|
||||
void udirect2b(double **, double **);
|
||||
virtual void udirect2b(double **, double **);
|
||||
void dampmut(double, double, double, double *);
|
||||
void dampdir(double, double, double, double *, double *);
|
||||
void cholesky(int, double *, double *);
|
||||
@ -393,11 +407,11 @@ class PairAmoeba : public Pair {
|
||||
void fphi_to_cphi(double **, double **);
|
||||
void frac_to_cart();
|
||||
|
||||
void grid_mpole(double **, double ***);
|
||||
void fphi_mpole(double ***, double **);
|
||||
void grid_uind(double **, double **, double ****);
|
||||
void fphi_uind(double ****, double **, double **, double **);
|
||||
void grid_disp(double ***);
|
||||
void grid_mpole(double **, FFT_SCALAR ***);
|
||||
void fphi_mpole(FFT_SCALAR ***, double **);
|
||||
void grid_uind(double **, double **, FFT_SCALAR ****);
|
||||
virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **);
|
||||
void grid_disp(FFT_SCALAR ***);
|
||||
|
||||
void kewald();
|
||||
void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &,
|
||||
|
||||
@ -45,6 +45,10 @@ depend () {
|
||||
# add one if statement per parent package
|
||||
# add one depend() call per child package that depends on that parent
|
||||
|
||||
if (test $1 = "AMOEBA") then
|
||||
depend GPU
|
||||
fi
|
||||
|
||||
if (test $1 = "ASPHERE") then
|
||||
depend GPU
|
||||
depend OPENMP
|
||||
|
||||
@ -28,6 +28,8 @@ action () {
|
||||
|
||||
# list of files with optional dependcies
|
||||
|
||||
action amoeba_convolution_gpu.cpp amoeba_convolution.cpp
|
||||
action amoeba_convolution_gpu.h amoeba_convolution.cpp
|
||||
action fix_gpu.cpp
|
||||
action fix_gpu.h
|
||||
action fix_nve_gpu.h
|
||||
@ -41,6 +43,8 @@ action fix_npt_gpu.cpp
|
||||
action fix_nve_asphere_gpu.h fix_nve_asphere.h
|
||||
action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp
|
||||
action gpu_extra.h
|
||||
action pair_amoeba_gpu.cpp pair_amoeba.cpp
|
||||
action pair_amoeba_gpu.h pair_amoeba.h
|
||||
action pair_beck_gpu.cpp pair_beck.cpp
|
||||
action pair_beck_gpu.h pair_beck.h
|
||||
action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp
|
||||
@ -89,6 +93,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp
|
||||
action pair_gauss_gpu.h pair_gauss.h
|
||||
action pair_gayberne_gpu.cpp pair_gayberne.cpp
|
||||
action pair_gayberne_gpu.h pair_gayberne.cpp
|
||||
action pair_hippo_gpu.cpp pair_hippo.cpp
|
||||
action pair_hippo_gpu.h pair_hippo.cpp
|
||||
action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp
|
||||
action pair_lj96_cut_gpu.h pair_lj96_cut.h
|
||||
action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp
|
||||
@ -113,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp
|
||||
action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h
|
||||
action pair_lj_cut_gpu.cpp
|
||||
action pair_lj_cut_gpu.h
|
||||
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
|
||||
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
|
||||
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
|
||||
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
|
||||
action pair_lj_smooth_gpu.cpp pair_lj_smooth.cpp
|
||||
action pair_lj_smooth_gpu.h pair_lj_smooth.cpp
|
||||
action pair_lj_expand_gpu.cpp
|
||||
@ -155,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp
|
||||
action pppm_gpu.h pppm.cpp
|
||||
action pair_ufm_gpu.cpp pair_ufm.cpp
|
||||
action pair_ufm_gpu.h pair_ufm.h
|
||||
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
|
||||
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
|
||||
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
|
||||
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
|
||||
|
||||
# edit 2 Makefile.package files to include/exclude package info
|
||||
|
||||
|
||||
181
src/GPU/amoeba_convolution_gpu.cpp
Normal file
181
src/GPU/amoeba_convolution_gpu.cpp
Normal file
@ -0,0 +1,181 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/ Sandia National Laboratories
|
||||
LAMMPS Development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "amoeba_convolution_gpu.h"
|
||||
#include "comm.h"
|
||||
#include "fft3d_wrap.h"
|
||||
#include "remap_wrap.h"
|
||||
#include "grid3d.h"
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
// DEBUG
|
||||
|
||||
#define DEBUG_AMOEBA 0
|
||||
#if DEBUG_AMOEBA
|
||||
char *labels[7] =
|
||||
{(char *) "MPOLE_GRID", (char *) "POLAR_GRID",
|
||||
(char *) "POLAR_GRIDC", (char *) "DISP_GRID",
|
||||
(char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"};
|
||||
|
||||
enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2};
|
||||
#endif
|
||||
// END DEBUG
|
||||
|
||||
#define SCALE 0
|
||||
|
||||
//#define USE_AMOEBA_FFT
|
||||
#ifdef USE_AMOEBA_FFT
|
||||
// External functions from GPU library
|
||||
int amoeba_setup_fft(const int size, const int numel, const int element_type);
|
||||
int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode);
|
||||
#endif
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
partition an FFT grid across processors
|
||||
both for a brick and FFT x pencil decomposition
|
||||
nx,nz,nz = global FFT grid size
|
||||
order = size of stencil in each dimension that maps atoms to grid
|
||||
adapted from PPPM::set_grid_local()
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair,
|
||||
int nx_caller, int ny_caller, int nz_caller,
|
||||
int order_caller, int which_caller) :
|
||||
AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller,
|
||||
which_caller)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
perform pre-convolution grid operations for 4d cgrid_brick array
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
|
||||
{
|
||||
int ix,iy,iz,n;
|
||||
|
||||
// reverse comm for 4d brick grid + ghosts
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d");
|
||||
#endif
|
||||
|
||||
gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
|
||||
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d");
|
||||
debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d");
|
||||
#endif
|
||||
// copy owned 4d brick grid values to FFT grid
|
||||
|
||||
n = 0;
|
||||
for (iz = nzlo_in; iz <= nzhi_in; iz++)
|
||||
for (iy = nylo_in; iy <= nyhi_in; iy++)
|
||||
for (ix = nxlo_in; ix <= nxhi_in; ix++) {
|
||||
cfft[n++] = cgrid_brick[iz][iy][ix][0];
|
||||
cfft[n++] = cgrid_brick[iz][iy][ix][1];
|
||||
}
|
||||
|
||||
// remap FFT grid from brick to x pencil partitioning
|
||||
// NOTE: could just setup FFT to start from brick decomp and skip remap
|
||||
|
||||
remap->perform(cfft,cfft,remap_buf);
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(FFT,"PRE Convo / POST Remap");
|
||||
debug_file(FFT,"pre.convo.post.remap");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// perform forward FFT
|
||||
|
||||
#ifdef USE_AMOEBA_FFT
|
||||
amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD);
|
||||
#else
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
#endif
|
||||
|
||||
time1 = platform::walltime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
|
||||
}
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"PRE Convo / POST FFT");
|
||||
debug_file(CFFT1,"pre.convo.post.fft");
|
||||
#endif
|
||||
return cfft;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
perform post-convolution grid operations for 4d cgrid_brick array
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void *AmoebaConvolutionGPU::post_convolution_4d()
|
||||
{
|
||||
int ix,iy,iz,n;
|
||||
|
||||
// perform backward FFT
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"POST Convo / PRE FFT");
|
||||
debug_file(CFFT1,"post.convo.pre.fft");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
|
||||
|
||||
time1 = platform::walltime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT2,"POST Convo / POST FFT");
|
||||
debug_file(CFFT2,"post.convo.post.fft");
|
||||
#endif
|
||||
// copy 1d complex values into 4d complex grid
|
||||
|
||||
n = 0;
|
||||
for (iz = nzlo_in; iz <= nzhi_in; iz++)
|
||||
for (iy = nylo_in; iy <= nyhi_in; iy++)
|
||||
for (ix = nxlo_in; ix <= nxhi_in; ix++) {
|
||||
cgrid_brick[iz][iy][ix][0] = cfft[n++];
|
||||
cgrid_brick[iz][iy][ix][1] = cfft[n++];
|
||||
}
|
||||
|
||||
// forward comm to populate ghost grid values
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d");
|
||||
debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d");
|
||||
#endif
|
||||
gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
|
||||
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
||||
|
||||
return (void *) cgrid_brick;
|
||||
}
|
||||
32
src/GPU/amoeba_convolution_gpu.h
Normal file
32
src/GPU/amoeba_convolution_gpu.h
Normal file
@ -0,0 +1,32 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/ Sandia National Laboratories
|
||||
LAMMPS Development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H
|
||||
#define LMP_AMOEBA_CONVOLUTION_GPU_H
|
||||
|
||||
#include "amoeba_convolution.h"
|
||||
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class AmoebaConvolutionGPU : public AmoebaConvolution {
|
||||
public:
|
||||
AmoebaConvolutionGPU(class LAMMPS *, class Pair *, int, int, int, int, int);
|
||||
|
||||
FFT_SCALAR *pre_convolution_4d() override;
|
||||
void *post_convolution_4d() override;
|
||||
|
||||
};
|
||||
|
||||
} // namespace LAMMPS_NS
|
||||
#endif
|
||||
@ -131,7 +131,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
|
||||
_gpu_mode = GPU_NEIGH;
|
||||
_particle_split = 1.0;
|
||||
int nthreads = 0;
|
||||
int newtonflag = 0;
|
||||
int newtonflag = force->newton_pair;
|
||||
int threads_per_atom = -1;
|
||||
double binsize = 0.0;
|
||||
char *opencl_args = nullptr;
|
||||
@ -360,6 +360,8 @@ double FixGPU::memory_usage()
|
||||
return bytes;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
double FixGPU::binsize(const double subx, const double suby,
|
||||
const double subz, const int nlocal,
|
||||
const double cut) {
|
||||
|
||||
2067
src/GPU/pair_amoeba_gpu.cpp
Normal file
2067
src/GPU/pair_amoeba_gpu.cpp
Normal file
File diff suppressed because it is too large
Load Diff
72
src/GPU/pair_amoeba_gpu.h
Normal file
72
src/GPU/pair_amoeba_gpu.h
Normal file
@ -0,0 +1,72 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
LAMMPS Development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
// clang-format off
|
||||
PairStyle(amoeba/gpu,PairAmoebaGPU);
|
||||
// clang-format on
|
||||
#else
|
||||
|
||||
#ifndef LMP_PAIR_AMOEBA_GPU_H
|
||||
#define LMP_PAIR_AMOEBA_GPU_H
|
||||
|
||||
#include "pair_amoeba.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class PairAmoebaGPU : public PairAmoeba {
|
||||
public:
|
||||
PairAmoebaGPU(LAMMPS *lmp);
|
||||
~PairAmoebaGPU() override;
|
||||
void init_style() override;
|
||||
double memory_usage() override;
|
||||
|
||||
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
|
||||
|
||||
void induce() override;
|
||||
|
||||
void multipole_real() override;
|
||||
void udirect2b(double **, double **) override;
|
||||
void umutual1(double **, double **) override;
|
||||
void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
|
||||
void umutual2b(double **, double **) override;
|
||||
void ufield0c(double **, double **) override;
|
||||
void polar_real() override;
|
||||
void polar_kspace() override;
|
||||
|
||||
private:
|
||||
int gpu_mode;
|
||||
double cpu_time;
|
||||
void *tq_pinned;
|
||||
void *fieldp_pinned;
|
||||
bool acc_float;
|
||||
|
||||
bool gpu_hal_ready;
|
||||
bool gpu_repulsion_ready;
|
||||
bool gpu_dispersion_real_ready;
|
||||
bool gpu_multipole_real_ready;
|
||||
bool gpu_udirect2b_ready;
|
||||
bool gpu_umutual1_ready;
|
||||
bool gpu_fphi_uind_ready;
|
||||
bool gpu_umutual2b_ready;
|
||||
bool gpu_polar_real_ready;
|
||||
|
||||
void udirect2b_cpu();
|
||||
|
||||
template<class numtyp>
|
||||
void compute_force_from_torque(const numtyp*, double**, double*);
|
||||
};
|
||||
|
||||
} // namespace LAMMPS_NS
|
||||
#endif
|
||||
#endif
|
||||
1494
src/GPU/pair_hippo_gpu.cpp
Normal file
1494
src/GPU/pair_hippo_gpu.cpp
Normal file
File diff suppressed because it is too large
Load Diff
73
src/GPU/pair_hippo_gpu.h
Normal file
73
src/GPU/pair_hippo_gpu.h
Normal file
@ -0,0 +1,73 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
LAMMPS Development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
// clang-format off
|
||||
PairStyle(hippo/gpu,PairHippoGPU);
|
||||
// clang-format on
|
||||
#else
|
||||
|
||||
#ifndef LMP_PAIR_HIPPO_GPU_H
|
||||
#define LMP_PAIR_HIPPO_GPU_H
|
||||
|
||||
#include "pair_amoeba.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class PairHippoGPU : public PairAmoeba {
|
||||
public:
|
||||
PairHippoGPU(LAMMPS *lmp);
|
||||
~PairHippoGPU() override;
|
||||
void init_style() override;
|
||||
double memory_usage() override;
|
||||
|
||||
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
|
||||
|
||||
void induce() override;
|
||||
|
||||
void repulsion() override;
|
||||
void dispersion_real() override;
|
||||
void multipole_real() override;
|
||||
void udirect2b(double **, double **) override;
|
||||
void umutual1(double **, double **) override;
|
||||
void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
|
||||
void umutual2b(double **, double **) override;
|
||||
void ufield0c(double **, double **) override;
|
||||
void polar_real() override;
|
||||
|
||||
private:
|
||||
int gpu_mode;
|
||||
double cpu_time;
|
||||
void *tq_pinned;
|
||||
void *fieldp_pinned;
|
||||
bool acc_float;
|
||||
|
||||
bool gpu_hal_ready;
|
||||
bool gpu_repulsion_ready;
|
||||
bool gpu_dispersion_real_ready;
|
||||
bool gpu_multipole_real_ready;
|
||||
bool gpu_udirect2b_ready;
|
||||
bool gpu_umutual1_ready;
|
||||
bool gpu_fphi_uind_ready;
|
||||
bool gpu_umutual2b_ready;
|
||||
bool gpu_polar_real_ready;
|
||||
|
||||
void udirect2b_cpu();
|
||||
|
||||
template<class numtyp>
|
||||
void compute_force_from_torque(const numtyp*, double**, double*);
|
||||
};
|
||||
|
||||
} // namespace LAMMPS_NS
|
||||
#endif
|
||||
#endif
|
||||
@ -204,6 +204,8 @@ action mliap_model_linear_kokkos.h mliap_model_linear.h
|
||||
action mliap_model_python_kokkos.cpp mliap_model_linear.cpp
|
||||
action mliap_model_python_kokkos.h mliap_model_linear.h
|
||||
action mliap_model_kokkos.h mliap_model.h
|
||||
action mliap_unified_kokkos.cpp mliap_unified.cpp
|
||||
action mliap_unified_kokkos.h mliap_unified.h
|
||||
action mliap_so3_kokkos.cpp mliap_so3.cpp
|
||||
action mliap_so3_kokkos.h mliap_so3.h
|
||||
action modify_kokkos.cpp
|
||||
@ -314,6 +316,8 @@ action pair_lj_spica_kokkos.cpp pair_lj_spica.cpp
|
||||
action pair_lj_spica_kokkos.h pair_lj_spica.h
|
||||
action pair_meam_kokkos.cpp pair_meam.cpp
|
||||
action pair_meam_kokkos.h pair_meam.h
|
||||
action pair_meam_ms_kokkos.cpp pair_meam_ms.cpp
|
||||
action pair_meam_ms_kokkos.h pair_meam_ms.h
|
||||
action pair_mliap_kokkos.cpp pair_mliap.cpp
|
||||
action pair_mliap_kokkos.h pair_mliap.h
|
||||
action pair_morse_kokkos.cpp
|
||||
@ -365,6 +369,7 @@ action verlet_kokkos.h
|
||||
|
||||
# Install cython pyx file only if non-KOKKOS version is present
|
||||
action mliap_model_python_couple_kokkos.pyx mliap_model_python_couple.pyx
|
||||
action mliap_unified_couple_kokkos.pyx mliap_unified_couple.pyx
|
||||
|
||||
# edit 2 Makefile.package files to include/exclude package info
|
||||
|
||||
@ -423,15 +428,19 @@ fi
|
||||
if (test $1 = 1) then
|
||||
if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then
|
||||
cythonize -3 ../mliap_model_python_couple_kokkos.pyx
|
||||
cythonize -3 ../mliap_unified_couple_kokkos.pyx
|
||||
fi
|
||||
|
||||
elif (test $1 = 0) then
|
||||
rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h
|
||||
rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h
|
||||
|
||||
elif (test $1 = 2) then
|
||||
if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then
|
||||
cythonize -3 ../mliap_model_python_couple_kokkos.pyx
|
||||
cythonize -3 ../mliap_unified_couple_kokkos.pyx
|
||||
else
|
||||
rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h
|
||||
rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -39,7 +39,7 @@ FixNVTKokkos<DeviceType>::FixNVTKokkos(LAMMPS *lmp, int narg, char **arg) :
|
||||
// id = fix-ID + temp
|
||||
|
||||
this->id_temp = utils::strdup(std::string(this->id)+"_temp");
|
||||
this->modify->add_compute(fmt::format("{} all temp/kk",this->id_temp));
|
||||
this->modify->add_compute(fmt::format("{} {} temp/kk",this->id_temp,this->group->names[this->igroup]));
|
||||
this->tcomputeflag = 1;
|
||||
}
|
||||
|
||||
|
||||
@ -67,7 +67,7 @@ FixNVTSllodKokkos<DeviceType>::FixNVTSllodKokkos(LAMMPS *lmp, int narg, char **a
|
||||
}
|
||||
|
||||
this->id_temp = utils::strdup(std::string(this->id)+"_temp");
|
||||
this->modify->add_compute(fmt::format("{} all temp/deform/kk",this->id_temp));
|
||||
this->modify->add_compute(fmt::format("{} {} temp/deform/kk",this->id_temp,this->group->names[this->igroup]));
|
||||
this->tcomputeflag = 1;
|
||||
this->nondeformbias = 0;
|
||||
}
|
||||
|
||||
@ -77,9 +77,8 @@ void FixSetForceKokkos<DeviceType>::init()
|
||||
template<class DeviceType>
|
||||
void FixSetForceKokkos<DeviceType>::post_force(int /*vflag*/)
|
||||
{
|
||||
atomKK->sync(execution_space, X_MASK | F_MASK | MASK_MASK);
|
||||
atomKK->sync(execution_space, F_MASK | MASK_MASK);
|
||||
|
||||
x = atomKK->k_x.view<DeviceType>();
|
||||
f = atomKK->k_f.view<DeviceType>();
|
||||
mask = atomKK->k_mask.view<DeviceType>();
|
||||
|
||||
@ -88,6 +87,8 @@ void FixSetForceKokkos<DeviceType>::post_force(int /*vflag*/)
|
||||
// update region if necessary
|
||||
|
||||
if (region) {
|
||||
if (!utils::strmatch(region->style, "^block"))
|
||||
error->all(FLERR,"Cannot (yet) use {}-style region with fix setforce/kk",region->style);
|
||||
region->prematch();
|
||||
DAT::tdual_int_1d k_match = DAT::tdual_int_1d("setforce:k_match",nlocal);
|
||||
KokkosBase* regionKKBase = dynamic_cast<KokkosBase*>(region);
|
||||
|
||||
@ -61,17 +61,44 @@ void MEAMKokkos<DeviceType>::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT
|
||||
if (elti >= 0) {
|
||||
scaleii = d_scale(type[i],type[i]);
|
||||
d_rho1[i] = 0.0;
|
||||
if (msmeamflag) {
|
||||
d_rho2[i] = -1.0 / 3.0 * (d_arho2b[i] * d_arho2b[i]
|
||||
- d_arho2mb[i] * d_arho2mb[i]);
|
||||
} else{
|
||||
d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i];
|
||||
}
|
||||
d_rho3[i] = 0.0;
|
||||
for (int m = 0; m < 3; m++) {
|
||||
if (msmeamflag) {
|
||||
d_rho1[i] = d_rho1[i] + d_arho1(i, m) * d_arho1(i, m)
|
||||
- d_arho1m(i, m) * d_arho1m(i, m);
|
||||
d_rho3[i] = d_rho3[i] - 3.0 / 5.0 * (d_arho3b(i, m) * d_arho3b(i, m)
|
||||
- d_arho3mb(i, m) * d_arho3mb(i, m));
|
||||
} else{
|
||||
d_rho1[i] += d_arho1(i,m) * d_arho1(i,m);
|
||||
d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m);
|
||||
}
|
||||
for (int m = 0; m < 6; m++)
|
||||
}
|
||||
for (int m = 0; m < 6; m++){
|
||||
if (msmeamflag) {
|
||||
d_rho2[i] = d_rho2[i] + v2D[m] * (d_arho2(i, m) * d_arho2(i, m)
|
||||
- d_arho2m(i, m) * d_arho2m(i, m));
|
||||
} else{
|
||||
d_rho2[i] += v2D[m] * d_arho2(i,m) * d_arho2(i,m);
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 10; m++)
|
||||
if (msmeamflag) {
|
||||
d_rho3[i] = d_rho3[i] + v3D[m] * (d_arho3(i, m) * d_arho3(i, m)
|
||||
- d_arho3m(i, m) * d_arho3m(i, m));
|
||||
} else{
|
||||
d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m);
|
||||
}
|
||||
|
||||
if (msmeamflag) {
|
||||
// with msmeam all t weights are already accounted for in rho
|
||||
d_gamma[i] = d_rho1[i] + d_rho2[i] + d_rho3[i];
|
||||
} else{
|
||||
if (d_rho0[i] > 0.0) {
|
||||
if (ialloy == 1) {
|
||||
d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0));
|
||||
@ -87,8 +114,8 @@ void MEAMKokkos<DeviceType>::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT
|
||||
d_t_ave(i,2) /= d_rho0[i];
|
||||
}
|
||||
}
|
||||
|
||||
d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i];
|
||||
}
|
||||
|
||||
if (d_rho0[i] > 0.0)
|
||||
d_gamma[i] /= (d_rho0[i] * d_rho0[i]);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user