diff --git a/cmake/Modules/LAMMPSUtils.cmake b/cmake/Modules/LAMMPSUtils.cmake index d42f91f10e..9b42dafc44 100644 --- a/cmake/Modules/LAMMPSUtils.cmake +++ b/cmake/Modules/LAMMPSUtils.cmake @@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir) endfunction() macro(pkg_depends PKG1 PKG2) - if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2})) - message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package") + if(DEFINED BUILD_${PKG2}) + if(PKG_${PKG1} AND NOT BUILD_${PKG2}) + message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON") + endif() + elseif(DEFINED PKG_${PKG2}) + if(PKG_${PKG1} AND NOT PKG_${PKG2}) + message(WARNING "The ${PKG1} package depends on the ${PKG2} package. Enabling it.") + set(PKG_${PKG2} ON CACHE BOOL "" FORCE) + endif() endif() endmacro() diff --git a/cmake/Modules/Packages/COMPRESS.cmake b/cmake/Modules/Packages/COMPRESS.cmake index bdcf1aa3f8..4e1ab846a7 100644 --- a/cmake/Modules/Packages/COMPRESS.cmake +++ b/cmake/Modules/Packages/COMPRESS.cmake @@ -1,4 +1,9 @@ -find_package(ZLIB REQUIRED) +find_package(ZLIB) +if(NOT ZLIB_FOUND) + message(WARNING "No Zlib development support found. 
Disabling COMPRESS package...") + set(PKG_COMPRESS OFF CACHE BOOL "" FORCE) + return() +endif() target_link_libraries(lammps PRIVATE ZLIB::ZLIB) find_package(PkgConfig QUIET) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index dd66276ae4..24d9538206 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -26,6 +26,19 @@ elseif(GPU_PREC STREQUAL "SINGLE") set(GPU_PREC_SETTING "SINGLE_SINGLE") endif() +option(GPU_DEBUG "Enable debugging code of the GPU package" OFF) +mark_as_advanced(GPU_DEBUG) + +if(PKG_AMOEBA AND FFT_SINGLE) + message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT") +endif() + +if (PKG_AMOEBA) + list(APPEND GPU_SOURCES + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp) +endif() + file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu) @@ -151,7 +164,12 @@ if(GPU_API STREQUAL "CUDA") add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS}) target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS}) - target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS}) + target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() if(CUDPP_OPT) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini) target_compile_definitions(gpu PRIVATE -DUSE_CUDPP) @@ -192,6 +210,7 @@ elseif(GPU_API STREQUAL "OPENCL") ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu 
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu + ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu ) foreach(GPU_KERNEL ${GPU_LIB_CU}) @@ -208,6 +227,7 @@ elseif(GPU_API STREQUAL "OPENCL") GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu) GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu) GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu) + GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu) list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h @@ -217,14 +237,18 @@ elseif(GPU_API STREQUAL "OPENCL") ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h + ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ) add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu PRIVATE OpenCL::OpenCL) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) - + target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) + endif() target_link_libraries(lammps PRIVATE gpu) add_executable(ocl_get_devices 
${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp) @@ -374,8 +398,12 @@ elseif(GPU_API STREQUAL "HIP") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_HIP) + target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() target_link_libraries(gpu PRIVATE hip::host) if(HIP_USE_DEVICE_SORT) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index f2cfa078c2..de64df7268 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -144,6 +144,7 @@ if(PKG_ML-IAP) ${KOKKOS_PKG_SOURCES_DIR}/mliap_descriptor_so3_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/mliap_model_linear_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/mliap_model_python_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/mliap_unified_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/mliap_so3_kokkos.cpp) # Add KOKKOS version of ML-IAP Python coupling if non-KOKKOS version is included diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index be2ba0fc60..659d185e18 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -126,10 +126,11 @@ CMake build -D GPU_API=value # value = opencl (default) or cuda or hip -D GPU_PREC=value # precision setting # value = double or mixed (default) or single - -D HIP_PATH # path to HIP installation. 
Must be set if GPU_API=HIP -D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda - # value = sm_XX, see below - # default is sm_50 + # value = sm_XX (see below, default is sm_50) + -D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers + # value = yes or no (default) + -D HIP_PATH=value # value = path to HIP installation. Must be set if GPU_API=HIP -D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip # value depends on selected HIP_PLATFORM # default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index f5924f12c7..7f7b2d4b7d 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -39,7 +39,7 @@ OPT. * :doc:`agni (o) ` * :doc:`airebo (io) ` * :doc:`airebo/morse (io) ` - * :doc:`amoeba ` + * :doc:`amoeba (g) ` * :doc:`atm ` * :doc:`awpmd/cut ` * :doc:`beck (go) ` @@ -126,7 +126,7 @@ OPT. * :doc:`hbond/dreiding/lj (o) ` * :doc:`hbond/dreiding/morse (o) ` * :doc:`hdnnp ` - * :doc:`hippo ` + * :doc:`hippo (g) ` * :doc:`ilp/graphene/hbn (t) ` * :doc:`ilp/tmd (t) ` * :doc:`kolmogorov/crespi/full ` @@ -200,6 +200,7 @@ OPT. * :doc:`mdpd ` * :doc:`mdpd/rhosum ` * :doc:`meam (k) ` + * :doc:`meam/ms (k) ` * :doc:`meam/spline (o) ` * :doc:`meam/sw/spline ` * :doc:`mesocnt ` diff --git a/doc/src/fix_pimd.rst b/doc/src/fix_pimd.rst index 838b9812ad..e5d42eb15f 100644 --- a/doc/src/fix_pimd.rst +++ b/doc/src/fix_pimd.rst @@ -149,6 +149,34 @@ related tasks for each of the partitions, e.g. restart 1000 system_${ibead}.restart1 system_${ibead}.restart2 read_restart system_${ibead}.restart2 +Restart, fix_modify, output, run start/stop, minimize info +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +This fix writes the state of the Nose/Hoover thermostat over all +quasi-beads to :doc:`binary restart files `. 
See the +:doc:`read_restart ` command for info on how to re-specify +a fix in an input script that reads a restart file, so that the +operation of the fix continues in an uninterrupted fashion. + +None of the :doc:`fix_modify ` options +are relevant to this fix. + +This fix computes a global 3-vector, which can be accessed by various +:doc:`output commands `. The three quantities in the +global vector are + +#. the total spring energy of the quasi-beads, +#. the current temperature of the classical system of ring polymers, +#. the current value of the scalar virial estimator for the kinetic + energy of the quantum system :ref:`(Herman) `. + +The vector values calculated by this fix are "extensive", except for the +temperature, which is "intensive". + +No parameter of this fix can be used with the *start/stop* keywords of +the :doc:`run ` command. This fix is not invoked during +:doc:`energy minimization `. + Restrictions """""""""""" @@ -204,3 +232,8 @@ Path Integrals, McGraw-Hill, New York (1965). **(Calhoun)** A. Calhoun, M. Pavese, G. Voth, Chem Phys Letters, 262, 415 (1996). + +.. _Herman: + +**(Herman)** M. F. Herman, E. J. Bruskin, B. J. Berne, J Chem Phys, 76, 5150 (1982). 
+ diff --git a/doc/src/fix_reaxff_species.rst b/doc/src/fix_reaxff_species.rst index c78c05a35e..383b8212f9 100644 --- a/doc/src/fix_reaxff_species.rst +++ b/doc/src/fix_reaxff_species.rst @@ -39,6 +39,9 @@ Syntax *masslimit* value = massmin massmax massmin = minimum molecular weight of species to delete massmax = maximum molecular weight of species to delete + *delete_rate_limit* value = Nlimit Nsteps + Nlimit = maximum number of deletions allowed to occur within interval + Nsteps = the interval (number of timesteps) over which to count deletions Examples """""""" @@ -142,7 +145,13 @@ When using the *masslimit* keyword, each line of the *filedel* file contains the timestep on which deletions occurs, followed by how many of each species are deleted (with quantities preceding chemical formulae). The *specieslist* and *masslimit* keywords cannot both be -used in the same *reaxff/species* fix. +used in the same *reaxff/species* fix. The *delete_rate_limit* +keyword can enforce an upper limit on the overall rate of molecule +deletion. The number of deletion occurrences is limited to Nlimit +within an interval of Nsteps timesteps. When using the +*delete_rate_limit* keyword, no deletions are permitted to occur +within the first Nsteps timesteps of the first run (after reading a +either a data or restart file). ---------- diff --git a/doc/src/fix_rigid.rst b/doc/src/fix_rigid.rst index 9a958e50d1..3a2477f90a 100644 --- a/doc/src/fix_rigid.rst +++ b/doc/src/fix_rigid.rst @@ -732,8 +732,8 @@ choices: * Use one of the 4 NPT or NPH styles for the rigid bodies. Use the *dilate* all option so that it will dilate the positions of the - *non-rigid particles as well. Use :doc:`fix nvt ` (or any - *other thermostat) for the non-rigid particles. + non-rigid particles as well. Use :doc:`fix nvt ` (or any + other thermostat) for the non-rigid particles. * Use :doc:`fix npt ` for the group of non-rigid particles. 
Use the *dilate* all option so that it will dilate the center-of-mass positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index f5c0ea14df..6ef92a6938 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -1,11 +1,18 @@ .. index:: pair_style amoeba +.. index:: pair_style amoeba/gpu .. index:: pair_style hippo +.. index:: pair_style hippo/gpu pair_style amoeba command ========================= +Accelerator Variants: *amoeba/gpu* + pair_style hippo command ======================== + +Accelerator Variants: *hippo/gpu* + Syntax """""" @@ -127,6 +134,10 @@ version discussed in :ref:`(Ponder) `, :ref:`(Ren) implementation of HIPPO in LAMMPS matches the version discussed in :ref:`(Rackers) `. +.. versionadded:: TBD + +Accelerator support via the GPU package is available. + ---------- Only a single pair_coeff command is used with either the *amoeba* and @@ -187,6 +198,19 @@ These pair styles can only be used via the *pair* keyword of the ---------- +.. include:: accel_styles.rst + +.. note:: + + Using the GPU accelerated pair styles 'amoeba/gpu' or 'hippo/gpu' + when compiling the GPU package for OpenCL has a few known issues + when running on integrated GPUs and the calculation may crash. + + The GPU accelerated pair styles are also not (yet) compatible + with single precision FFTs. + +---------- + Restrictions """""""""""" diff --git a/doc/src/pair_meam.rst b/doc/src/pair_meam.rst index 6a3d52c4d5..57c40aa6ee 100644 --- a/doc/src/pair_meam.rst +++ b/doc/src/pair_meam.rst @@ -1,17 +1,26 @@ .. index:: pair_style meam .. index:: pair_style meam/kk +.. index:: pair_style meam/ms +.. index:: pair_style meam/ms/kk pair_style meam command ========================= Accelerator Variants: *meam/kk* +pair_style meam/ms command +========================== + +Accelerator Variants: *meam/ms/kk* + Syntax """""" .. 
code-block:: LAMMPS - pair_style meam + pair_style style + +* style = *meam* or *meam/ms* Examples """""""" @@ -22,6 +31,9 @@ Examples pair_coeff * * ../potentials/library.meam Si ../potentials/si.meam Si pair_coeff * * ../potentials/library.meam Ni Al NULL Ni Al Ni Ni + pair_style meam/ms + pair_coeff * * ../potentials/library.msmeam H Ga ../potentials/HGa.meam H Ga + Description """"""""""" @@ -31,16 +43,23 @@ Description as of November 2010; see description below of the mixture_ref_t parameter -Pair style *meam* computes non-bonded interactions for a variety of materials -using the modified embedded-atom method (MEAM) -:ref:`(Baskes) `. Conceptually, it is an extension to the original -:doc:`EAM method ` which adds angular forces. It is -thus suitable for modeling metals and alloys with fcc, bcc, hcp and -diamond cubic structures, as well as materials with covalent interactions -like silicon and carbon. This *meam* pair style is a translation of the -original Fortran version to C++. It is functionally equivalent but more -efficient and has additional features. The Fortran version of the *meam* -pair style has been removed from LAMMPS after the 12 December 2018 release. +Pair style *meam* computes non-bonded interactions for a variety of +materials using the modified embedded-atom method (MEAM) :ref:`(Baskes) +`. Conceptually, it is an extension to the original :doc:`EAM +method ` which adds angular forces. It is thus suitable for +modeling metals and alloys with fcc, bcc, hcp and diamond cubic +structures, as well as materials with covalent interactions like silicon +and carbon. + +The *meam* pair style is a translation of the original Fortran version +to C++. It is functionally equivalent but more efficient and has +additional features. The Fortran version of the *meam* pair style has +been removed from LAMMPS after the 12 December 2018 release. 
+ +Pair style *meam/ms* uses the multi-state MEAM (MS-MEAM) method +according to :ref:`(Baskes2) `, which is an extension to MEAM. +This pair style is mostly equivalent to *meam* and differs only +where noted in the documentation below. In the MEAM formulation, the total energy E of a system of atoms is given by: @@ -351,6 +370,16 @@ Most published MEAM parameter sets use the default values *attrac* = *repulse* = Setting *repuls* = *attrac* = *delta* corresponds to the form used in several recent published MEAM parameter sets, such as :ref:`(Valone) ` +Then using *meam/ms* pair style the multi-state MEAM (MS-MEAM) method is +activated. This requires 6 extra parameters in the MEAM library file, +resulting in 25 parameters ordered that are ordered like this: + +elt, lat, z, ielement, atwt, alpha, b0, b1, b2, b3, b1m, b2m, b3m, alat, esub, asub, +t0, t1, t2, t3, t1m, t2m, t3m, rozero, ibar + +The 6 extra MS-MEAM parameters are *b1m, b2m, b3m, t1m, t2m, t3m*. +In the LAMMPS ``potentials`` folder, compatible files have an ".msmeam" extension. + ---------- .. include:: accel_styles.rst @@ -393,16 +422,15 @@ This pair style can only be used via the *pair* keyword of the Restrictions """""""""""" -The *meam* style is provided in the MEAM package. It is -only enabled if LAMMPS was built with that package. +The *meam* and *meam/ms* pair styles are provided in the MEAM +package. They are only enabled if LAMMPS was built with that package. See the :doc:`Build package ` page for more info. -The maximum number of elements, that can be read from the MEAM -library file, is determined at compile time. The default is 5. -If you need support for more elements, you have to change the -define for the constant 'maxelt' at the beginning of the file -src/MEAM/meam.h and update/recompile LAMMPS. There is no -limit on the number of atoms types. +The maximum number of elements, that can be read from the MEAM library +file, is determined at compile time. The default is 5. 
If you need +support for more elements, you have to change the the constant 'maxelt' +at the beginning of the file ``src/MEAM/meam.h`` and update/recompile +LAMMPS. There is no limit on the number of atoms types. Related commands """""""""""""""" @@ -421,6 +449,10 @@ none **(Baskes)** Baskes, Phys Rev B, 46, 2727-2742 (1992). +.. _Baskes2: + +**(Baskes2)** Baskes, Phys Rev B, 75, 094113 (2007). + .. _Gullet: **(Gullet)** Gullet, Wagner, Slepoy, SANDIA Report 2003-8782 (2003). DOI:10.2172/918395 diff --git a/doc/src/pair_style.rst b/doc/src/pair_style.rst index facfadeb9b..b3f7276480 100644 --- a/doc/src/pair_style.rst +++ b/doc/src/pair_style.rst @@ -277,7 +277,8 @@ accelerated styles exist. * :doc:`lubricateU/poly ` - hydrodynamic lubrication forces for Fast Lubrication with polydispersity * :doc:`mdpd ` - mDPD particle interactions * :doc:`mdpd/rhosum ` - mDPD particle interactions for mass density -* :doc:`meam ` - modified embedded atom method (MEAM) in C +* :doc:`meam ` - modified embedded atom method (MEAM) +* :doc:`meam/ms ` - multi-state modified embedded atom method (MS-MEAM) * :doc:`meam/spline ` - splined version of MEAM * :doc:`meam/sw/spline ` - splined version of MEAM with a Stillinger-Weber term * :doc:`mesocnt ` - mesoscopic vdW potential for (carbon) nanotubes diff --git a/examples/meam/msmeam/HGa.meam b/examples/meam/msmeam/HGa.meam new file mode 100644 index 0000000000..9f01501c16 --- /dev/null +++ b/examples/meam/msmeam/HGa.meam @@ -0,0 +1,30 @@ +bkgd_dyn = 1 +emb_lin_neg = 1 +augt1=0 +ialloy=1 +rc = 5.9 +#H +attrac(1,1)=0.460 +repuls(1,1)=0.460 +Cmin(1,1,1)=1.3 # PuMS +Cmax(1,1,1)= 2.80 +nn2(1,1)=1 +#Ga +rho0(2) = 0.6 +attrac(2,2)=0.097 +repuls(2,2)=0.097 +nn2(2,2)=1 +#HGa +attrac(1,2)=0.300 +repuls(1,2)=0.300 +lattce(1,2)=l12 +re(1,2)=3.19 +delta(1,2)=-0.48 +alpha(1,2)=6.6 +Cmin(1,1,2)=2.0 +Cmin(2,1,2)= 2.0 +Cmin(1,2,1)=2.0 +Cmin(2,2,1) = 1.4 +Cmin(1,2,2) = 1.4 +Cmin(1,1,2) = 1.4 +nn2(1,2)=1 diff --git a/examples/meam/msmeam/README.md 
b/examples/meam/msmeam/README.md new file mode 100644 index 0000000000..dbf569d4b3 --- /dev/null +++ b/examples/meam/msmeam/README.md @@ -0,0 +1,9 @@ +To run Baske's test, do + + lmp -in in.msmeam + +Then + + diff dump.msmeam dump.msmeam.bu + + diff --git a/examples/meam/msmeam/data.msmeam.bu b/examples/meam/msmeam/data.msmeam.bu new file mode 100644 index 0000000000..576a3c50de --- /dev/null +++ b/examples/meam/msmeam/data.msmeam.bu @@ -0,0 +1,25 @@ +LAMMPS data file via write_data, version 16 Feb 2016, timestep = 1 + +3 atoms +2 atom types + +-4.0000000000000000e+00 4.0000000000000000e+00 xlo xhi +-4.0000000000000000e+00 4.0000000000000000e+00 ylo yhi +-4.0000000000000000e+00 4.0000000000000000e+00 zlo zhi + +Masses + +1 1.0079 +2 69.723 + +Atoms # atomic + +1 1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0 +2 2 2.2000000000000002e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0 +3 2 2.9999999999999999e-01 2.2999999999999998e+00 0.0000000000000000e+00 0 0 0 + +Velocities + +1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +2 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +3 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 diff --git a/examples/meam/msmeam/dump.msmeam.bu b/examples/meam/msmeam/dump.msmeam.bu new file mode 100644 index 0000000000..039f630073 --- /dev/null +++ b/examples/meam/msmeam/dump.msmeam.bu @@ -0,0 +1,24 @@ +ITEM: TIMESTEP +0 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS pp pp pp +-4 4 +-4 4 +-4 4 +ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0 +2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0 +3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0 +ITEM: TIMESTEP +1 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS pp pp pp +-4 4 +-4 4 +-4 4 +ITEM: 
ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0 +2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0 +3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0 diff --git a/examples/meam/msmeam/in.msmeam b/examples/meam/msmeam/in.msmeam new file mode 100644 index 0000000000..82ffb89a13 --- /dev/null +++ b/examples/meam/msmeam/in.msmeam @@ -0,0 +1,31 @@ +echo both +log log.msmeam +# Test of MEAM potential for HGa + +# ------------------------ INITIALIZATION ---------------------------- +units metal +dimension 3 +boundary p p p +atom_style atomic +variable latparam equal 4.646 +variable ncell equal 3 + +# ----------------------- ATOM DEFINITION ---------------------------- +region box block -4 4 -4 4 -4 4 +create_box 2 box + +# + +include potential.mod +create_atoms 1 single 0 0 0 units box +create_atoms 2 single 2.2 0 0 units box +create_atoms 2 single 0.3 2.3 0 units box +# ---------- Define Settings --------------------- +variable teng equal "c_eatoms" +compute pot_energy all pe/atom +compute stress all stress/atom NULL +dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +run 1 +write_data data.msmeam + +print "All done!" 
diff --git a/examples/meam/msmeam/library.msmeam b/examples/meam/msmeam/library.msmeam new file mode 100644 index 0000000000..9937eaee08 --- /dev/null +++ b/examples/meam/msmeam/library.msmeam @@ -0,0 +1,14 @@ +# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010 +# ms-meam data format May 2010 +# elt lat z ielement atwt +# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub +# - t0 t1 t2 t3 t1m t2m t3m rozero ibar +# NOTE: leading character cannot be a space + +'H' 'dim' 1.0 1 1.0079 +2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50 +1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0 + +'Ga4' 'fcc' 12.0 31 69.723 +4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97 +1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0 diff --git a/examples/meam/msmeam/log.msmeam.bu b/examples/meam/msmeam/log.msmeam.bu new file mode 100644 index 0000000000..8eac453c1e --- /dev/null +++ b/examples/meam/msmeam/log.msmeam.bu @@ -0,0 +1,107 @@ +# Test of MEAM potential for HGa + +# ------------------------ INITIALIZATION ---------------------------- +units metal +dimension 3 +boundary p p p +atom_style atomic +variable latparam equal 4.646 +variable ncell equal 3 + +# ----------------------- ATOM DEFINITION ---------------------------- +region box block -4 4 -4 4 -4 4 +create_box 2 box +Created orthogonal box = (-4 -4 -4) to (4 4 4) + 1 by 1 by 1 MPI processor grid + +# + +include potential.mod +# NOTE: This script can be modified for different pair styles +# See in.elastic for more info. 
+ +variable Pu string H +print "potential chosen ${Pu}" +potential chosen H +# Choose potential +pair_style MSmeam +print "we just executed" +we just executed + +pair_coeff * * library.MSmeam ${Pu} Ga4 HGaMS.meam ${Pu} Ga4 +pair_coeff * * library.MSmeam H Ga4 HGaMS.meam ${Pu} Ga4 +pair_coeff * * library.MSmeam H Ga4 HGaMS.meam H Ga4 +Reading potential file library.MSmeam with DATE: 2018-09-22 +# Setup neighbor style +neighbor 1.0 nsq +neigh_modify once no every 1 delay 0 check yes + +# Setup minimization style +variable dmax equal 1.0e-2 +min_style cg +min_modify dmax ${dmax} line quadratic +min_modify dmax 0.01 line quadratic +compute eng all pe/atom +compute eatoms all reduce sum c_eng + +# Setup output +thermo 100 +thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms +thermo_modify norm yes +create_atoms 1 single 0 0 0 units box +Created 1 atoms +create_atoms 2 single 2.2 0 0 units box +Created 1 atoms +create_atoms 2 single 0.3 2.3 0 units box +Created 1 atoms +# ---------- Define Settings --------------------- +variable teng equal "c_eatoms" +compute pot_energy all pe/atom +compute stress all stress/atom NULL +dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +run 1 +WARNING: No fixes defined, atoms won't move (../verlet.cpp:55) +Neighbor list info ... 
+ 2 neighbor list requests + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 6.9 + ghost atom cutoff = 6.9 +Memory usage per processor = 12.9295 Mbytes +Step Temp TotEng Press Pxx Pyy Pzz Pxy Pxz Pyz Lx Ly Lz Volume eatoms + 0 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079 + 1 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079 +Loop time of 0.000172138 on 1 procs for 1 steps with 3 atoms + +Performance: 501.922 ns/day, 0.048 hours/ns, 5809.285 timesteps/s +81.3% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 6.6996e-05 | 6.6996e-05 | 6.6996e-05 | 0.0 | 38.92 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 1.9073e-06 | 1.9073e-06 | 1.9073e-06 | 0.0 | 1.11 +Output | 9.7036e-05 | 9.7036e-05 | 9.7036e-05 | 0.0 | 56.37 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 6.199e-06 | | | 3.60 + +Nlocal: 3 ave 3 max 3 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 78 ave 78 max 78 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 7 ave 7 max 7 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 14 ave 14 max 14 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 14 +Ave neighs/atom = 4.66667 +Neighbor list builds = 0 +Dangerous builds = 0 +write_data data.msmeam + +print "All done!" +All done! 
+Total wall time: 0:00:00 + diff --git a/examples/meam/msmeam/msmeam.dump.bu b/examples/meam/msmeam/msmeam.dump.bu new file mode 100644 index 0000000000..039f630073 --- /dev/null +++ b/examples/meam/msmeam/msmeam.dump.bu @@ -0,0 +1,24 @@ +ITEM: TIMESTEP +0 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS pp pp pp +-4 4 +-4 4 +-4 4 +ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0 +2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0 +3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0 +ITEM: TIMESTEP +1 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS pp pp pp +-4 4 +-4 4 +-4 4 +ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0 +2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0 +3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0 diff --git a/examples/meam/msmeam/potential.mod b/examples/meam/msmeam/potential.mod new file mode 100644 index 0000000000..760cc93503 --- /dev/null +++ b/examples/meam/msmeam/potential.mod @@ -0,0 +1,25 @@ +# NOTE: This script can be modified for different pair styles +# See in.elastic for more info. 
+ +variable Pu string H +print "potential chosen ${Pu}" +# Choose potential +pair_style meam/ms +print "we just executed" + +pair_coeff * * library.msmeam ${Pu} Ga4 HGa.meam ${Pu} Ga4 +# Setup neighbor style +neighbor 1.0 bin +neigh_modify once no every 1 delay 0 check yes + +# Setup minimization style +variable dmax equal 1.0e-2 +min_style cg +min_modify dmax ${dmax} line quadratic +compute eng all pe/atom +compute eatoms all reduce sum c_eng + +# Setup output +thermo 100 +thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms +thermo_modify norm yes diff --git a/lib/gpu/Makefile.lammps.standard b/lib/gpu/Makefile.lammps.standard index 9526e8e373..0bb3394b3e 100644 --- a/lib/gpu/Makefile.lammps.standard +++ b/lib/gpu/Makefile.lammps.standard @@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda endif gpu_SYSINC = -gpu_SYSLIB = -lcudart -lcuda +gpu_SYSLIB = -lcudart -lcuda -lcufft gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 56942d3f3c..298d404117 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -1,9 +1,17 @@ +# Common headers for kernels +PRE1_H = lal_preprocessor.h lal_aux_fun1.h + # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \ lal_pre_cuda_hip.h -ALL_H = $(NVD_H) $(wildcard ./lal_*.h) +# Headers for Host files +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ + lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ + lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ + lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H) + # Source files SRCS := $(wildcard ./lal_*.cpp) OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o)) @@ -54,13 +62,40 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > 
$(OBJ_DIR)/pppm_d_cubin.h -$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) +$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu $(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@ # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H) + $(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT @@ -77,10 +112,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp $(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu - $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu + $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu - $(CUDA) -o $@ -c cudpp_mini/scan_app.cu + $(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini #endif # build libgpu.a diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 
2ff98827d4..d318da15dd 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h) OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h # Headers for Host files -HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \ +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H) @@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra. $(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h $(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h; +$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h + $(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h; + $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; diff --git a/lib/gpu/geryon/hip_macros.h b/lib/gpu/geryon/hip_macros.h index 96313ec87e..e16caf4944 100644 --- a/lib/gpu/geryon/hip_macros.h +++ b/lib/gpu/geryon/hip_macros.h @@ -26,6 +26,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h index ac2e6cc682..19c8ff4b6c 100644 --- a/lib/gpu/geryon/nvd_macros.h +++ b/lib/gpu/geryon/nvd_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 4163d40881..588c53c8fa 100644 --- a/lib/gpu/geryon/ocl_device.h +++ 
b/lib/gpu/geryon/ocl_device.h @@ -309,15 +309,14 @@ class UCL_Device { /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes - inline size_t max_pitch(const int i) { return 0; } + inline size_t max_pitch(const int) { return 0; } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported() { return sharing_supported(_device); } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ - inline bool sharing_supported(const int i) - { return true; } + inline bool sharing_supported(const int) { return true; } /// True if the device is a sub-device inline bool is_subdevice() diff --git a/lib/gpu/geryon/ocl_macros.h b/lib/gpu/geryon/ocl_macros.h index 5e5a190ede..652d7795e9 100644 --- a/lib/gpu/geryon/ocl_macros.h +++ b/lib/gpu/geryon/ocl_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index bfc260889a..5d8b9808bd 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o, template inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, - const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){ cl_mem_flags buffer_perm; cl_map_flags map_perm; if (kind==UCL_READ_ONLY) { @@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t 
/*dst_offset*/, const size_t src_offset) { if (src.cbegin()==dst.cbegin()) { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 1S\n"; @@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t dst_offset, const size_t /*src_offset*/) { if (src.cbegin()==dst.cbegin()) { if (block) ucl_sync(cq); #ifdef UCL_DBG_MEM_TRACE diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 8ddde5b2a3..87db3794a6 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -35,19 +35,19 @@ class UCL_Texture { UCL_Texture() {} ~UCL_Texture() {} /// Construct with a specified texture reference - inline UCL_Texture(UCL_Program &prog, const char *texture_name) { } + inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Set the texture reference for this object - inline void get_texture(UCL_Program &prog, const char *texture_name) { } + inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Bind a float array where each fetch grabs a vector of length numel template - inline void bind_float(mat_typ &vec, const unsigned numel) { } + inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { } /// Unbind the texture reference from the memory allocation inline void unbind() { } /// Make a texture reference available to kernel - inline void allow(UCL_Kernel &kernel) { } + inline void allow(UCL_Kernel & /*kernel*/) { } private: friend class UCL_Kernel; @@ -62,7 +62,7 @@ class UCL_Const { inline UCL_Const(UCL_Program &prog, const char *global_name) { get_global(prog,global_name); } /// Set the global reference for this object - inline void get_global(UCL_Program &prog, const char *global_name) { + inline void get_global(UCL_Program &prog, const char * /*global_name*/) { if (_active) { 
CL_DESTRUCT_CALL(clReleaseContext(_context)); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 189871e631..8f55a91a28 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -71,7 +71,7 @@ class UCL_Timer { inline void init(UCL_Device &dev) { init(dev,dev.cq()); } /// Initialize command queue for timing - inline void init(UCL_Device &dev, command_queue &cq) { + inline void init(UCL_Device & /*dev*/, command_queue &cq) { clear(); _cq=cq; clRetainCommandQueue(_cq); diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c906a14f30..94b57f7a09 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> { // Should never be here template struct _host_host_copy { template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) { assert(0==1); } template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) { assert(0==1); } }; @@ -470,24 +469,22 @@ template struct _ucl_cast_copy { // Neither on host or both on host template <> struct _ucl_cast_copy<1,1> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const 
size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; @@ -495,24 +492,22 @@ template <> struct _ucl_cast_copy<1,1> { // Neither on host or both on host template <> struct _ucl_cast_copy<0,0> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 9158e145b3..5e281fef07 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not 
prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -230,8 +230,8 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols) { + inline void view_offset(const size_t offset,ucl_type &input, + const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 2f49f9f633..9f734ac40c 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template - inline void view(ptr_type *input, const size_t rows, const size_t cols, + inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols, UCL_Device &dev) { #ifdef UCL_DEBUG assert(rows==1); @@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + inline void view_offset(const size_t 
offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h index a10f3cdb3f..9bc2c40fe2 100644 --- a/lib/gpu/geryon/ucl_s_obj_help.h +++ b/lib/gpu/geryon/ucl_s_obj_help.h @@ -27,7 +27,7 @@ template struct _ucl_s_obj_help; // -- Can potentially use same memory if shared by accelerator template <> struct _ucl_s_obj_help<1> { template - static inline int alloc(t1 &host, t2 &device, t3 &_buffer, + static inline int alloc(t1 &host, t2 &device, t3 & /*_buffer*/, const int cols, UCL_Device &acc, const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { @@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> { } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,async); } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cq); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - const bool async) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cols,cq); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, - t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,rows,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, - t3 &buffer, command_queue &cq) { + static 
inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,rows,cols,cq); } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) { if (device.kind()==UCL_VIEW) { device.view(host); return UCL_SUCCESS; @@ -353,7 +349,7 @@ template struct _ucl_s_obj_help { } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) { int err=buff.resize(cols); if (err!=UCL_SUCCESS) return err; diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp new file mode 100644 index 0000000000..5e19997913 --- /dev/null +++ b/lib/gpu/lal_amoeba.cpp @@ -0,0 +1,322 @@ +/*************************************************************************** + amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the amoeba pair style. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "amoeba_cl.h" +#elif defined(USE_CUDART) +const char *amoeba=0; +#else +#include "amoeba_cubin.h" +#endif + +#include "lal_amoeba.h" +#include +namespace LAMMPS_AL { +#define AmoebaT Amoeba + +extern Device device; + +template +AmoebaT::Amoeba() : BaseAmoeba(), + _allocated(false) { +} + +template +AmoebaT::~Amoeba() { + clear(); +} + +template +int AmoebaT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double * /*host_special_repel*/, + const double * /*host_special_disp*/, + const double *host_special_mpole, + const double * /*host_special_polar_wscale*/, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,amoeba, + "k_amoeba_multipole", "k_amoeba_udirect2b", + "k_amoeba_umutual2b", "k_amoeba_polar", + "k_amoeba_fphi_uind", "k_amoeba_fphi_mpole", + "k_amoeba_short_nbor", "k_amoeba_special15"); + if (success!=0) + return success; + + // If atom type constants 
fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = host_dirdamp[i]; + host_write[i].w = host_amtype2class[i]; + } + + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = (numtyp)0; + host_write2[i].w = (numtyp)0; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_hal[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=host_special_mpole[i]; + } + ucl_copy(sp_amoeba,dview,5,false); + + _polar_dscale = polar_dscale; + _polar_uscale = polar_uscale; + + _allocated=true; + this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + + sp_amoeba.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); + return 0; +} + +template +void AmoebaT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff_amtype.clear(); + coeff_amclass.clear(); + 
sp_amoeba.clear(); + + this->clear_atomic(); +} + +template +double AmoebaT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Amoeba); +} + +// --------------------------------------------------------------------------- +// Calculate the multipole real-space term, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step for AMOEBA + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Launch the real-space permanent field kernel +// --------------------------------------------------------------------------- +template +int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + 
+ int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_udirect2b.set_size(GX,BX); + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Launch the real-space induced field kernel, returning field and fieldp +// --------------------------------------------------------------------------- +template +int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, 
&this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Launch the polar real-space kernel, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + */ + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, 
&this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_polar_avail = false; + + return GX; +} + +template class Amoeba; +} diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu new file mode 100644 index 0000000000..f572d3ebd0 --- /dev/null +++ b/lib/gpu/lal_amoeba.cu @@ -0,0 +1,2099 @@ +// ************************************************************************** +// amoeba.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the amoeba pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_aux_fun1.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define 
store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, 
red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + 
red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + int m; + for (m = 1; m < 6; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 6; m++) bn[m] *= felec; + + numtyp term1,term2,term3; + numtyp term4,term5,term6; + + term1 = ci*ck; + term2 = ck*dir - ci*dkr + dik; + term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; + term5 = qir*qkr; + numtyp scalek = (numtyp)1.0 - factor_mpole; + rr1 = bn[0] - scalek*rr1; + rr3 = bn[1] - scalek*rr3; + rr5 = bn[2] - scalek*rr5; + rr7 = bn[3] - scalek*rr7; + rr9 = bn[4] - scalek*rr9; + rr11 = bn[5] - scalek*rr11; + numtyp e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; + + // find standard multipole intermediates for force and torque + + numtyp de = term1*rr3 + term2*rr5 + term3*rr7 + 
term4*rr9 + term5*rr11; + term1 = -ck*rr3 + dkr*rr5 - qkr*rr7; + term2 = ci*rr3 + dir*rr5 + qir*rr7; + term3 = (numtyp)2.0 * rr5; + term4 = (numtyp)2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); + term5 = (numtyp)2.0 * (-ci*rr5-dir*rr7-qir*rr9); + term6 = (numtyp)4.0 * rr7; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply 
Thole polarization damping to scale factors + + numtyp sc3 = (numtyp)1.0; + numtyp sc5 = (numtyp)1.0; + numtyp sc7 = (numtyp)1.0; + for (k = 0; k < 3; k++) { + rc3[k] = (numtyp)0.0; + rc5[k] = (numtyp)0.0; + rc7[k] = (numtyp)0.0; + } + + // apply Thole polarization damping to scale factors + + numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] + numtyp tmp = r*ucl_recip(damp); + damp = pgamma * (tmp*tmp*tmp); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + sc3 = (numtyp)1.0 - expdamp; + sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp; + sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp; + numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv; + numtyp temp5 = damp; + numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp; + rc3[0] = xr * temp3; + rc3[1] = yr * temp3; + rc3[2] = zr * temp3; + rc5[0] = rc3[0] * temp5; + rc5[1] = rc3[1] * temp5; + rc5[2] = rc3[2] * temp5; + rc7[0] = rc5[0] * temp7; + rc7[1] = rc5[1] * temp7; + rc7[2] = rc5[2] * temp7; + } + + psc3 = (numtyp)1.0 - sc3*factor_pscale; + psc5 = (numtyp)1.0 - sc5*factor_pscale; + psc7 = (numtyp)1.0 - sc7*factor_pscale; + dsc3 = (numtyp)1.0 - sc3*factor_dscale; + dsc5 = (numtyp)1.0 - sc5*factor_dscale; + dsc7 = (numtyp)1.0 - sc7*factor_dscale; + usc3 = (numtyp)1.0 - sc3*factor_uscale; + usc5 = (numtyp)1.0 - sc5*factor_uscale; + psr3 = bn[1] - psc3*rr3; + psr5 = bn[2] - psc5*rr5; + psr7 = bn[3] - psc7*rr7; + dsr3 = bn[1] - dsc3*rr3; + dsr5 = bn[2] - dsc5*rr5; + dsr7 = bn[3] - dsc7*rr7; + usr5 = bn[2] - usc5*rr5; + for (k = 0; k < 3; k++) { + prc3[k] = rc3[k] * factor_pscale; + prc5[k] = rc5[k] * factor_pscale; + prc7[k] = rc7[k] * factor_pscale; + drc3[k] = rc3[k] * factor_dscale; + drc5[k] = rc5[k] * factor_dscale; + drc7[k] = rc7[k] * factor_dscale; + urc3[k] = rc3[k] * factor_uscale; + urc5[k] = rc5[k] * factor_uscale; + } + } else { // damp == 0: ??? 
+ } + + // get the induced dipole field used for dipole torques + + numtyp tix3 = psr3*ukx + dsr3*ukxp; + numtyp tiy3 = psr3*uky + dsr3*ukyp; + numtyp tiz3 = psr3*ukz + dsr3*ukzp; + numtyp tuir = -psr5*ukr - dsr5*ukrp; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp); + numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); + numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); + tuir = -psr7*ukr - dsr7*ukrp; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the dEd/dR terms used for direct polarization force + + term1 = bn[2] - dsc3*rr5; + term2 = bn[3] - dsc5*rr7; + term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr - dsr5*xr; + term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; + term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; + numtyp tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr - dsr5*yr; + term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1]; + term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr; + numtyp tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 + + 
(numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2]; + term4 = rr3*drc3[2] - term1*zr - dsr5*zr; + term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2]; + term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2]; + term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr; + numtyp tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*xr*yr - rr3*yr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0]; + term7 = rr5*drc5[0] - term2*xr; + numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6; + + term3 = term1*xr*zr - rr3*zr*drc3[0]; + term5 = term2*xr*zr - rr5*zr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0]; + numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*yr*zr - rr3*zr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1]; + term7 = rr5*drc5[1] - term2*yr; + numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - 
dkr*term5 + + (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp; + numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp; + numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp; + + numtyp frcx = depx; + numtyp frcy = depy; + numtyp frcz = depz; + + // get the dEp/dR terms used for direct polarization force + + // tixx and tkxx + term1 = bn[2] - psc3*rr5; + term2 = bn[3] - psc5*rr7; + term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr - psr5*xr; + term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0]; + term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr; + tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + // tiyy and tkyy + term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr - psr5*yr; + term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1]; + term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr; + tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + // tizz and tkzz + term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2]; + term4 = rr3*prc3[2] - term1*zr - psr5*zr; + term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2]; + term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2]; + term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr; + tizz = ci*term3 + 
diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tixy and tkxy + term3 = term1*xr*yr - rr3*yr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0]; + term7 = rr5*prc5[0] - term2*xr; + tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6; + + // tixz and tkxz + term3 = term1*xr*zr - rr3*zr*prc3[0]; + term5 = term2*xr*zr - rr5*zr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0]; + tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tiyz and tkyz + term3 = term1*yr*zr - rr3*zr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1]; + term7 = rr5*prc5[1] - term2*yr; + tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + 
+ // get the dtau/dr terms used for mutual polarization force + // poltyp == MUTUAL && amoeba + + term1 = bn[2] - usc3*rr5; + term2 = bn[3] - usc5*rr7; + term3 = usr5 + term1; + term4 = rr3 * factor_uscale; + term5 = -xr*term3 + rc3[0]*term4; + term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0]; + tixx = uix*term5 + uir*term6; + tkxx = ukx*term5 + ukr*term6; + + term5 = -yr*term3 + rc3[1]*term4; + term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1]; + tiyy = uiy*term5 + uir*term6; + tkyy = uky*term5 + ukr*term6; + + term5 = -zr*term3 + rc3[2]*term4; + term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2]; + tizz = uiz*term5 + uir*term6; + tkzz = ukz*term5 + ukr*term6; + + term4 = -usr5 * yr; + term5 = -xr*term1 + rr3*urc3[0]; + term6 = xr*yr*term2 - rr5*yr*urc5[0]; + tixy = uix*term4 + uiy*term5 + uir*term6; + tkxy = ukx*term4 + uky*term5 + ukr*term6; + + term4 = -usr5 * zr; + term6 = xr*zr*term2 - rr5*zr*urc5[0]; + tixz = uix*term4 + uiz*term5 + uir*term6; + tkxz = ukx*term4 + ukz*term5 + ukr*term6; + + term5 = -yr*term1 + rr3*urc3[1]; + term6 = yr*zr*term2 - rr5*zr*urc5[1]; + tiyz = uiy*term4 + uiz*term5 + uir*term6; + tkyz = uky*term4 + ukz*term5 + ukr*term6; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp + + tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + f.x += frcx; + f.y += frcy; + f.z += frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) 
{ + int offset=ii; + for (int k=0; k +class Amoeba : public BaseAmoeba { + public: + Amoeba(); + ~Amoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; + /// Special amoeba values [0-4]: + /// sp_amoeba.x = special_hal + /// sp_amoeba.y = 
special_polar_pscale, + /// sp_amoeba.z = special_polar_piscale + /// sp_amoeba.w = special_mpole + UCL_D_Vec sp_amoeba; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + protected: + bool _allocated; + int multipole_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + +}; + +} + +#endif diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp new file mode 100644 index 0000000000..995dfbe95f --- /dev/null +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -0,0 +1,213 @@ +/*************************************************************************** + amoeba_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to amoeba acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_amoeba.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Amoeba AMOEBAMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double 
*host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale) { + AMOEBAMF.clear(); + gpu_mode=AMOEBAMF.device->gpu_mode(); + double gpu_split=AMOEBAMF.device->particle_split(); + int first_gpu=AMOEBAMF.device->first_device(); + int last_gpu=AMOEBAMF.device->last_device(); + int world_me=AMOEBAMF.device->world_me(); + int gpu_rank=AMOEBAMF.device->gpu_rank(); + int procs_per_gpu=AMOEBAMF.device->procs_per_gpu(); + + AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu); + + bool message=false; + if (AMOEBAMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_hal, + host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); + + AMOEBAMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + AMOEBAMF.estimate_gpu_overhead(); + return init_ok; +} + +void amoeba_gpu_clear() { + AMOEBAMF.clear(); +} + +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int 
*host_amtype, + int *host_amgroup, double **host_rpole, double ** /*host_uind*/, + double ** /*host_uinp*/, double * /*host_pval*/, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); +} + + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, 
off2, fieldp_ptr); +} + +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, off2, fieldp_ptr); +} + +void amoeba_gpu_update_fieldp(void **fieldp_ptr) { + AMOEBAMF.update_fieldp(fieldp_ptr); +} + +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); +} + +void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid, + nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi); +} + +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) { + AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec); +} + +void amoeba_setup_fft(const int numel, const int element_type) { + AMOEBAMF.setup_fft(numel, element_type); +} + +void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) { + 
AMOEBAMF.compute_fft1d(in, out, numel, mode); +} + +double amoeba_gpu_bytes() { + return AMOEBAMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 17cfa0dc5a..72cb59a912 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const { bytes+=sizeof(numtyp); if (_vel) bytes+=4*sizeof(numtyp); + if (_extra_fields>0) + bytes+=_extra_fields*sizeof(numtyp4); return bytes; } @@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } + if (_extra_fields>0) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } if (_gpu_nbor>0) { if (_bonds) { @@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) { template bool AtomT::add_fields(const bool charge, const bool rot, - const int gpu_nbor, const bool bonds, const bool vel) { + const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { bool success=true; // Ignore host/device transfers? 
int gpu_bytes=0; @@ -191,7 +199,17 @@ bool AtomT::add_fields(const bool charge, const bool rot, } } - if (bonds && !_bonds) { + if (extra_fields > 0 && _extra_fields==0) { + _extra_fields=extra_fields; + _other=true; + if (_host_view==false) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } + } + + if (bonds && _bonds==false) { _bonds=true; if (_bonds && _gpu_nbor>0) { success=success && (dev_tag.alloc(_max_atoms,*dev, @@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot, template bool AtomT::init(const int nall, const bool charge, const bool rot, - UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) { + UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { clear(); bool success=true; @@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, _q_avail=false; _quat_avail=false; _v_avail=false; + _extra_avail=false; _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; _rot=rot; _vel=vel; - _other=_charge || _rot || _vel; + _extra_fields=extra_fields; + _other=_charge || _rot || _vel || (extra_fields>0); dev=&devi; _time_transfer=0; @@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_q.init(*dev); time_quat.init(*dev); time_vel.init(*dev); + time_extra.init(*dev); + time_pos.zero(); time_q.zero(); time_quat.zero(); time_vel.zero(); + time_extra.zero(); + _time_cast=0.0; #ifdef GPU_CAST @@ -308,6 +333,8 @@ void AtomT::clear_resize() { quat.clear(); if (_vel) v.clear(); + if (_extra_fields>0) + extra.clear(); dev_cell_id.clear(); dev_particle_id.clear(); @@ -350,6 +377,7 @@ void AtomT::clear() { time_q.clear(); time_quat.clear(); time_vel.clear(); + time_extra.clear(); clear_resize(); #ifdef GPU_CAST @@ -370,12 +398,19 @@ double AtomT::host_memory_usage() const { atom_bytes+=4; if (_vel) 
atom_bytes+=4; + if (_extra_fields>0) + atom_bytes+=_extra_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } +#ifdef USE_CUDPP +#define USE_CUDPP_ARG(arg) arg +#else +#define USE_CUDPP_ARG(arg) +#endif // Sort arrays for neighbor list calculation template -void AtomT::sort_neighbor(const int num_atoms) { +void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) { #ifdef USE_CUDPP CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), (int *)dev_particle_id.begin(), diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 77c1faa784..771c2a3571 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -76,7 +76,7 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ @@ -96,7 +96,7 @@ class Atom { * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, - const bool bonds, const bool vel=false); + const bool bonds, const bool vel=false, const int extra_fields=0); /// Returns true if GPU is using charges bool charge() { return _charge; } @@ -107,6 +107,9 @@ class Atom { /// Returns true if GPU is using velocities bool velocity() { return _vel; } + /// Returns true if GPU is using extra fields + bool using_extra() { return (_extra_fields>0); } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -128,6 +131,8 @@ class Atom { time_quat.add_to_total(); if (_vel) time_vel.add_to_total(); + if (_extra_fields>0) + time_extra.add_to_total(); } /// Add copy times to timers @@ -139,6 +144,8 @@ class Atom { time_quat.zero(); 
if (_vel) time_vel.zero(); + if (_extra_fields>0) + time_extra.zero(); } /// Return the total time for host/device data transfer @@ -158,6 +165,10 @@ class Atom { total+=time_vel.total_seconds(); time_vel.zero_total(); } + if (_extra_fields>0) { + total+=time_extra.total_seconds(); + time_extra.zero_total(); + } return total+_time_transfer/1000.0; } @@ -281,7 +292,11 @@ class Atom { /// Signal that we need to transfer atom data for next timestep inline void data_unavail() - { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } + { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; } + + /// Signal that we need to transfer atom extra data for next kernel call + inline void extra_data_unavail() + { _extra_avail=false; } typedef struct { double x,y,z; } vec3d; typedef struct { numtyp x,y,z,w; } vec4d_t; @@ -312,7 +327,7 @@ class Atom { /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - inline void add_x_data(double **host_ptr, int *host_type) { + inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) { time_pos.start(); if (_x_avail==false) { #ifdef GPU_CAST @@ -426,7 +441,7 @@ class Atom { /// Copy velocities and tags to device asynchronously /** Copies nall() elements **/ - inline void add_v_data(double **host_ptr, tagint *host_tag) { + inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) { time_vel.start(); if (_v_avail==false) { #ifdef GPU_CAST @@ -450,6 +465,33 @@ class Atom { add_v_data(host_ptr,host_tag); } + // Cast extras to write buffer + template + inline void cast_extra_data(cpytyp *host_ptr) { + if (_extra_avail==false) { + double t=MPI_Wtime(); + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i=0; i<_nall*_extra_fields; i++) + extra[i]=host_ptr[i]; + _time_cast+=MPI_Wtime()-t; + } + } 
+ + // Copy extras to device + /** Copies nall()*_extra elements **/ + inline void add_extra_data() { + time_extra.start(); + if (_extra_avail==false) { + extra.update_device(_nall*_extra_fields,true); + _extra_avail=true; + } + time_extra.stop(); + } + /// Add in casting time from additional data (seconds) inline void add_cast_time(double t) { _time_cast+=t; } @@ -473,6 +515,8 @@ class Atom { UCL_Vector quat; /// Velocities UCL_Vector v; + /// Extras + UCL_Vector extra; #ifdef GPU_CAST UCL_Vector x_cast; @@ -493,7 +537,7 @@ class Atom { UCL_H_Vec host_particle_id; /// Device timers - UCL_Timer time_pos, time_q, time_quat, time_vel; + UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra; /// Geryon device UCL_Device *dev; @@ -508,11 +552,12 @@ class Atom { bool _compiled; // True if data has been copied to device already - bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; + bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized; bool alloc(const int nall); bool _allocated, _rot, _charge, _bonds, _vel, _other; + int _extra_fields; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp new file mode 100644 index 0000000000..09d7386461 --- /dev/null +++ b/lib/gpu/lal_base_amoeba.cpp @@ -0,0 +1,962 @@ +/*************************************************************************** + base_amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include "lal_base_amoeba.h" + +namespace LAMMPS_AL { +#define BaseAmoebaT BaseAmoeba + +extern Device global_device; + +template +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) { + device=&global_device; + ans=new Answer(); + nbor=new Neighbor(); + pair_program=nullptr; + ucl_device=nullptr; +} + +template +BaseAmoebaT::~BaseAmoeba() { + delete ans; + delete nbor; + k_multipole.clear(); + k_udirect2b.clear(); + k_umutual2b.clear(); + k_fphi_uind.clear(); + k_fphi_mpole.clear(); + k_polar.clear(); + k_special15.clear(); + k_short_nbor.clear(); + + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created) cufftDestroy(plan); + #endif + + if (pair_program) delete pair_program; +} + +template +int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseAmoebaT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const int maxspecial15, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name_multipole, + const char *k_name_udirect2b, + const char *k_name_umutual2b, + const char *k_name_polar, + const char *k_name_fphi_uind, + const char *k_name_fphi_mpole, + const char *k_name_short_nbor, + const char* k_name_special15) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int 
host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_charge(); + + bool charge = true; + bool rot = false; + bool vel = false; + _extra_fields = 24; // round up to accomodate quadruples of numtyp values + // rpole 13; uind 3; uinp 3; amtype, amgroup; pval + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); + if (success!=0) + return success; + + if (ucl_device!=device->gpu) _compiled=false; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + _block_bio_size=device->block_bio_pair(); + compile_kernels(*ucl_device,pair_program,k_name_multipole, + k_name_udirect2b, k_name_umutual2b,k_name_polar, + k_name_fphi_uind, k_name_fphi_mpole, + k_name_short_nbor, k_name_special15); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else { + _nbor_data=&(nbor->dev_nbor); + } + + bool alloc_packed=false; + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, + _gpu_host,max_nbors,cell_size,alloc_packed, + _threads_per_atom); + if (success!=0) + return success; + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + _maxspecial=maxspecial; + _maxspecial15=maxspecial15; + + // allocate per-atom array tep + + int ef_nall=nlocal; //nall; + if (ef_nall==0) + ef_nall=2000; + + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _max_fieldp_size = _max_tep_size; + 
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _max_thetai_size = 0; + + _nmax = nall; + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + fft_plan_created = false; + #endif + + #ifdef ASYNC_DEVICE_COPY + _end_command_queue=ucl_device->num_queues(); + ucl_device->push_command_queue(); + #endif + + return success; +} + +template +void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); +} + +template +void BaseAmoebaT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + time_pair.clear(); + hd_balancer.clear(); + + dev_short_nbor.clear(); + nbor->clear(); + ans->clear(); + + _tep.clear(); + _fieldp.clear(); + _thetai1.clear(); + _thetai2.clear(); + _thetai3.clear(); + _igrid.clear(); + _fdip_phi1.clear(); + _fdip_phi2.clear(); + _fdip_sum_phi.clear(); + _cgrid_brick.clear(); + + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + 
return nullptr; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return 0; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + + // add one-five neighbors + + if (_maxspecial15>0) { + UCL_H_Vec view_nspecial15; + UCL_H_Vec view_special15; + view_nspecial15.view(nspecial15,nall,*ucl_device); + view_special15.view(special15[0],nall*_maxspecial15,*ucl_device); + ucl_copy(dev_nspecial15,view_nspecial15,nall,false); + ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false); + nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall); + + add_onefive_neighbors(); + } + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + return mn; +} + +// --------------------------------------------------------------------------- +// Prepare for multiple kernel calls in a time step: +// - reallocate per-atom arrays, if needed +// - transfer extra data from host to device +// - build the full neighbor lists for use by different kernels +// --------------------------------------------------------------------------- + +template +int** BaseAmoebaT::precompute(const 
int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&jnum, const double cpu_time, + bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) { + acc_timers(); + if (eatom) _eflag=2; + else if (eflag_in) _eflag=1; + else _eflag=0; + if (vatom) _vflag=2; + else if (vflag_in) _vflag=1; + else _vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (_eflag) _eflag=2; + if (_vflag) _vflag=2; + #endif + + set_kernel(_eflag,_vflag); + + // ------------------- Resize 1-5 neighbor arrays ------------------------ + + if (nall>_nmax) { + _nmax = nall; + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + 
atom->add_x_data(host_x,host_type); + } + atom->add_q_data(); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); + + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + // re-allocate dev_short_nbor if necessary + if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(inum_full)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Compute multipole real-space part +// precompute() should be already invoked before mem (re)allocation +// this is the first part in a time step done on the GPU for AMOEBA for now +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double */*host_pval*/, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool /*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double aewald, const double felec, + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { + // ------------------- Resize _tep array ------------------------ + + if (inum_full>_max_tep_size) { + _max_tep_size=static_cast(static_cast(inum_full)*1.10); + _tep.resize(_max_tep_size*4); + } + *tep_ptr=_tep.host.begin(); + + _off2_mpole = off2_mpole; + _felec = felec; + _aewald = aewald; + multipole_real(_eflag,_vflag); + + // leave the 
answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the permanent field +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr) { + // all the necessary data arrays are already copied from host to device + + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); + + *fieldp_ptr=_fieldp.host.begin(); + + // specify the correct cutoff and alpha values + _off2_polar = off2_polar; + _aewald = aewald; + udirect2b(_eflag,_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + _fieldp.update_host(_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the induced field +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, + void** /*fieldp_ptr*/) { + // only copy the necessary data arrays that are updated over the iterations + // use nullptr for the other arrays that are already copied from host to device + 
cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); + atom->add_extra_data(); + + // set the correct cutoff and alpha + _off2_polar = off2_polar; + _aewald = aewald; + // launch the kernel + umutual2b(_eflag,_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + // NOTE: move this step to update_fieldp() to delay device-host transfer + // after umutual1 and self are done on the GPU + // *fieldp_ptr=_fieldp.host.begin(); + // _fieldp.update_host(_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Prepare for umutual1() after bspline_fill() is done on host +// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed +// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 +// host_igrid is allocated with nmax by 4 +// - transfer extra data from host to device +// NOTE: can be re-used for fphi_mpole() but with a different bsorder value +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** host_igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + // update bsorder with that of the kspace solver + _bsorder = bsorder; + + // allocate or resize per-atom arrays + // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax + // will be consolidated once all terms are ready + + if (_max_thetai_size == 0) { + _max_thetai_size = static_cast(static_cast(inum_full)*1.10); + _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + 
_thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); + } else { + if ((int)_thetai1.cols()<_max_thetai_size*bsorder) { + _max_thetai_size=static_cast(static_cast(inum_full)*1.10); + _thetai1.resize(_max_thetai_size*bsorder); + _thetai2.resize(_max_thetai_size*bsorder); + _thetai3.resize(_max_thetai_size*bsorder); + _igrid.resize(_max_thetai_size*4); + + _fdip_phi1.resize(_max_thetai_size*10); + _fdip_phi2.resize(_max_thetai_size*10); + _fdip_sum_phi.resize(_max_thetai_size*20); + } + } + + #ifdef ASYNC_DEVICE_COPY + _thetai1.cq(ucl_device->cq(_end_command_queue)); + _thetai2.cq(ucl_device->cq(_end_command_queue)); + _thetai3.cq(ucl_device->cq(_end_command_queue)); + #endif + + // pack host data to device + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai1[i][j][0]; + v.y = host_thetai1[i][j][1]; + v.z = host_thetai1[i][j][2]; + v.w = host_thetai1[i][j][3]; + _thetai1[idx] = v; + } + _thetai1.update_device(true); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai2[i][j][0]; + v.y = host_thetai2[i][j][1]; + v.z = host_thetai2[i][j][2]; + v.w = host_thetai2[i][j][3]; + _thetai2[idx] = v; + } + _thetai2.update_device(true); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai3[i][j][0]; + v.y = host_thetai3[i][j][1]; + v.z = host_thetai3[i][j][2]; + v.w = host_thetai3[i][j][3]; + _thetai3[idx] = v; + } + _thetai3.update_device(true); + + for (int i = 0; i < 
inum_full; i++) { + int idx = i*4; + _igrid[idx+0] = host_igrid[i][0]; + _igrid[idx+1] = host_igrid[i][1]; + _igrid[idx+2] = host_igrid[i][2]; + } + _igrid.update_device(true); + + // _cgrid_brick holds the grid-based potential + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx * _ngridy * _ngridz; + + int numel = _num_grid_points; + if (_cgrid_brick.cols() == 0) { + int nsize=(int)(((double)numel)*1.1); + _cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); + } else if (numel > (int)_cgrid_brick.cols()) { + _cgrid_brick.resize(numel); + } +} + +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// NOTE: host_grid_brick is from ic_kspace post_convolution() +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, + void **host_fdip_phi1, + void **host_fdip_phi2, + void **host_fdip_sum_phi) +{ + int n = 0; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix][0]; + v.y = host_grid_brick[iz][iy][ix][1]; + _cgrid_brick[n] = v; + n++; + } + _cgrid_brick.update_device(_num_grid_points, false); + + #ifdef ASYNC_DEVICE_COPY + ucl_device->sync(); + #endif + + // launch the kernel with its execution configuration (see below) + fphi_uind(); + + // copy data from device to host + _fdip_phi1.update_host(_max_thetai_size*10, false); + _fdip_phi2.update_host(_max_thetai_size*10, false); + 
_fdip_sum_phi.update_host(_max_thetai_size*20, false); + + // return the pointers to the host-side arrays + *host_fdip_phi1 = _fdip_phi1.host.begin(); + *host_fdip_phi2 = _fdip_phi2.host.begin(); + *host_fdip_sum_phi = _fdip_sum_phi.host.begin(); +} + +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_uind() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + // Compute the block size and grid size to keep all cores busy + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + + time_pair.start(); + int ngridxy = _ngridx * _ngridy; + k_fphi_uind.set_size(GX,BX); + k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); + time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// fphi_mpole = multipole potential from grid (limited to polar_kspace for now) +// fphi_mpole extracts the permanent multipole potential from +// the particle mesh Ewald grid +// NOTE: host_grid_brick is from p_kspace post_convolution() +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) +{ + int n = 0; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix]; + v.y = (numtyp)0; + _cgrid_brick[n] = v; + n++; + } + _cgrid_brick.update_device(_num_grid_points, false); + + _felec = felec; + fphi_mpole(); + + _fdip_sum_phi.update_host(_max_thetai_size*20, false); + + *host_fphi = 
_fdip_sum_phi.host.begin(); +} + +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_mpole() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + // Compute the block size and grid size to keep all cores busy + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + + time_pair.start(); + int ngridxy = _ngridx * _ngridy; + k_fphi_mpole.set_size(GX,BX); + k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_sum_phi, &_bsorder, &ainum, &_felec, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); + time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { + + // cast necessary data arrays from host to device + + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); + + *tep_ptr=_tep.host.begin(); + + _off2_polar = off2_polar; + _felec = felec; + _aewald = aewald; + const int red_blocks=polar_real(_eflag,_vflag); + + // only copy answers (forces, energies and virial) back from the device + // in the last kernel (which is polar_real here) + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + + // copy tep from device to host + _tep.update_host(_max_tep_size*4,false); +} 
+ +// --------------------------------------------------------------------------- +// Return the memory bytes allocated on the host and device +// --------------------------------------------------------------------------- + +template +double BaseAmoebaT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseAmoeba); +} + +// --------------------------------------------------------------------------- +// Setup the FFT plan: only placeholder for now +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/) +{ + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) +} + +// --------------------------------------------------------------------------- +// Compute FFT on the device: only placeholder for now +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/, + const int /*numel*/, const int /*mode*/) +{ + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created == false) { + int m = numel/2; + cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); + fft_plan_created = true; + } + + // n = number of double complex + int n = numel/2; + + // copy the host array to the device (data) + UCL_Vector data; + data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE); + int m = 0; + double* d_in = (double*)in; + for (int i = 0; i < n; i++) { + data[i].x = d_in[m]; + data[i].y = d_in[m+1]; + m += 2; + } + data.update_device(false); + + // perform the in-place forward FFT + + cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device, + (cufftDoubleComplex*)&data.device, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result); + ucl_device->sync(); + 
data.update_host(false); + + // copy back the data to the host array + + m = 0; + double* d_out = (double*)out; + for (int i = 0; i < n; i++) { + d_out[m] = data[i].x; + d_out[m+1] = data[i].y; + m += 2; + } + + data.clear(); + #endif +} + +// --------------------------------------------------------------------------- +// Copy the extra data from host to device +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp, double* pval) { + // signal that we need to transfer extra data from the host + + atom->extra_data_unavail(); + + int _nall=atom->nall(); + numtyp4 *pextra=reinterpret_cast(&(atom->extra[0])); + + int n = 0; + int nstride = 1; //4; + if (rpole) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = rpole[i][0]; + pextra[idx].y = rpole[i][1]; + pextra[idx].z = rpole[i][2]; + pextra[idx].w = rpole[i][3]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = rpole[i][4]; + pextra[idx].y = rpole[i][5]; + pextra[idx].z = rpole[i][6]; + pextra[idx].w = rpole[i][8]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = rpole[i][9]; + pextra[idx].y = rpole[i][12]; + pextra[idx].z = (numtyp)amtype[i]; + pextra[idx].w = (numtyp)amgroup[i]; + } + } else { + n += 2*nstride*_nall; + } + + n += nstride*_nall; + if (uind) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = uind[i][0]; + pextra[idx].y = uind[i][1]; + pextra[idx].z = uind[i][2]; + pextra[idx].w = 0; + } + } + + n += nstride*_nall; + if (uinp) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = uinp[i][0]; + pextra[idx].y = uinp[i][1]; + pextra[idx].z = uinp[i][2]; + pextra[idx].w = 0; + } + } + + n += nstride*_nall; + if (pval) { + for (int i = 0; i < _nall; i++) { + int 
idx = n+i*nstride; + pextra[idx].x = pval[i]; + pextra[idx].y = 0; + pextra[idx].z = 0; + pextra[idx].w = 0; + } + } +} + +// --------------------------------------------------------------------------- +// Compile (load) the kernel strings and set the kernels +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname_multipole, + const char *kname_udirect2b, + const char *kname_umutual2b, + const char *kname_polar, + const char *kname_fphi_uind, + const char *kname_fphi_mpole, + const char *kname_short_nbor, + const char* kname_special15) { + if (_compiled) + return; + + if (pair_program) delete pair_program; + pair_program=new UCL_Program(dev); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen); + + k_multipole.set_function(*pair_program, kname_multipole); + k_udirect2b.set_function(*pair_program, kname_udirect2b); + k_umutual2b.set_function(*pair_program, kname_umutual2b); + k_polar.set_function(*pair_program, kname_polar); + k_fphi_uind.set_function(*pair_program, kname_fphi_uind); + k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole); + k_short_nbor.set_function(*pair_program, kname_short_nbor); + k_special15.set_function(*pair_program, kname_special15); + pos_tex.get_texture(*pair_program, "pos_tex"); + q_tex.get_texture(*pair_program, "q_tex"); + + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.has_subgroup_support()) { + int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size); + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + +} + +// --------------------------------------------------------------------------- +// Specify 1-5 neighbors from the current neighbor list +// 
--------------------------------------------------------------------------- + +template +int BaseAmoebaT::add_onefive_neighbors() { + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(ans->inum())/ + (BX/_threads_per_atom))); + + int _nall=atom->nall(); + int ainum=ans->inum(); + int nbor_pitch=nbor->nbor_pitch(); + + k_special15.set_size(GX,BX); + k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), + &atom->dev_tag, &dev_nspecial15, &dev_special15, + &ainum, &_nall, &nbor_pitch, + &_threads_per_atom); + + return GX; +} + +template class BaseAmoeba; +} diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h new file mode 100644 index 0000000000..0eaaafeb1e --- /dev/null +++ b/lib/gpu/lal_base_amoeba.h @@ -0,0 +1,325 @@ +/*************************************************************************** + base_amoeba.h + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#ifndef LAL_BASE_AMOEBA_H +#define LAL_BASE_AMOEBA_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#if defined(USE_OPENCL) +#include "geryon/ocl_texture.h" +#elif defined(USE_CUDART) +#include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +//#define ASYNC_DEVICE_COPY + +#if !defined(USE_OPENCL) && !defined(USE_HIP) +// temporary workaround for int2 also defined in cufft +#ifdef int2 +#undef int2 +#endif +#include "cufft.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseAmoeba { + public: + BaseAmoeba(); + virtual ~BaseAmoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *screen, const void *pair_program, + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); + + /// Estimate the 
overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(const int add_kernels=0); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(screen); + time_pair.add_to_total(); + atom->acc_timers(); + 
ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + int build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint **special15, + bool &success); + + /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed + virtual int** precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); + + /// Compute multipole real-space with device neighboring + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **numj, const double cpu_time, + bool &success, const double aewald, const double felec, + const double off2_mpole, double *charge, double *boxlo, + double *prd, void **tep_ptr); + + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + 
virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, void **fieldp_ptr); + + /// Compute the real space part of the induced field (umutual2b) with device neighboring + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, void **fieldp_ptr); + + /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar + virtual void precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + /// Interpolate the induced potential from the grid + virtual void compute_fphi_uind(double ****host_grid_brick, + void **host_fdip_phi1, void **host_fdip_phi2, + void **host_fdip_sum_phi); + + /// Interpolate the multipolar potential from the grid + virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, + const double felec); + + /// Compute polar real-space with device neighboring + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2_polar, + void **tep_ptr); + + // copy field and fieldp from device to host after umutual2b + virtual void update_fieldp(void **fieldp_ptr) { + *fieldp_ptr=_fieldp.host.begin(); + // _fieldp store both arrays, one after another + _fieldp.update_host(_max_fieldp_size*8,false); + } + + /// setup a plan for FFT, where size is the number of elements + + void setup_fft(const int size, const int 
element_type=0); + + /// compute forward/backward FFT on the device + + void compute_fft1d(void* in, void* out, const int numel, const int mode); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom *atom; + + UCL_Vector polar1, polar2, polar3, polar4, polar5; + + /// cast host arrays into a single array for atom->extra + void cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp, double* pval=nullptr); + + /// Per-atom arrays + UCL_Vector _tep, _fieldp; + int _nmax, _max_tep_size, _max_fieldp_size; + + int _bsorder; + UCL_Vector _thetai1, _thetai2, _thetai3; + UCL_Vector _igrid; + UCL_Vector _cgrid_brick; + UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; + int _max_thetai_size; + int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; + int _ngridx, _ngridy, _ngridz, _num_grid_points; + + int _end_command_queue; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + /// Device storage for 1-5 special neighbor counts + UCL_D_Vec dev_nspecial15; + /// Device storage for special neighbors + UCL_D_Vec dev_special15, dev_special15_t; + + int add_onefive_neighbors(); + + UCL_D_Vec dev_short_nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_fphi_uind, k_fphi_mpole; + UCL_Kernel k_special15, k_short_nbor; + inline int block_size() { return 
_block_size; } + inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {} + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _compiled; + int _block_size, _block_bio_size, _threads_per_atom; + int _extra_fields; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors; + double _gpu_overhead, _driver_overhead; + bool short_nbor_polar_avail; + UCL_D_Vec *_nbor_data; + + numtyp _aewald,_felec; + numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; + + int _eflag, _vflag; + + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); + + virtual int multipole_real(const int eflag, const int vflag) = 0; + virtual int udirect2b(const int eflag, const int vflag) = 0; + virtual int umutual2b(const int eflag, const int vflag) = 0; + virtual int fphi_uind(); + virtual int fphi_mpole(); + virtual int polar_real(const int eflag, const int vflag) = 0; + + + #if !defined(USE_OPENCL) && !defined(USE_HIP) + cufftHandle plan; + #endif + bool fft_plan_created; +}; + +} + +#endif diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index bb0e815b3f..0cfc084fa4 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index 4a59f70d83..3cd6c6030a 100644 --- 
a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,false,nlocal,nall,maxspecial); + bool charge = true; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 66e03de651..6ef1c40ca7 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,true,nlocal,nall,maxspecial); + bool charge = true; + bool rot = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index 44b86abeeb..e103699d40 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); + bool charge = false; + bool rot = false; + bool vel = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel); if (success!=0) return success; @@ -193,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall, const double cpu_time, bool &success, tagint *tag, double **host_v, const double dtinvsqrt, const int seed, const int timestep, - const int nlocal, double *boxlo, double *prd) { + const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -258,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full, const double cpu_time, bool &success, double **host_v, const double dtinvsqrt, const 
int seed, const int timestep, - double *boxlo, double *prd) { + double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index 3457955b3e..bfadfebf66 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall, else _threads_per_atom=device->threads_per_three(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index 8008b1fbb3..0d01d70fb1 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -44,19 +44,15 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const { } template -int CHARMMLongT::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double host_cut_ljsq, const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, - double **sigma, const bool mix_arithmetic) { +int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double ** /*host_offset*/, double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, FILE *_screen, + double host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, const double g_ewald, + 
const double cut_lj_innersq, const double denom_lj, double **epsilon, + double **sigma, const bool mix_arithmetic) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,charmm_long,"k_charmm_long"); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 0d9578b491..dd3ce15827 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -52,7 +52,7 @@ DeviceT::~Device() { } template -int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, +int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu, const int first_gpu_id, const int gpu_mode, const double p_split, const int t_per_atom, const double user_cell_size, char *ocl_args, @@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args) } _ocl_compile_string="-cl-mad-enable "; + #ifdef CL_VERSION_2_0 + _ocl_compile_string+="-cl-std=CL2.0 "; + #endif if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math "; _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+ std::string(OCL_PRECISION_COMPILE); @@ -438,7 +441,7 @@ template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel) { + const bool vel, const int extra_fields) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision()) @@ -467,7 +470,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields)) return -3; _data_in_estimate++; @@ -477,6 +480,9 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (vel) _data_in_estimate++; + if (extra_fields>0) + _data_in_estimate++; + } else { if (!atom.charge() && charge) _data_in_estimate++; @@ -484,7 +490,9 @@ 
int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (!atom.velocity() && vel) _data_in_estimate++; - if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) + if (atom.using_extra() && extra_fields>0) + _data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields)) return -3; } @@ -520,7 +528,7 @@ int DeviceT::init(Answer &ans, const int nlocal, template int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, - const int host_nlocal, const int nall, + const int host_nlocal, const int /*nall*/, const int maxspecial, const int gpu_host, const int max_nbors, const double cutoff, const bool pre_cut, const int threads_per_atom, diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index f5136d9fa0..3b27223007 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -61,6 +61,7 @@ class Device { * \param nall Total number of local+ghost particles * \param maxspecial Maximum mumber of special bonded atoms per atom * \param vel True if velocities need to be stored + * \param extra_fields Nonzero if extra fields need to be stored * * Returns: * - 0 if successful @@ -70,7 +71,7 @@ class Device { * - -5 Double precision is not supported on card **/ int init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Initialize the device for Atom storage only /** \param nlocal Total number of local particles to allocate memory for diff --git a/lib/gpu/lal_dpd_tstat_ext.cpp b/lib/gpu/lal_dpd_tstat_ext.cpp index 2b63bf62e7..78a1bf2d9d 100644 --- a/lib/gpu/lal_dpd_tstat_ext.cpp +++ b/lib/gpu/lal_dpd_tstat_ext.cpp @@ -28,10 +28,10 @@ static DPD DPDTMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double 
**host_a0, - double **host_gamma, double **host_sigma, double **host_cut, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen) { + double **host_gamma, double **host_sigma, double **host_cut, + double *special_lj, const int inum, + const int nall, const int /*max_nbors*/, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { DPDTMF.clear(); gpu_mode=DPDTMF.device->gpu_mode(); double gpu_split=DPDTMF.device->particle_split(); diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 2c0d63f7bf..b7bc7b958a 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, + const bool /*eatom*/, const bool /*vatom*/, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); @@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag_in, - const bool vflag_in, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, + const bool vflag_in, const bool /*eatom*/, + const bool /*vatom*/, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp new file mode 100644 index 0000000000..24ffae8de2 --- /dev/null +++ b/lib/gpu/lal_hippo.cpp @@ -0,0 +1,641 @@ +/*************************************************************************** + hippo.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the hippo pair style. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "hippo_cl.h" +#elif defined(USE_CUDART) +const char *hippo=0; +#else +#include "hippo_cubin.h" +#endif + +#include "lal_hippo.h" +#include +namespace LAMMPS_AL { +#define HippoT Hippo + +extern Device device; + +template +HippoT::Hippo() : BaseAmoeba(), + _allocated(false) { +} + +template +HippoT::~Hippo() { + clear(); + k_repulsion.clear(); + k_dispersion.clear(); + +} + +template +int HippoT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,hippo, + "k_hippo_multipole", "k_hippo_udirect2b", + "k_hippo_umutual2b", "k_hippo_polar", + "k_hippo_fphi_uind", "k_hippo_fphi_mpole", 
+ "k_hippo_short_nbor", "k_hippo_special15"); + if (success!=0) + return success; + + // specific to HIPPO + k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion"); + k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = host_dirdamp[i]; + host_write[i].w = host_amtype2class[i]; + } + + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_sizpr[i]; + host_write[i].y = host_dmppr[i]; + host_write[i].z = host_elepr[i]; + host_write[i].w = (numtyp)0; + } + + coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_rep,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = host_pcore[i]; + host_write2[i].w = host_palpha[i]; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_polar_wscale[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=host_special_mpole[i]; 
+ } + ucl_copy(sp_polar,dview,5,false); + + sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_repel[i]; + dview[i].y=host_special_disp[i]; + dview[i].z=(numtyp)0; + dview[i].w=(numtyp)0; + } + ucl_copy(sp_nonpolar,dview,5,false); + + _polar_dscale = polar_dscale; + _polar_uscale = polar_uscale; + + _allocated=true; + this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + + coeff_amclass.row_bytes() + sp_polar.row_bytes() + + sp_nonpolar.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); + return 0; +} + +template +void HippoT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff_amtype.clear(); + coeff_rep.clear(); + coeff_amclass.clear(); + sp_polar.clear(); + sp_nonpolar.clear(); + + this->clear_atomic(); +} + +template +double HippoT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Hippo); +} + +// --------------------------------------------------------------------------- +// Compute the repulsion term, returning tep +// --------------------------------------------------------------------------- +template +void HippoT::compute_repulsion(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double /*aewald*/, const double off2_repulse, + double * /*host_q*/, double * /*boxlo*/, double * /*prd*/, + double cut2, 
double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_repulse = off2_repulse; + _cut2 = cut2; + _c0 = c0; + _c1 = c1; + _c2 = c2; + _c3 = c3; + _c4 = c4; + _c5 = c5; + repulsion(this->_eflag,this->_vflag); + + // copy tep from device to host + this->_tep.update_host(this->_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Launch the repulsion kernel +// --------------------------------------------------------------------------- +template +int HippoT::repulsion(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_disp, + // at this point repuslion is the first kernel in a time step for HIPPO + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_repulse, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_repulsion.set_size(GX,BX); + k_repulsion.run(&this->atom->x, &this->atom->extra, + &coeff_rep, 
&sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_repulse, &_cut2, + &_c0, &_c1, &_c2, &_c3, &_c4, &_c5); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Compute dispersion real-space +// --------------------------------------------------------------------------- +template +void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp) { + + // cast necessary data arrays from host to device + + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr); + this->atom->add_extra_data(); + + this->_off2_disp = off2_disp; + this->_aewald = aewald; + dispersion_real(this->_eflag,this->_vflag); + + // only copy them back if this is the last kernel + // otherwise, commenting out these two lines to leave the answers + // (forces, energies and virial) on the device until the last kernel + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); +} + +// --------------------------------------------------------------------------- +// Launch the dispersion real-space kernel +// --------------------------------------------------------------------------- +template +int HippoT::dispersion_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff 
off2_disp, + // at this point dispersion is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_disp, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_dispersion.set_size(GX,BX); + k_dispersion.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_disp); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Compute the multipole real-space term, returning tep +// --------------------------------------------------------------------------- +template +void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double* host_pval, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool /*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double aewald, const double felec, + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { + + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval); + this->atom->add_extra_data(); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + 
this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_mpole = off2_mpole; + this->_felec = felec; + this->_aewald = aewald; + multipole_real(this->_eflag,this->_vflag); + + // copy tep from device to host + this->_tep.update_host(this->_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Launch the multipole real-space kernel +// --------------------------------------------------------------------------- +template +int HippoT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Compute the direct real space part of the permanent field +// returning field and fieldp +// 
--------------------------------------------------------------------------- +template +void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double* host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr) { + + // all the necessary data arrays are already copied from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval); + this->atom->add_extra_data(); + + *fieldp_ptr=this->_fieldp.host.begin(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + udirect2b(this->_eflag,this->_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + this->_fieldp.update_host(this->_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Launch the real-space permanent field kernel +// --------------------------------------------------------------------------- +template +int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_udirect2b.set_size(GX,BX); + 
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Compute the direct real space term of the induced field +// returning field and fieldp +// --------------------------------------------------------------------------- +template +void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) { + + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + umutual2b(this->_eflag,this->_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + // NOTE: move this step to update_fieldp() to delay device-host transfer + // *fieldp_ptr=this->_fieldp.host.begin(); + // this->_fieldp.update_host(this->_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Launch the real-space induced field kernel +// --------------------------------------------------------------------------- +template +int HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int 
GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); + + *tep_ptr=this->_tep.host.begin(); + + this->_off2_polar = off2_polar; + this->_felec = felec; + this->_aewald = aewald; + const int red_blocks=polar_real(this->_eflag,this->_vflag); + + // only copy answers (forces, energies and virial) back from the device + // in the last kernel in a timestep (which is polar_real here) + 
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); + + // copy tep from device to host + this->_tep.update_host(this->_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Launch the polar real-space kernel +// --------------------------------------------------------------------------- +template +int HippoT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + */ + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + 
this->short_nbor_polar_avail = false; + + return GX; +} + +template class Hippo; +} diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu new file mode 100644 index 0000000000..99e20db223 --- /dev/null +++ b/lib/gpu/lal_hippo.cu @@ -0,0 +1,2519 @@ +// ************************************************************************** +// hippo.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the hippo pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_hippo_extra.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define store_answers_hippo_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + 
tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; 
s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r 
= 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii cut2) { + numtyp r3 = r2 * r; + numtyp r4 = r2 * r2; + numtyp r5 = r2 * r3; + numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0; + numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp)4.0*c4*r3 + + (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1; + dtaper *= e * rr1; + e *= taper; + frcx = frcx*taper - dtaper*xr; + frcy = frcy*taper - dtaper*yr; + frcz = frcz*taper - dtaper*zr; + ttmix *= taper; + ttmiy *= taper; + ttmiz *= taper; + } + + energy += e; + + // increment force-based gradient and torque on atom I + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + // increment the internal virial tensor components + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + int m; + for (m = 1; m < 6; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; 
m < 6; m++) bn[m] *= felec; + + numtyp term1,term2,term3; + numtyp term4,term5,term6; + + term1 = corei*corek; + numtyp term1i = corek*vali; + numtyp term2i = corek*dir; + numtyp term3i = corek*qir; + numtyp term1k = corei*valk; + numtyp term2k = -corei*dkr; + numtyp term3k = corei*qkr; + numtyp term1ik = vali*valk; + numtyp term2ik = valk*dir - vali*dkr + dik; + numtyp term3ik = vali*qkr + valk*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); + numtyp term4ik = dir*qkr - dkr*qir - 4.0*qik; + numtyp term5ik = qir*qkr; + numtyp dmpi[9],dmpj[9]; + numtyp dmpij[11]; + damppole(r,11,alphai,alphak,dmpi,dmpj,dmpij); + numtyp scalek = factor_mpole; + numtyp rr1i = bn[0] - ((numtyp)1.0-scalek*dmpi[0])*rr1; + numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3; + numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5; + numtyp rr7i = bn[3] - ((numtyp)1.0-scalek*dmpi[6])*rr7; + numtyp rr1k = bn[0] - ((numtyp)1.0-scalek*dmpj[0])*rr1; + numtyp rr3k = bn[1] - ((numtyp)1.0-scalek*dmpj[2])*rr3; + numtyp rr5k = bn[2] - ((numtyp)1.0-scalek*dmpj[4])*rr5; + numtyp rr7k = bn[3] - ((numtyp)1.0-scalek*dmpj[6])*rr7; + numtyp rr1ik = bn[0] - ((numtyp)1.0-scalek*dmpij[0])*rr1; + numtyp rr3ik = bn[1] - ((numtyp)1.0-scalek*dmpij[2])*rr3; + numtyp rr5ik = bn[2] - ((numtyp)1.0-scalek*dmpij[4])*rr5; + numtyp rr7ik = bn[3] - ((numtyp)1.0-scalek*dmpij[6])*rr7; + numtyp rr9ik = bn[4] - ((numtyp)1.0-scalek*dmpij[8])*rr9; + numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11; + rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1; + rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; + numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + + term1i*rr1i + term1k*rr1k + term1ik*rr1ik + + term2i*rr3i + term2k*rr3k + term2ik*rr3ik + + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; + + // find damped multipole intermediates for force and torque + + numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + + term1i*rr3i + term1k*rr3k + term1ik*rr3ik + + term2i*rr5i + term2k*rr5k + term2ik*rr5ik + + term3i*rr7i + term3k*rr7k + 
term3ik*rr7ik; + term1 = -corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik; + term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik; + term3 = (numtyp)2.0 * rr5ik; + term4 = (numtyp)-2.0 * (corek*rr5i+valk*rr5ik - dkr*rr7ik+qkr*rr9ik); + term5 = (numtyp)-2.0 * (corei*rr5k+vali*rr5ik + dir*rr7ik+qir*rr9ik); + term6 = (numtyp)4.0 * rr7ik; + rr3 = rr3ik; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = 
(bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply charge penetration damping to scale factors + + numtyp corek = coeff_amclass[jtype].z; // pcore[jclass]; + numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; + numtyp valk = polar6[j].x; + numtyp dmpi[9],dmpk[9]; + numtyp dmpik[9]; + damppole(r,9,alphai,alphak,dmpi,dmpk,dmpik); + numtyp rr3core = bn[1] - ((numtyp)1.0-factor_dscale)*rr3; + numtyp rr5core = bn[2] - ((numtyp)1.0-factor_dscale)*rr5; + + numtyp rr3i = bn[1] - ((numtyp)1.0-factor_dscale*dmpi[2])*rr3; + numtyp rr5i = bn[2] - ((numtyp)1.0-factor_dscale*dmpi[4])*rr5; + numtyp rr7i = bn[3] - ((numtyp)1.0-factor_dscale*dmpi[6])*rr7; + numtyp rr9i = bn[4] - ((numtyp)1.0-factor_dscale*dmpi[8])*rr9; + numtyp rr3k = bn[1] - ((numtyp)1.0-factor_dscale*dmpk[2])*rr3; + numtyp rr5k = bn[2] - ((numtyp)1.0-factor_dscale*dmpk[4])*rr5; + numtyp rr7k = bn[3] - ((numtyp)1.0-factor_dscale*dmpk[6])*rr7; + numtyp rr9k = bn[4] - ((numtyp)1.0-factor_dscale*dmpk[8])*rr9; + numtyp rr5ik = bn[2] - ((numtyp)1.0-factor_wscale*dmpik[4])*rr5; + numtyp rr7ik = bn[3] - ((numtyp)1.0-factor_wscale*dmpik[6])*rr7; + + // get the induced dipole field used for dipole torques + + numtyp tix3 = (numtyp)2.0*rr3i*ukx; + numtyp tiy3 = (numtyp)2.0*rr3i*uky; + numtyp tiz3 = (numtyp)2.0*rr3i*ukz; + numtyp tuir = (numtyp)-2.0*rr5i*ukr; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)4.0 * (rr5i*ukx); + numtyp tiy5 = (numtyp)4.0 * (rr5i*uky); + numtyp tiz5 = (numtyp)4.0 * (rr5i*ukz); + tuir = (numtyp)-2.0*rr7i*ukr; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the field 
gradient for direct polarization force + + numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; + numtyp term1k,term2k,term3k,term4k,term5k,term6k,term7k,term8k; + numtyp term1core; + numtyp tixx,tiyy,tizz,tixy,tixz,tiyz; + numtyp tkxx,tkyy,tkzz,tkxy,tkxz,tkyz; + + term1i = rr3i - rr5i*xr*xr; + term1core = rr3core - rr5core*xr*xr; + term2i = (numtyp)2.0*rr5i*xr ; + term3i = rr7i*xr*xr - rr5i; + term4i = (numtyp)2.0*rr5i; + term5i = (numtyp)5.0*rr7i*xr; + term6i = rr9i*xr*xr; + term1k = rr3k - rr5k*xr*xr; + term2k = (numtyp)2.0*rr5k*xr; + term3k = rr7k*xr*xr - rr5k; + term4k = (numtyp)2.0*rr5k; + term5k = (numtyp)5.0*rr7k*xr; + term6k = rr9k*xr*xr; + tixx = vali*term1i + corei*term1core + dix*term2i - dir*term3i - + qixx*term4i + qix*term5i - qir*term6i + (qiy*yr+qiz*zr)*rr7i; + tkxx = valk*term1k + corek*term1core - dkx*term2k + dkr*term3k - + qkxx*term4k + qkx*term5k - qkr*term6k + (qky*yr+qkz*zr)*rr7k; + + term1i = rr3i - rr5i*yr*yr; + term1core = rr3core - rr5core*yr*yr; + term2i = (numtyp)2.0*rr5i*yr; + term3i = rr7i*yr*yr - rr5i; + term4i = (numtyp)2.0*rr5i; + term5i = (numtyp)5.0*rr7i*yr; + term6i = rr9i*yr*yr; + term1k = rr3k - rr5k*yr*yr; + term2k = (numtyp)2.0*rr5k*yr; + term3k = rr7k*yr*yr - rr5k; + term4k = (numtyp)2.0*rr5k; + term5k = (numtyp)5.0*rr7k*yr; + term6k = rr9k*yr*yr; + tiyy = vali*term1i + corei*term1core + diy*term2i - dir*term3i - + qiyy*term4i + qiy*term5i - qir*term6i + (qix*xr+qiz*zr)*rr7i; + tkyy = valk*term1k + corek*term1core - dky*term2k + dkr*term3k - + qkyy*term4k + qky*term5k - qkr*term6k + (qkx*xr+qkz*zr)*rr7k; + + term1i = rr3i - rr5i*zr*zr; + term1core = rr3core - rr5core*zr*zr; + term2i = (numtyp)2.0*rr5i*zr; + term3i = rr7i*zr*zr - rr5i; + term4i = (numtyp)2.0*rr5i; + term5i = (numtyp)5.0*rr7i*zr; + term6i = rr9i*zr*zr; + term1k = rr3k - rr5k*zr*zr; + term2k = (numtyp)2.0*rr5k*zr; + term3k = rr7k*zr*zr - rr5k; + term4k = (numtyp)2.0*rr5k; + term5k = (numtyp)5.0*rr7k*zr; + term6k = rr9k*zr*zr; + tizz = vali*term1i + 
corei*term1core + diz*term2i - dir*term3i - + qizz*term4i + qiz*term5i - qir*term6i + (qix*xr+qiy*yr)*rr7i; + tkzz = valk*term1k + corek*term1core - dkz*term2k + dkr*term3k - + qkzz*term4k + qkz*term5k - qkr*term6k + (qkx*xr+qky*yr)*rr7k; + + term2i = rr5i*xr ; + term1i = yr * term2i; + term1core = rr5core*xr*yr; + term3i = rr5i*yr; + term4i = yr * (rr7i*xr); + term5i = (numtyp)2.0*rr5i; + term6i = (numtyp)2.0*rr7i*xr; + term7i = (numtyp)2.0*rr7i*yr; + term8i = yr*rr9i*xr; + term2k = rr5k*xr; + term1k = yr * term2k; + term3k = rr5k*yr; + term4k = yr * (rr7k*xr); + term5k = (numtyp)2.0*rr5k; + term6k = (numtyp)2.0*rr7k*xr; + term7k = (numtyp)2.0*rr7k*yr; + term8k = yr*rr9k*xr; + tixy = -vali*term1i - corei*term1core + diy*term2i + dix*term3i - + dir*term4i - qixy*term5i + qiy*term6i + qix*term7i - qir*term8i; + tkxy = -valk*term1k - corek*term1core - dky*term2k - dkx*term3k + + dkr*term4k - qkxy*term5k + qky*term6k + qkx*term7k - qkr*term8k; + + term2i = rr5i*xr; + term1i = zr * term2i; + term1core = rr5core*xr*zr; + term3i = rr5i*zr; + term4i = zr * (rr7i*xr); + term5i = (numtyp)2.0*rr5i; + term6i = (numtyp)2.0*rr7i*xr; + term7i = (numtyp)2.0*rr7i*zr; + term8i = zr*rr9i*xr; + term2k = rr5k*xr; + term1k = zr * term2k; + term3k = rr5k*zr; + term4k = zr * (rr7k*xr); + term5k = (numtyp)2.0*rr5k; + term6k = (numtyp)2.0*rr7k*xr; + term7k = (numtyp)2.0*rr7k*zr; + term8k = zr*rr9k*xr; + tixz = -vali*term1i - corei*term1core + diz*term2i + dix*term3i - + dir*term4i - qixz*term5i + qiz*term6i + qix*term7i - qir*term8i; + tkxz = -valk*term1k - corek*term1core - dkz*term2k - dkx*term3k + + dkr*term4k - qkxz*term5k + qkz*term6k + qkx*term7k - qkr*term8k; + + term2i = rr5i*yr; + term1i = zr * term2i; + term1core = rr5core*yr*zr; + term3i = rr5i*zr; + term4i = zr * (rr7i*yr); + term5i = (numtyp)2.0*rr5i; + term6i = (numtyp)2.0*rr7i*yr; + term7i = (numtyp)2.0*rr7i*zr; + term8i = zr*rr9i*yr; + term2k = rr5k*yr; + term1k = zr * term2k; + term3k = rr5k*zr; + term4k = zr * (rr7k*yr); 
+ term5k = (numtyp)2.0*rr5k; + term6k = (numtyp)2.0*rr7k*yr; + term7k = (numtyp)2.0*rr7k*zr; + term8k = zr*rr9k*yr; + tiyz = -vali*term1i - corei*term1core + diz*term2i + diy*term3i - + dir*term4i - qiyz*term5i + qiz*term6i + qiy*term7i - qir*term8i; + tkyz = -valk*term1k - corek*term1core - dkz*term2k - dky*term3k + + dkr*term4k - qkyz*term5k + qkz*term6k + qky*term7k - qkr*term8k; + + numtyp depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + numtyp depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + numtyp depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + numtyp frcx = (numtyp)-2.0 * depx; + numtyp frcy = (numtyp)-2.0 * depy; + numtyp frcz = (numtyp)-2.0 * depz; + + numtyp term1,term2,term3; + + // get the dEp/dR terms used for direct polarization force + // poltyp == MUTUAL && hippo + // tixx and tkxx + term1 = (numtyp)2.0 * rr5ik; + term2 = term1*xr; + term3 = rr5ik - rr7ik*xr*xr; + tixx = uix*term2 + uir*term3; + tkxx = ukx*term2 + ukr*term3; + + // tiyy and tkyy + term2 = term1*yr; + term3 = rr5ik - rr7ik*yr*yr; + tiyy = uiy*term2 + uir*term3; + tkyy = uky*term2 + ukr*term3; + + // tiz and tkzz + term2 = term1*zr; + term3 = rr5ik - rr7ik*zr*zr; + tizz = uiz*term2 + uir*term3; + tkzz = ukz*term2 + ukr*term3; + + // tixy and tkxy + term1 = rr5ik*yr; + term2 = rr5ik*xr; + term3 = yr * (rr7ik*xr); + tixy = uix*term1 + uiy*term2 - uir*term3; + tkxy = ukx*term1 + uky*term2 - ukr*term3; + + // tixx and tkxx + term1 = rr5ik * zr; + term3 = zr * (rr7ik*xr); + tixz = uix*term1 + uiz*term2 - uir*term3; + tkxz = ukx*term1 + ukz*term2 - ukr*term3; + + // tiyz and tkyz + term2 = rr5ik*yr; + term3 = zr * (rr7ik*yr); + tiyz = uiy*term1 + uiz*term2 - uir*term3; + tkyz = uky*term1 + ukz*term2 - ukr*term3; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp 
+ tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx - depx; + frcy = frcy - depy; + frcz = frcz - depz; + + f.x += frcx; + f.y += frcy; + f.z += frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; k +class Hippo : public BaseAmoeba { + public: + Hippo(); + ~Hippo(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double polar_dscale, 
const double polar_uscale); + + /// Compute repulsion with device neighboring + virtual void compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5,void** tep_ptr); + + /// Compute dispersion real-space with device neighboring + virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp); + + /// Compute multipole real-space with device neighboring + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double felec, const double off2_mpole, double *charge, + double *boxlo, double *prd, void **tep_ptr); + + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double* host_pval, + const double aewald, const double off2_polar, void** fieldp_ptr); + + /// Compute the real space part of the induced field 
(umutual2b) with device neighboring + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr); + + /// Compute polar real-space with device neighboring + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2_polar, + void **tep_ptr); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; + /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z; + UCL_D_Vec coeff_rep; + /// Special polar values [0-4]: + /// sp_polar.x = special_polar_wscale + /// sp_polar.y special_polar_pscale, + /// sp_polar.z = special_polar_piscale + /// sp_polar.w = special_mpole + UCL_D_Vec sp_polar; + /// Special nonpolar values [0-4]: + /// sp_nonpolar.x = special_hal + /// sp_nonpolar.y special_repel + /// sp_nonpolar.z = special_disp + UCL_D_Vec sp_nonpolar; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5; + numtyp _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + UCL_Kernel k_repulsion, k_dispersion; + + 
protected: + bool _allocated; + int repulsion(const int eflag, const int vflag); + int dispersion_real(const int eflag, const int vflag); + int multipole_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + +}; + +} + +#endif diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp new file mode 100644 index 0000000000..0cb00387ca --- /dev/null +++ b/lib/gpu/lal_hippo_ext.cpp @@ -0,0 +1,231 @@ +/*************************************************************************** + hippo_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to hippo acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_hippo.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Hippo HIPPOMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + 
const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale) { + HIPPOMF.clear(); + gpu_mode=HIPPOMF.device->gpu_mode(); + double gpu_split=HIPPOMF.device->particle_split(); + int first_gpu=HIPPOMF.device->first_device(); + int last_gpu=HIPPOMF.device->last_device(); + int world_me=HIPPOMF.device->world_me(); + int gpu_rank=HIPPOMF.device->gpu_rank(); + int procs_per_gpu=HIPPOMF.device->procs_per_gpu(); + + HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu); + + bool message=false; + if (HIPPOMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); + + HIPPOMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + HIPPOMF.estimate_gpu_overhead(); + return init_ok; +} + +void hippo_gpu_clear() { + HIPPOMF.clear(); +} + +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double ** /*host_uind*/, double ** /*host_uinp*/, 
double * /*host_pval*/, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); +} + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { + HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd, + cut2, c0, c1, c2, c3, c4, c5, tep_ptr); +} + +void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2) { + HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole, + aewald, off2); +} + +void hippo_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int 
*host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); +} + +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); +} + +void hippo_gpu_update_fieldp(void **fieldp_ptr) { + HIPPOMF.update_fieldp(fieldp_ptr); +} + +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, 
host_uind, host_uinp, host_pval, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); +} + +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi); +} + +double hippo_gpu_bytes() { + return HIPPOMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h new file mode 100644 index 0000000000..7ff62aa9a4 --- /dev/null +++ b/lib/gpu/lal_hippo_extra.h @@ -0,0 +1,431 @@ +/// ************************************************************************** +// hippo_extra.h +// ------------------- +// Trung Dac Nguyen +// +// Device code for hippo math routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndactrung@gmail.com +// ***************************************************************************/* + +#ifndef LAL_HIPPO_EXTRA_H +#define LAL_HIPPO_EXTRA_H + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#else +#endif + +#define MY_PI2 (numtyp)1.57079632679489661923 +#define MY_PI4 (numtyp)0.78539816339744830962 + +/* ---------------------------------------------------------------------- + damprep generates coefficients for the Pauli repulsion + damping function for powers of 
the interatomic distance + + literature reference: + + J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An + Anisotropic, Atomic Multipole Model", Journal of Chemical Physics, + 150, 084104 (2019) +------------------------------------------------------------------------- */ + +ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, + const numtyp rr3, const numtyp rr5, const numtyp rr7, + const numtyp rr9, const numtyp rr11, const int rorder, + const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11]) +{ + numtyp r3,r4; + numtyp r5,r6,r7,r8; + numtyp s,ds,d2s; + numtyp d3s,d4s,d5s; + numtyp dmpi2,dmpk2; + numtyp dmpi22,dmpi23; + numtyp dmpi24,dmpi25; + numtyp dmpi26,dmpi27; + numtyp dmpk22,dmpk23; + numtyp dmpk24,dmpk25; + numtyp dmpk26; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp pre,term,tmp; + + // compute tolerance value for damping exponents + + eps = (numtyp)0.001; + diff = dmpi-dmpk; // fabs(dmpi-dmpk) + if (diff < (numtyp)0) diff = -diff; + + // treat the case where alpha damping exponents are equal + + if (diff < eps) { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + r6 = r5 * r; + r7 = r6 * r; + dmpi2 = (numtyp)0.5 * dmpi; + dampi = dmpi2 * r; + expi = ucl_exp(-dampi); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpi26 = dmpi25 * dmpi2; + pre = (numtyp)128.0; + s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi; + + ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0; + d2s = dmpi24 * expi * r5 / (numtyp)9.0; + d3s = dmpi25 * expi * r6 / (numtyp)45.0; + d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0; + if (rorder >= 11) { + r8 = r7 * r; + dmpi27 = dmpi2 * dmpi26; + d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0; + } + + // treat the case where alpha damping exponents are unequal + + } else { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + dmpi2 = (numtyp)0.5 * dmpi; + dmpk2 = (numtyp)0.5 * dmpk; + dampi = 
dmpi2 * r; + dampk = dmpk2 * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpk22 = dmpk2 * dmpk2; + dmpk23 = dmpk22 * dmpk2; + dmpk24 = dmpk23 * dmpk2; + dmpk25 = dmpk24 * dmpk2; + term = dmpi22 - dmpk22; + pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0); + tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; + s = (dampi-tmp)*expk + (dampk+tmp)*expi; + + ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - + ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + + ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d3s = (dmpi2*dmpk23*r4/(numtyp)15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - + ((numtyp)4.0/(numtyp)15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - + (numtyp)4.0*dmpi2*dmpk22*r/term - (numtyp)4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + + ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi; + d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - + ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - + ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + 
(dmpi24*dmpk2*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + + ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + + ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + + if (rorder >= 11) { + r6 = r5 * r; + dmpi26 = dmpi25 * dmpi2; + dmpk26 = dmpk25 * dmpk2; + d5s = (dmpi2*dmpk25*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi2*dmpk24*r5 + + dmpi2*dmpk23*r4/(numtyp)21.0 + dmpi2*dmpk22*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 - + ((numtyp)4.0/(numtyp)945.0)*dmpi2*dmpk26*r5/term - + ((numtyp)4.0/(numtyp)63.0)*dmpi2*dmpk25*r4/term - ((numtyp)4.0/(numtyp)9.0)*dmpi2*dmpk24*r3/term - + ((numtyp)16.0/(numtyp)9.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi25*dmpk2*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi24*dmpk2*r5 + + dmpi23*dmpk2*r4/(numtyp)21.0 + dmpi22*dmpk2*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 + + ((numtyp)4.0/(numtyp)945.0)*dmpi26*dmpk2*r5/term + ((numtyp)4.0/(numtyp)63.0)*dmpi25*dmpk2*r4/term + + ((numtyp)4.0/(numtyp)9.0)*dmpi24*dmpk2*r3/term + ((numtyp)16.0/(numtyp)9.0)*dmpi23*dmpk2*r2/term + + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + } + } + + // convert partial derivatives into full derivatives + + s = s * rr1; + ds = ds * rr3; + d2s = d2s * rr5; + d3s = d3s * rr7; + d4s = d4s * rr9; + d5s = d5s * rr11; + dmpik[0] = (numtyp)0.5 * pre * s * s; + dmpik[2] = pre * s * ds; + dmpik[4] = pre * (s*d2s + ds*ds); + dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s); + dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s); + + if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s); +} + +/* ---------------------------------------------------------------------- + damppole generates coefficients for the charge 
penetration + damping function for powers of the interatomic distance + + literature references: + + L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the + Effective Fragment Potential Method: Theory and Application to + the Benzene Dimer", Journal of Computational Chemistry, 28, + 276-291 (2007) [Gordon f1 and f2 models] + + J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and + J. W. Ponder, "An Optimized Charge Penetration Model for Use with + the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19, + 276-291 (2017) +------------------------------------------------------------------------- */ + +ucl_inline void damppole(const numtyp r, const int rorder, + const numtyp alphai, const numtyp alphak, + numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampi6,dampi7; + numtyp dampi8; + numtyp dampk2,dampk3; + numtyp dampk4,dampk5; + numtyp dampk6; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi; 
+ if (diff < eps) { + dmpk[0] = dmpi[0]; + dmpk[2] = dmpi[2]; + dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + dmpk[8] = dmpi[8]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dampk5 = dampk2 * dampk3; + dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk; + dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk; + } + + // valence-valence charge penetration damping for Gordon f1 + + if (diff < eps) { + dampi6 = dampi3 * dampi3; + dampi7 = dampi3 * dampi4; + dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + + dampi3/(numtyp)48.0)*expi; + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi; + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; + dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi; + dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0)*expi; + if (rorder >= 11) { + dampi8 = dampi4 * dampi4; + dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi; + } + + } else { + alphai2 = alphai * alphai; + 
alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * termk; + dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - + termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk; + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk; + dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk; + dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + + (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + + (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk; + + if (rorder >= 11) { + dampi6 = dampi3 * dampi3; + dampk6 = dampk3 * 
dampk3; + dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + + dampi6/(numtyp)1890.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + + dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk; + } + } +} + +/* ---------------------------------------------------------------------- + dampdir = direct field damping coefficents + dampdir generates coefficients for the direct field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ + +ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk) +{ + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampk2; + numtyp dampi3,dampk3; + numtyp dampi4,dampk4; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = dampi2 * dampi2; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + if (diff < eps) { + dmpk[2] = dmpi[2]; + 
dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/30.0)*expk; + } +} + +/* ---------------------------------------------------------------------- + dampmut = mutual field damping coefficents + dampmut generates coefficients for the mutual field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ + +ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampk2,dampk3; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // valence-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + if (diff < eps) { + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + 7.0*dampi3/(numtyp)48.0 + dampi4/48.0)*expi; + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + alphai2 = alphai * alphai; + alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * 
termk; + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - + (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - + (numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk; + } +} + +#endif diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index a327fdd45b..10816e2fa6 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -576,6 +576,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); if (_time_device) time_nbor.add_to_total(); + + // on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial) + // on the device, transpose the matrix (1-d array) for coalesced reads + // dev_special[i][j] = the special i neighbor of atom j + time_transpose.start(); const int b2x=_block_cell_2d; const int b2y=_block_cell_2d; @@ -679,6 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (_cutoff < _cell_size) vadjust*=1.46; mn=std::max(mn,static_cast(ceil(_max_neighbor_factor*vadjust*mn))); if (mn<33) mn+=3; + resize_max_neighbors(mn,success); set_nbor_block_size(mn/2); if (!success) @@ -831,6 +837,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); } +void Neighbor::transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in) +{ + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; + const int g2x=static_cast(ceil(static_cast(columns_in)/b2x)); + const int g2y=static_cast(ceil(static_cast(rows_in)/b2y)); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + 
_shared->k_transpose.run(&out, &in, &columns_in, &rows_in); +} + template void Neighbor::build_nbor_list (double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, double *subhi, diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 5b569f804a..45ec95a9d1 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -33,7 +33,7 @@ #endif #endif -#if defined(USE_HIP) +#if defined(USE_HIP) || defined(__APPLE__) #define LAL_USE_OLD_NEIGHBOR #endif @@ -259,6 +259,10 @@ class Neighbor { return o.str(); } + /// Helper function + void transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in); + private: NeighborShared *_shared; UCL_Device *dev; @@ -289,15 +293,17 @@ class Neighbor { #endif int _simd_size; + #ifdef LAL_USE_OLD_NEIGHBOR inline void set_nbor_block_size(const int mn) { - #ifdef LAL_USE_OLD_NEIGHBOR int desired=mn/(2*_simd_size); desired*=_simd_size; if (desired<_simd_size) desired=_simd_size; else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build; _block_nbor_build=desired; - #endif } + #else + inline void set_nbor_block_size(const int) {} + #endif }; } diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 352f1d6138..359d9b75cb 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -48,6 +48,19 @@ _texture_2d( pos_tex,int4); #define LAL_USE_OLD_NEIGHBOR #endif +/* + compute the id of the cell where the atoms belong to +x: atom coordinates +cell_id: cell ids +particle_id: +boxlo[0-2]: the lower left corner of the local box +ncell[xyz]: the number of cells in xyz dims +i_cell_size is the inverse cell size +inum = the number of the local atoms that are ported to the device +nall = the number of the local+ghost atoms that are ported to the device +cells_in_cutoff = the number of cells that are within the cutoff +*/ + __kernel void calc_cell_id(const numtyp4 *restrict x_, unsigned *restrict cell_id, int *restrict 
particle_id, @@ -90,6 +103,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_, } } +// compute the number of atoms in each cell + __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, int *restrict cell_counts, int nall, int ncell) { diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index e17df5b88c..a8e929efe4 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -273,19 +273,19 @@ __kernel void interp(const __global numtyp4 *restrict x_, int my=mz+fast_mul(ny,npts_x); for (int m=0; m> SBBITS & 3; }; +#define SBBITS15 29 +#define NEIGHMASK15 0x1FFFFFFF +ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; }; + // default to 32-bit smallint and other ints, 64-bit bigint: // same as defined in src/lmptype.h #if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index eb42c710cc..9687a0352d 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -150,7 +150,7 @@ double SWT::host_memory_usage() const { // --------------------------------------------------------------------------- template int SWT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index 8baa5ce12a..feab8bb5c0 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4); } \ } +// (SHUFFLE_AVAIL == 1) #else #define local_allocate_acc_zeta() @@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4); } \ } +// EVFLAG == 0 #else #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ @@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4); ans[ii]=old; \ } -#endif -#endif +#endif // EVFLAG +#endif // SHUFFLE_AVAIL #ifdef LAL_SIMD_IP_SYNC #define t_per_atom t_per_atom_in diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index c343de3f55..fcc9d00ab0 
100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i const double* costheta, const double* bigb, const double* big2b, const double* bigc) { - int success; + int success=0; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,vashishta,"k_vashishta","k_vashishta_three_center", "k_vashishta_three_end","k_vashishta_short_nbor"); @@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const { // --------------------------------------------------------------------------- template int VashishtaT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list diff --git a/lib/kokkos/kokkos_5538.diff b/lib/kokkos/kokkos_5538.diff deleted file mode 100644 index 6bf2ccf6a4..0000000000 --- a/lib/kokkos/kokkos_5538.diff +++ /dev/null @@ -1,199 +0,0 @@ -diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos -index 22af411f32..530510a0d1 100644 ---- a/lib/kokkos/Makefile.kokkos -+++ b/lib/kokkos/Makefile.kokkos -@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP" - #KOKKOS_DEVICES ?= "Threads" - # Options: - # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR --# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 -+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90 - # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX - # IBM: BGQ,Power7,Power8,Power9 - # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A -@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt - KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) - 
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) - KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) -+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90) - KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ -@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE - + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ - + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ - + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ -- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) -+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \ -+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90)) - - #SEK: This seems like a bug to me - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) -@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 - endif -+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) -+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") -+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") -+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 -+ endif - - ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) - KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) -diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in -index 88ddc48378..b83ced9243 100644 ---- a/lib/kokkos/cmake/KokkosCore_config.h.in -+++ b/lib/kokkos/cmake/KokkosCore_config.h.in -@@ -102,6 +102,7 @@ - #cmakedefine KOKKOS_ARCH_AMPERE - #cmakedefine KOKKOS_ARCH_AMPERE80 - #cmakedefine KOKKOS_ARCH_AMPERE86 -+#cmakedefine KOKKOS_ARCH_HOPPER90 - #cmakedefine KOKKOS_ARCH_AMD_ZEN - #cmakedefine KOKKOS_ARCH_AMD_ZEN2 - #cmakedefine KOKKOS_ARCH_AMD_ZEN3 -diff 
--git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc -index f56cef1651..2585a6a64c 100644 ---- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc -+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc -@@ -74,6 +74,7 @@ int main() { - case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; - case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; - case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; -+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break; - default: - std::cout << "Compute capability " << compute_capability - << " is not supported" << std::endl; -diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake -index ef16aad047..c1d76cceeb 100644 ---- a/lib/kokkos/cmake/kokkos_arch.cmake -+++ b/lib/kokkos/cmake/kokkos_arch.cmake -@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK - KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") - KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") - KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") -+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") - - IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS) - SET(KOKKOS_SHOW_HIP_ARCHS ON) -@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72) - CHECK_CUDA_ARCH(TURING75 sm_75) - CHECK_CUDA_ARCH(AMPERE80 sm_80) - CHECK_CUDA_ARCH(AMPERE86 sm_86) -+CHECK_CUDA_ARCH(HOPPER90 sm_90) - - SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") - FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) -@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) - SET(KOKKOS_ARCH_AMPERE ON) - ENDIF() - -+IF (KOKKOS_ARCH_HOPPER90) -+ SET(KOKKOS_ARCH_HOPPER ON) -+ENDIF() -+ 
- #Regardless of version, make sure we define the general architecture name - IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A) - SET(KOKKOS_ARCH_VEGA ON) -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index 56f9117844..fcd4773dbc 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { - case 61: return 96; - case 70: - case 80: -- case 86: return 8; -+ case 86: -+ case 90: return 8; - case 75: return 32; - default: - Kokkos::Impl::throw_runtime_exception( -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp -index 40a263561f..8c40ebd60d 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp -@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION - #endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010 - - #if CUDA_VERSION >= 11010 && \ -- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86))) -+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER))) - KOKKOS_INLINE_FUNCTION - bhalf_t cast_to_bhalf(bhalf_t val) { return val; } - KOKKOS_INLINE_FUNCTION -diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp -index f9451ecfe6..2ce1efb98c 100644 ---- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp -+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp -@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl { - - struct OpenACC_Traits { - #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_AMPERE) -+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER) - static 
constexpr acc_device_t dev_type = acc_device_nvidia; - static constexpr bool may_fallback_to_host = false; - #else -diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp -index a9bc085912..27ee1d4232 100644 ---- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp -+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp -@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() { - - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures - // from Pascal and upwards. --#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) -+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ -+ defined(KOKKOS_ARCH_HOPPER) - #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) - omp_set_num_teams(512); - #endif -diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp -index 840db4327c..7e5addbc5b 100644 ---- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp -+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp -@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { - #if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \ - !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \ - !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \ -- !defined(KOKKOS_ARCH_AMPERE) -+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER) - if (!settings.has_device_id() && gpu_devices.empty()) { - Impl::SYCLInternal::singleton().initialize(sycl::device()); - return; -diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp -index 5ac7d8af30..ba101f699e 100644 ---- 
a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp -+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp -@@ -335,9 +335,10 @@ class TeamPolicyInternal - return std::min({ - int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), - // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. --#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) -+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ -+ defined(KOKKOS_ARCH_HOPPER) - 256, - #endif - max_threads_for_memory -@@ -367,9 +368,10 @@ class TeamPolicyInternal - return std::min({ - int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), - // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. --#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) -+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ -+ defined(KOKKOS_ARCH_HOPPER) - 256, - #endif - max_threads_for_memory diff --git a/lib/kokkos/kokkos_5706.diff b/lib/kokkos/kokkos_5706.diff deleted file mode 100644 index 2bfbb35b06..0000000000 --- a/lib/kokkos/kokkos_5706.diff +++ /dev/null @@ -1,523 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index fcd4773dbc..30b6958a67 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -207,7 +207,6 
@@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, - LaunchBounds{}); - } - --// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1) - // NOTE these number can be obtained several ways: - // * One option is to download the CUDA Occupancy Calculator spreadsheet, select - // "Compute Capability" first and check what is the smallest "Shared Memory -@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { - return 0; - }() * 1024; - } -+ - } // namespace Impl - } // namespace Kokkos - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp -index 5811498e01..e22eb3b842 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp -@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default - } - #endif - --#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API -- cudaThreadSetCacheConfig(cudaFuncCachePreferShared); --#else -- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); --#endif -- - // Init the array for used for arbitrarily sized atomics - if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays(); - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -index b7a80ad84f..5c4c3a7d39 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -@@ -93,10 +93,6 @@ namespace Impl { - // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) - // function qualifier which could be used to improve performance. 
- //---------------------------------------------------------------------------- --// Maximize L1 cache and minimize shared memory: --// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 ); --// For 2.0 capability: 48 KB L1 and 16 KB shared --//---------------------------------------------------------------------------- - - template - __global__ static void cuda_parallel_launch_constant_memory() { -@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { - } - } - --// This function needs to be template on DriverType and LaunchBounds -+// These functions needs to be template on DriverType and LaunchBounds - // so that the static bool is unique for each type combo - // KernelFuncPtr does not necessarily contain that type information. -+ - template --inline void configure_shmem_preference(KernelFuncPtr const& func, -- bool prefer_shmem) { -+const cudaFuncAttributes& get_cuda_kernel_func_attributes( -+ const KernelFuncPtr& func) { -+ // Only call cudaFuncGetAttributes once for each unique kernel -+ // by leveraging static variable initialization rules -+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes { -+ cudaFuncAttributes attr; -+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); -+ return attr; -+ }; -+ static cudaFuncAttributes func_attr = wrap_get_attributes(); -+ return func_attr; -+} -+ -+template -+inline void configure_shmem_preference(const KernelFuncPtr& func, -+ const cudaDeviceProp& device_props, -+ const size_t block_size, int& shmem, -+ const size_t occupancy) { - #ifndef KOKKOS_ARCH_KEPLER -- // On Kepler the L1 has no benefit since it doesn't cache reads -+ -+ const auto& func_attr = -+ get_cuda_kernel_func_attributes(func); -+ -+ // Compute limits for number of blocks due to registers/SM -+ const size_t regs_per_sm = device_props.regsPerMultiprocessor; -+ const size_t regs_per_thread = func_attr.numRegs; -+ // The granularity of register allocation is chunks of 256 registers per 
warp -+ // -> 8 registers per thread -+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); -+ const size_t max_blocks_regs = -+ regs_per_sm / (allocated_regs_per_thread * block_size); -+ -+ // Compute how many threads per sm we actually want -+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; -+ // only allocate multiples of warp size -+ const size_t num_threads_desired = -+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32; -+ // Get close to the desired occupancy, -+ // don't undershoot by much but also don't allocate a whole new block just -+ // because one is a few threads over otherwise. -+ size_t num_blocks_desired = -+ (num_threads_desired + block_size * 0.8) / block_size; -+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired); -+ if (num_blocks_desired == 0) num_blocks_desired = 1; -+ -+ // Calculate how much shared memory we need per block -+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes; -+ -+ // The minimum shared memory allocation we can have in total per SM is 8kB. -+ // If we want to lower occupancy we have to make sure we request at least that -+ // much in aggregate over all blocks, so that shared memory actually becomes a -+ // limiting factor for occupancy -+ constexpr size_t min_shmem_size_per_sm = 8192; -+ if ((occupancy < 100) && -+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) { -+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired; -+ // Need to set the caller's shmem variable so that the -+ // kernel launch uses the correct dynamic shared memory request -+ shmem = shmem_per_block - func_attr.sharedSizeBytes; -+ } -+ -+ // Compute the carveout fraction we need based on occupancy -+ // Use multiples of 8kB -+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; -+ size_t carveout = shmem_per_block == 0 -+ ? 
0 -+ : 100 * -+ (((num_blocks_desired * shmem_per_block + -+ min_shmem_size_per_sm - 1) / -+ min_shmem_size_per_sm) * -+ min_shmem_size_per_sm) / -+ max_shmem_per_sm; -+ if (carveout > 100) carveout = 100; -+ -+ // Set the carveout, but only call it once per kernel or when it changes - auto set_cache_config = [&] { -- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig( -- func, -- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1))); -- return prefer_shmem; -+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute( -+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)); -+ return carveout; - }; -- static bool cache_config_preference_cached = set_cache_config(); -- if (cache_config_preference_cached != prefer_shmem) { -+ // Store the value in a static variable so we only reset if needed -+ static size_t cache_config_preference_cached = set_cache_config(); -+ if (cache_config_preference_cached != carveout) { - cache_config_preference_cached = set_cache_config(); - } - #else - // Use the parameters so we don't get a warning - (void)func; -- (void)prefer_shmem; -+ (void)device_props; -+ (void)block_size; -+ (void)occupancy; - #endif - } - --template --std::enable_if_t --modify_launch_configuration_if_desired_occupancy_is_specified( -- Policy const& policy, cudaDeviceProp const& properties, -- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem, -- bool& prefer_shmem) { -- int const block_size = block.x * block.y * block.z; -- int const desired_occupancy = policy.impl_get_desired_occupancy().value(); -- -- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties); -- size_t const static_shmem = attributes.sharedSizeBytes; -- -- // round to nearest integer and avoid division by zero -- int active_blocks = std::max( -- 1, static_cast(std::round( -- static_cast(properties.maxThreadsPerMultiProcessor) / -- block_size * desired_occupancy / 100))); -- int const dynamic_shmem = -- shmem_per_sm_prefer_l1 / active_blocks - 
static_shmem; -- -- if (dynamic_shmem > shmem) { -- shmem = dynamic_shmem; -- prefer_shmem = false; -- } --} -- --template --std::enable_if_t --modify_launch_configuration_if_desired_occupancy_is_specified( -- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&, -- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {} -- - // end Some helper functions for launch code readability }}}1 - //============================================================================== - -@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker< - #ifdef KOKKOS_CUDA_ENABLE_GRAPHS - inline static void create_parallel_launch_graph_node( - DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, -- CudaInternal const* cuda_instance, bool prefer_shmem) { -+ CudaInternal const* cuda_instance) { - //---------------------------------------- - auto const& graph = Impl::get_cuda_graph_from_kernel(driver); - KOKKOS_EXPECTS(bool(graph)); -@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker< - - if (!Impl::is_empty_launch(grid, block)) { - Impl::check_shmem_request(cuda_instance, shmem); -- Impl::configure_shmem_preference( -- base_t::get_kernel_func(), prefer_shmem); -+ if (DriverType::Policy:: -+ experimental_contains_desired_occupancy) { -+ int desired_occupancy = -+ driver.get_policy().impl_get_desired_occupancy().value(); -+ size_t block_size = block.x * block.y * block.z; -+ Impl::configure_shmem_preference( -+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -+ shmem, desired_occupancy); -+ } - - void const* args[] = {&driver}; - -@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker< - #ifdef KOKKOS_CUDA_ENABLE_GRAPHS - inline static void create_parallel_launch_graph_node( - DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, -- CudaInternal const* cuda_instance, bool prefer_shmem) { -+ CudaInternal const* cuda_instance) { - //---------------------------------------- - auto const& graph = 
Impl::get_cuda_graph_from_kernel(driver); - KOKKOS_EXPECTS(bool(graph)); -@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker< - - if (!Impl::is_empty_launch(grid, block)) { - Impl::check_shmem_request(cuda_instance, shmem); -- Impl::configure_shmem_preference( -- base_t::get_kernel_func(), prefer_shmem); -+ if constexpr (DriverType::Policy:: -+ experimental_contains_desired_occupancy) { -+ int desired_occupancy = -+ driver.get_policy().impl_get_desired_occupancy().value(); -+ size_t block_size = block.x * block.y * block.z; -+ Impl::configure_shmem_preference( -+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -+ shmem, desired_occupancy); -+ } - - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); - -@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker< - #ifdef KOKKOS_CUDA_ENABLE_GRAPHS - inline static void create_parallel_launch_graph_node( - DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, -- CudaInternal const* cuda_instance, bool prefer_shmem) { -+ CudaInternal const* cuda_instance) { - // Just use global memory; coordinating through events to share constant - // memory with the non-graph interface is not really reasonable since - // events don't work with Graphs directly, and this would anyway require -@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker< - DriverType, LaunchBounds, - Experimental::CudaLaunchMechanism::GlobalMemory>; - global_launch_impl_t::create_parallel_launch_graph_node( -- driver, grid, block, shmem, cuda_instance, prefer_shmem); -+ driver, grid, block, shmem, cuda_instance); - } - #endif - }; -@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl< - - inline static void launch_kernel(const DriverType& driver, const dim3& grid, - const dim3& block, int shmem, -- const CudaInternal* cuda_instance, -- bool prefer_shmem) { -+ const CudaInternal* cuda_instance) { - if (!Impl::is_empty_launch(grid, block)) { - // Prevent multiple threads to simultaneously set the 
cache configuration - // preference and launch the same kernel -@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl< - - Impl::check_shmem_request(cuda_instance, shmem); - -- // If a desired occupancy is specified, we compute how much shared memory -- // to ask for to achieve that occupancy, assuming that the cache -- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic -- // shared memory computed is actually smaller than `shmem` we overwrite -- // `shmem` and set `prefer_shmem` to `false`. -- modify_launch_configuration_if_desired_occupancy_is_specified( -- driver.get_policy(), cuda_instance->m_deviceProp, -- get_cuda_func_attributes(), block, shmem, prefer_shmem); -- -- Impl::configure_shmem_preference< -- DriverType, Kokkos::LaunchBounds>( -- base_t::get_kernel_func(), prefer_shmem); -+ if (DriverType::Policy:: -+ experimental_contains_desired_occupancy) { -+ int desired_occupancy = -+ driver.get_policy().impl_get_desired_occupancy().value(); -+ size_t block_size = block.x * block.y * block.z; -+ Impl::configure_shmem_preference< -+ DriverType, -+ Kokkos::LaunchBounds>( -+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -+ shmem, desired_occupancy); -+ } - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - -@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl< - } - - static cudaFuncAttributes get_cuda_func_attributes() { -- // Race condition inside of cudaFuncGetAttributes if the same address is -- // given requires using a local variable as input instead of a static Rely -- // on static variable initialization to make sure only one thread executes -- // the code and the result is visible. 
-- auto wrap_get_attributes = []() -> cudaFuncAttributes { -- cudaFuncAttributes attr_tmp; -- KOKKOS_IMPL_CUDA_SAFE_CALL( -- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func())); -- return attr_tmp; -- }; -- static cudaFuncAttributes attr = wrap_get_attributes(); -- return attr; -+ return get_cuda_kernel_func_attributes< -+ DriverType, Kokkos::LaunchBounds>( -+ base_t::get_kernel_func()); - } - }; - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -index e586bb4cc6..0e348c092a 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -@@ -121,8 +121,7 @@ class ParallelFor, Kokkos::Cuda> { - maxblocks[1]), - 1); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); -@@ -139,8 +138,7 @@ class ParallelFor, Kokkos::Cuda> { - (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z -@@ -158,8 +156,7 @@ class ParallelFor, Kokkos::Cuda> { - (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to - // threadIdx.z -@@ -175,8 +172,7 @@ class ParallelFor, 
Kokkos::Cuda> { - (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to - // threadIdx.z -@@ -191,8 +187,7 @@ class ParallelFor, Kokkos::Cuda> { - std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); - } -@@ -405,8 +400,8 @@ class ParallelReduce, ReducerType, - - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - if (m_result_ptr) { -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp -index ac160f8fe2..d1031751c2 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp -@@ -135,8 +135,7 @@ class ParallelFor, Kokkos::Cuda> { - #endif - - CudaParallelLaunch( -- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance()); - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) -@@ -375,8 +374,8 @@ class ParallelReduce, ReducerType, - - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ 
.impl_internal_space_instance()); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - if (m_result_ptr) { -@@ -726,16 +725,16 @@ class ParallelScan, Kokkos::Cuda> { - m_final = false; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION - } - #endif - m_final = true; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - } - } - -@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal, - m_final = false; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION - } - #endif - m_final = true; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - const int size = Analysis::value_size(m_functor); - #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp -index cdd16085b3..ea9430b812 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp -@@ -552,8 +552,8 @@ class ParallelFor, - - CudaParallelLaunch( - *this, grid, block, shmem_size_total, -- m_policy.space().impl_internal_space_instance(), -- true); // copy to device and execute -+ m_policy.space() -+ 
.impl_internal_space_instance()); // copy to device and execute - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) -@@ -878,8 +878,8 @@ class ParallelReduce, - - CudaParallelLaunch( - *this, grid, block, shmem_size_total, -- m_policy.space().impl_internal_space_instance(), -- true); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - m_policy.space().fence( -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp -index 34d4bef9fd..178012431c 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp -@@ -428,11 +428,6 @@ struct CudaReductionsFunctor { - // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) - // function qualifier which could be used to improve performance. - //---------------------------------------------------------------------------- --// Maximize shared memory and minimize L1 cache: --// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared ); --// For 2.0 capability: 48 KB shared and 16 KB L1 --//---------------------------------------------------------------------------- --//---------------------------------------------------------------------------- - /* - * Algorithmic constraints: - * (a) blockDim.y <= 1024 -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp -index fb3a6b138f..a12378a891 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp -@@ -100,8 +100,7 @@ class ParallelFor, - const int shared = 0; - - Kokkos::Impl::CudaParallelLaunch( -- *this, grid, block, shared, Cuda().impl_internal_space_instance(), -- false); -+ *this, grid, block, shared, Cuda().impl_internal_space_instance()); - } - - inline 
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/lib/kokkos/kokkos_5731.diff b/lib/kokkos/kokkos_5731.diff deleted file mode 100644 index e95f4a1546..0000000000 --- a/lib/kokkos/kokkos_5731.diff +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index 30b6958a67..b94f053272 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, - LaunchBounds{}); - } - --// NOTE these number can be obtained several ways: --// * One option is to download the CUDA Occupancy Calculator spreadsheet, select --// "Compute Capability" first and check what is the smallest "Shared Memory --// Size Config" that is available. The "Shared Memory Per Multiprocessor" in --// bytes is then to be found below in the summary. 
--// * Another option would be to look for the information in the "Tuning --// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in --// the "Shared Memory" section (more tedious) --inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { -- int const compute_capability = properties.major * 10 + properties.minor; -- return [compute_capability]() { -- switch (compute_capability) { -- case 30: -- case 32: -- case 35: return 16; -- case 37: return 80; -- case 50: -- case 53: -- case 60: -- case 62: return 64; -- case 52: -- case 61: return 96; -- case 70: -- case 80: -- case 86: -- case 90: return 8; -- case 75: return 32; -- default: -- Kokkos::Impl::throw_runtime_exception( -- "Unknown device in cuda block size deduction"); -- } -- return 0; -- }() * 1024; --} -- - } // namespace Impl - } // namespace Kokkos - diff --git a/lib/kokkos/kokkos_5739.diff b/lib/kokkos/kokkos_5739.diff deleted file mode 100644 index fe7a1ff551..0000000000 --- a/lib/kokkos/kokkos_5739.diff +++ /dev/null @@ -1,204 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index b94f053272..252c13c524 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -53,17 +53,69 @@ - namespace Kokkos { - namespace Impl { - -+inline int cuda_warp_per_sm_allocation_granularity( -+ cudaDeviceProp const& properties) { -+ // Allocation granularity of warps in each sm -+ switch (properties.major) { -+ case 3: -+ case 5: -+ case 7: -+ case 8: -+ case 9: return 4; -+ case 6: return (properties.minor == 0 ? 
2 : 4); -+ default: -+ throw_runtime_exception( -+ "Unknown device in cuda warp per sm allocation granularity"); -+ return 0; -+ } -+} -+ -+inline int cuda_max_warps_per_sm_registers( -+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) { -+ // Maximum number of warps per sm as a function of register counts, -+ // subject to the constraint that warps are allocated with a fixed granularity -+ int const max_regs_per_block = properties.regsPerBlock; -+ int const regs_per_warp = attributes.numRegs * properties.warpSize; -+ int const warp_granularity = -+ cuda_warp_per_sm_allocation_granularity(properties); -+ // The granularity of register allocation is chunks of 256 registers per warp, -+ // which implies a need to over-allocate, so we round up -+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256; -+ -+ // The maximum number of warps per SM is constrained from above by register -+ // allocation. To satisfy the constraint that warps per SM is allocated at a -+ // finite granularity, we need to round down. 
-+ int const max_warps_per_sm = -+ warp_granularity * -+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity)); -+ -+ return max_warps_per_sm; -+} -+ - inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, - cudaFuncAttributes const& attributes, - int block_size, size_t dynamic_shmem) { -- // Limits due do registers/SM -+ // Limits due to registers/SM - int const regs_per_sm = properties.regsPerMultiprocessor; - int const regs_per_thread = attributes.numRegs; - // The granularity of register allocation is chunks of 256 registers per warp - // -> 8 registers per thread - int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); -- int const max_blocks_regs = -- regs_per_sm / (allocated_regs_per_thread * block_size); -+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size); -+ -+ // Compute the maximum number of warps as a function of the number of -+ // registers -+ int const max_warps_per_sm_registers = -+ cuda_max_warps_per_sm_registers(properties, attributes); -+ -+ // Constrain the number of blocks to respect the maximum number of warps per -+ // SM On face value this should be an equality, but due to the warp -+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the -+ // left-hand-side of this comparison can overshoot what the hardware allows -+ // based on register counts alone -+ while ((max_blocks_regs * block_size / properties.warpSize) > -+ max_warps_per_sm_registers) -+ max_blocks_regs--; - - // Limits due to shared memory/SM - size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor; -@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, - LaunchBounds{}); - } - -+template -+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr, -+ LaunchBounds) { -+ auto const& prop = Kokkos::Cuda().cuda_device_prop(); -+ -+ // Thin version of cuda_get_opt_block_size for cases where there is no shared -+ // memory -+ auto 
const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; }; -+ -+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem, -+ LaunchBounds{}); -+} -+ - } // namespace Impl - } // namespace Kokkos - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -index 5c4c3a7d39..170183ca0a 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, - // The granularity of register allocation is chunks of 256 registers per warp - // -> 8 registers per thread - const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); -- const size_t max_blocks_regs = -+ size_t max_blocks_regs = - regs_per_sm / (allocated_regs_per_thread * block_size); - -+ // Compute the maximum number of warps as a function of the number of -+ // registers -+ const size_t max_warps_per_sm_registers = -+ cuda_max_warps_per_sm_registers(device_props, func_attr); -+ -+ // Constrain the number of blocks to respect the maximum number of warps per -+ // SM On face value this should be an equality, but due to the warp -+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the -+ // left-hand-side of this comparison can overshoot what the hardware allows -+ // based on register counts alone -+ while ((max_blocks_regs * block_size / device_props.warpSize) > -+ max_warps_per_sm_registers) -+ max_blocks_regs--; -+ - // Compute how many threads per sm we actually want - const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; - // only allocate multiples of warp size -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -index 0e348c092a..7e4f62f12e 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -+++ 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -@@ -67,6 +67,34 @@ - namespace Kokkos { - namespace Impl { - -+template -+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { -+ cudaFuncAttributes attr = -+ CudaParallelLaunch::get_cuda_func_attributes(); -+ auto const& prop = pol.space().cuda_device_prop(); -+ -+ // Limits due to registers/SM, MDRange doesn't have -+ // shared memory constraints -+ int const optimal_block_size = -+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{}); -+ -+ // Compute how many blocks of this size we can launch, based on warp -+ // constraints -+ int const max_warps_per_sm_registers = -+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr); -+ int const max_num_threads_from_warps = -+ max_warps_per_sm_registers * prop.warpSize; -+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size; -+ -+ // Compute the total number of threads -+ int const max_threads_per_sm = optimal_block_size * max_num_blocks; -+ -+ return std::min( -+ max_threads_per_sm, -+ static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); -+} -+ - template - class ParallelFor, Kokkos::Cuda> { - public: -@@ -85,18 +113,7 @@ class ParallelFor, Kokkos::Cuda> { - public: - template - static int max_tile_size_product(const Policy& pol, const Functor&) { -- cudaFuncAttributes attr = -- CudaParallelLaunch::get_cuda_func_attributes(); -- auto const& prop = pol.space().cuda_device_prop(); -- // Limits due to registers/SM, MDRange doesn't have -- // shared memory constraints -- int const regs_per_sm = prop.regsPerMultiprocessor; -- int const regs_per_thread = attr.numRegs; -- int const max_threads_per_sm = regs_per_sm / regs_per_thread; -- return std::min( -- max_threads_per_sm, -- static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); -+ return max_tile_size_product_helper(pol, LaunchBounds{}); - } - Policy const& get_policy() const { return m_rp; } - inline __device__ void 
operator()() const { -@@ -258,17 +275,7 @@ class ParallelReduce, ReducerType, - public: - template - static int max_tile_size_product(const Policy& pol, const Functor&) { -- cudaFuncAttributes attr = -- CudaParallelLaunch::get_cuda_func_attributes(); -- auto const& prop = pol.space().cuda_device_prop(); -- // Limits due do registers/SM -- int const regs_per_sm = prop.regsPerMultiprocessor; -- int const regs_per_thread = attr.numRegs; -- int const max_threads_per_sm = regs_per_sm / regs_per_thread; -- return std::min( -- max_threads_per_sm, -- static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); -+ return max_tile_size_product_helper(pol, LaunchBounds{}); - } - Policy const& get_policy() const { return m_policy; } - inline __device__ void exec_range(reference_type update) const { diff --git a/lib/kokkos/kokkos_fix_5706_apply_last.diff b/lib/kokkos/kokkos_fix_5706_apply_last.diff deleted file mode 100644 index 5d298323fd..0000000000 --- a/lib/kokkos/kokkos_fix_5706_apply_last.diff +++ /dev/null @@ -1,63 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -index 170183ca0a..ba43e362bb 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker< - Impl::check_shmem_request(cuda_instance, shmem); - if (DriverType::Policy:: - experimental_contains_desired_occupancy) { -+ /* - int desired_occupancy = - driver.get_policy().impl_get_desired_occupancy().value(); - size_t block_size = block.x * block.y * block.z; - Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -- shmem, desired_occupancy); -+ shmem, desired_occupancy);*/ -+ Kokkos::Impl::throw_runtime_exception( -+ std::string("Cuda graph node creation FAILED:" -+ " occupancy requests are currently broken.")); - } - - void const* args[] = 
{&driver}; -@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker< - - if (!Impl::is_empty_launch(grid, block)) { - Impl::check_shmem_request(cuda_instance, shmem); -- if constexpr (DriverType::Policy:: -+ if (DriverType::Policy:: - experimental_contains_desired_occupancy) { -- int desired_occupancy = -+ /*int desired_occupancy = - driver.get_policy().impl_get_desired_occupancy().value(); - size_t block_size = block.x * block.y * block.z; - Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -- shmem, desired_occupancy); -+ shmem, desired_occupancy);*/ -+ Kokkos::Impl::throw_runtime_exception( -+ std::string("Cuda graph node creation FAILED:" -+ " occupancy requests are currently broken.")); - } - - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); -@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl< - - if (DriverType::Policy:: - experimental_contains_desired_occupancy) { -- int desired_occupancy = -+ /*int desired_occupancy = - driver.get_policy().impl_get_desired_occupancy().value(); - size_t block_size = block.x * block.y * block.z; - Impl::configure_shmem_preference< - DriverType, - Kokkos::LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -- shmem, desired_occupancy); -+ shmem, desired_occupancy);*/ -+ Kokkos::Impl::throw_runtime_exception( -+ std::string("Cuda graph node creation FAILED:" -+ " occupancy requests are currently broken.")); - } - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); diff --git a/potentials/HGa.msmeam b/potentials/HGa.msmeam new file mode 100644 index 0000000000..9f01501c16 --- /dev/null +++ b/potentials/HGa.msmeam @@ -0,0 +1,30 @@ +bkgd_dyn = 1 +emb_lin_neg = 1 +augt1=0 +ialloy=1 +rc = 5.9 +#H +attrac(1,1)=0.460 +repuls(1,1)=0.460 +Cmin(1,1,1)=1.3 # PuMS +Cmax(1,1,1)= 2.80 +nn2(1,1)=1 +#Ga +rho0(2) = 0.6 +attrac(2,2)=0.097 +repuls(2,2)=0.097 +nn2(2,2)=1 +#HGa +attrac(1,2)=0.300 +repuls(1,2)=0.300 +lattce(1,2)=l12 
+re(1,2)=3.19 +delta(1,2)=-0.48 +alpha(1,2)=6.6 +Cmin(1,1,2)=2.0 +Cmin(2,1,2)= 2.0 +Cmin(1,2,1)=2.0 +Cmin(2,2,1) = 1.4 +Cmin(1,2,2) = 1.4 +Cmin(1,1,2) = 1.4 +nn2(1,2)=1 diff --git a/potentials/library.msmeam b/potentials/library.msmeam new file mode 100644 index 0000000000..9937eaee08 --- /dev/null +++ b/potentials/library.msmeam @@ -0,0 +1,14 @@ +# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010 +# ms-meam data format May 2010 +# elt lat z ielement atwt +# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub +# - t0 t1 t2 t3 t1m t2m t3m rozero ibar +# NOTE: leading character cannot be a space + +'H' 'dim' 1.0 1 1.0079 +2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50 +1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0 + +'Ga4' 'fcc' 12.0 31 69.723 +4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97 +1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0 diff --git a/python/lammps/mliap/__init__.py b/python/lammps/mliap/__init__.py index c1a9752855..6e638ac360 100644 --- a/python/lammps/mliap/__init__.py +++ b/python/lammps/mliap/__init__.py @@ -32,7 +32,7 @@ if not pylib.Py_IsInitialized(): else: from .loader import load_model, load_unified, activate_mliappy try: - from .loader import load_model_kokkos, activate_mliappy_kokkos + from .loader import load_model_kokkos, load_unified_kokkos, activate_mliappy_kokkos except Exception as ee: # ignore import error, it means that the KOKKOS package was not included in LAMMPS pass diff --git a/python/lammps/mliap/loader.py b/python/lammps/mliap/loader.py index 940bd10f1f..558c69a7a9 100644 --- a/python/lammps/mliap/loader.py +++ b/python/lammps/mliap/loader.py @@ -75,7 +75,7 @@ def activate_mliappy(lmp): def activate_mliappy_kokkos(lmp): try: library = lmp.lib - module_names = ["mliap_model_python_couple_kokkos"] + module_names = ["mliap_model_python_couple_kokkos", "mliap_unified_couple_kokkos"] api_version = library.lammps_python_api_version() for module_name in 
module_names: @@ -118,3 +118,12 @@ def load_unified(model): ) from ie mliap_unified_couple.load_from_python(model) +def load_unified_kokkos(model): + try: + import mliap_unified_couple_kokkos + except ImportError as ie: + raise ImportError("ML-IAP python module must be activated before loading\n" + "the pair style. Call lammps.mliap.activate_mliappy(lmp)." + ) from ie + mliap_unified_couple_kokkos.load_from_python(model) + diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index f222613c3c..ae3dbf16c4 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -22,6 +22,7 @@ #include "memory.h" #include "neighbor.h" #include "remap_wrap.h" +#include "timer.h" #include #include @@ -326,15 +327,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() cfft[n++] = ZEROF; } + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -382,15 +391,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() debug_scalar(FFT,"PRE Convo / POST Remap"); debug_file(FFT,"pre.convo.post.remap"); #endif + + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -423,7 +441,16 @@ void 
*AmoebaConvolution::post_convolution_3d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = platform::walltime(); + + time_fft += time1 - time0; #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); @@ -465,8 +492,18 @@ void *AmoebaConvolution::post_convolution_4d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = platform::walltime(); + + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); debug_file(CFFT2,"post.convo.post.fft"); diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 99ad11ade4..bed65149ec 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers { int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out; int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft; bigint nfft_global; // nx * ny * nz - double *grid_brick_start; // lower left corner of (c)grid_brick data + FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int); ~AmoebaConvolution(); @@ -47,35 +47,37 @@ class AmoebaConvolution : protected Pointers { FFT_SCALAR *pre_convolution(); void *post_convolution(); - private: - int which; // caller name for convolution being performed - int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick - int nbrick_owned; // owned grid points in brick decomp - int nbrick_ghosts; // owned + ghost brick grid points - int ngrid_either; // max of nbrick_owned or nfft_owned + double time_fft; + + 
protected: + int which; // caller name for convolution being performed + int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick + int nbrick_owned; // owned grid points in brick decomp + int nbrick_ghosts; // owned + ghost brick grid points + int ngrid_either; // max of nbrick_owned or nfft_owned class Pair *amoeba; class FFT3d *fft1, *fft2; class Grid3d *gc; class Remap *remap; - double ***grid_brick; // 3d real brick grid with ghosts - double ****cgrid_brick; // 4d complex brick grid with ghosts + FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts + FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector - double *gc_buf1, *gc_buf2; // buffers for GridComm - double *remap_buf; // buffer for Remap + FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm + FFT_SCALAR *remap_buf; // buffer for Remap void allocate_grid(); void deallocate_grid(); void *zero_3d(); void *zero_4d(); FFT_SCALAR *pre_convolution_3d(); - FFT_SCALAR *pre_convolution_4d(); + virtual FFT_SCALAR *pre_convolution_4d(); void *post_convolution_3d(); - void *post_convolution_4d(); + virtual void *post_convolution_4d(); void procs2grid2d(int, int, int, int &, int &); // DEBUG diff --git a/src/AMOEBA/amoeba_dispersion.cpp b/src/AMOEBA/amoeba_dispersion.cpp index f3af921d85..cc283f22d2 100644 --- a/src/AMOEBA/amoeba_dispersion.cpp +++ b/src/AMOEBA/amoeba_dispersion.cpp @@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) d_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero(); // map atoms to grid @@ -294,7 +294,7 @@ void PairAmoeba::dispersion_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = d_kspace->pre_convolution(); + FFT_SCALAR 
*gridfft = d_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index a6724e2bb7..ecc20a198c 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -24,6 +24,7 @@ #include "math_special.h" #include "my_page.h" #include "neigh_list.h" +#include "timer.h" #include @@ -381,8 +382,6 @@ void PairAmoeba::induce() } } - // if (comm->me == 0) printf("CG iteration count = %d\n",iter); - // terminate the calculation if dipoles failed to converge // NOTE: could make this an error @@ -546,13 +545,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } } - // get the reciprocal space part of the mutual field - - if (polar_kspace_flag) umutual1(field,fieldp); + double time0, time1, time2; + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); // get the real space portion of the mutual field if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = platform::walltime(); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + time2 = platform::walltime(); // add the self-energy portion of the mutual field @@ -563,6 +568,11 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) fieldp[i][j] += term*uinp[i][j]; } } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -785,7 +795,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the reciprocal space part of the permanent field + double time0, time1, time2; + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + if (polar_kspace_flag) udirect1(field); + time1 = platform::walltime(); for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { @@ -796,6 +811,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the real 
space portion of the permanent field if (polar_rspace_flag) udirect2b(field,fieldp); + time2 = platform::walltime(); // get the self-energy portion of the permanent field @@ -806,6 +822,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) fieldp[i][j] += term*rpole[i][j+1]; } } + + // accumulate timing information + + time_direct_kspace += time1 - time0; + time_direct_rspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -842,18 +863,26 @@ void PairAmoeba::umutual1(double **field, double **fieldp) } } + double time0, time1; + // gridpre = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre = (double ****) ic_kspace->zero(); + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + grid_uind(fuind,fuinp,gridpre); + time1 = platform::walltime(); + time_grid_uind += (time1 - time0); + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = ic_kspace->pre_convolution(); + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation @@ -883,12 +912,18 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) ic_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get potential + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + time1 = platform::walltime(); + time_fphi_uind += (time1 - time0); + // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { @@ -1055,7 +1090,7 @@ void PairAmoeba::udirect1(double **field) // gridpre = my portion of 3d 
grid in brick decomp w/ ghost values // zeroed by setup() - double ***gridpre = (double ***) i_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero(); // map multipole moments to grid @@ -1064,7 +1099,7 @@ void PairAmoeba::udirect1(double **field) // pre-convolution operations including forward FFT // gridfft = my 1d portion of complex 3d grid in FFT decomp - double *gridfft = i_kspace->pre_convolution(); + FFT_SCALAR *gridfft = i_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1109,7 +1144,7 @@ void PairAmoeba::udirect1(double **field) // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) i_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_kspace.cpp b/src/AMOEBA/amoeba_kspace.cpp index da6483ef40..6d2fb64dd6 100644 --- a/src/AMOEBA/amoeba_kspace.cpp +++ b/src/AMOEBA/amoeba_kspace.cpp @@ -68,25 +68,23 @@ void PairAmoeba::moduli() int maxfft = MAX(nfft1,nfft2); maxfft = MAX(maxfft,nfft3); - double *array = new double[bsorder]; - double *bsarray = new double[maxfft]; + if (maxfft > _nfft_max) { + memory->destroy(_moduli_bsarray); + _nfft_max = maxfft; + memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray"); + } // compute and load the moduli values double x = 0.0; - bspline(x,bsorder,array); + bspline(x,bsorder,_moduli_array); - for (i = 0; i < maxfft; i++) bsarray[i] = 0.0; - for (i = 0; i < bsorder; i++) bsarray[i+1] = array[i]; + for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0; + for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i]; - dftmod(bsmod1,bsarray,nfft1,bsorder); - dftmod(bsmod2,bsarray,nfft2,bsorder); - dftmod(bsmod3,bsarray,nfft3,bsorder); - - // perform deallocation of local arrays - - delete[] array; - delete[] bsarray; + 
dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder); + dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder); + dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder); } /* ---------------------------------------------------------------------- @@ -525,7 +523,7 @@ void PairAmoeba::frac_to_cart() grid_mpole maps fractional atomic multipoles to PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_mpole(double **fmp, double ***grid) +void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -598,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid) the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_mpole(double ***grid, double **fphi) +void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi) { int i,j,k,m,ib,jb,kb; double v0,v1,v2,v3; @@ -742,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi) grid_uind maps fractional induced dipoles to the PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid) +void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -793,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid) fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, +void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, double **fdip_phi2, double **fdip_sum_phi) { int i,j,k,m,ib,jb,kb; @@ -1042,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, grid_disp maps dispersion coefficients to PME grid ------------------------------------------------------------------------- 
*/ -void PairAmoeba::grid_disp(double ***grid) +void PairAmoeba::grid_disp(FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb,itype,iclass; double v0,u0,t0; diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index f58395aa1c..a1503a91f3 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -21,6 +21,7 @@ #include "math_const.h" #include "math_special.h" #include "neigh_list.h" +#include "timer.h" #include @@ -55,6 +56,8 @@ void PairAmoeba::multipole() double qixx,qixy,qixz,qiyy,qiyz,qizz; double cii,dii,qii; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(MPOLE_LONG); @@ -78,13 +81,18 @@ void PairAmoeba::multipole() felec = electric / am_dielectric; + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + // compute the real space part of the Ewald summation if (mpole_rspace_flag) multipole_real(); + time1 = platform::walltime(); // compute the reciprocal space part of the Ewald summation if (mpole_kspace_flag) multipole_kspace(); + time2 = platform::walltime(); // compute the Ewald self-energy term over all the atoms @@ -109,6 +117,11 @@ void PairAmoeba::multipole() e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0)); empole += e; } + + // accumulate timing information + + time_mpole_rspace += time1 - time0; + time_mpole_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -361,6 +374,9 @@ void PairAmoeba::multipole_real() bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2; } for (k = 0; k < 6; k++) bn[k] *= felec; + //if (i == 0 && j < 10) { + // printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]); + //} // find damped multipole intermediates and energy value @@ -404,6 +420,8 @@ void PairAmoeba::multipole_real() term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; + + // find damped multipole 
intermediates for force and torque de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + @@ -444,6 +462,7 @@ void PairAmoeba::multipole_real() term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9); term6 = 4.0 * rr7; + } empole += e; @@ -482,6 +501,7 @@ void PairAmoeba::multipole_real() tq[i][2] += ttmi[2]; // increment force-based gradient and torque on second site + // commenting out j parts for DEBUGGING f[j][0] += frcx; f[j][1] += frcy; @@ -638,7 +658,7 @@ void PairAmoeba::multipole_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) m_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero(); // map atoms to grid @@ -647,7 +667,7 @@ void PairAmoeba::multipole_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = m_kspace->pre_convolution(); + FFT_SCALAR *gridfft = m_kspace->pre_convolution(); // --------------------- // convolution operation @@ -718,7 +738,7 @@ void PairAmoeba::multipole_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) m_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index 4d143c7a22..3c51426beb 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -21,6 +21,7 @@ #include "math_const.h" #include "math_special.h" #include "neigh_list.h" +#include "timer.h" #include #include @@ -55,6 +56,8 @@ void PairAmoeba::polar() double fix[3],fiy[3],fiz[3]; double tep[3]; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(POLAR_LONG); @@ -76,11 +79,16 @@ void PairAmoeba::polar() // compute the real space part of the dipole interactions + 
if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + if (polar_rspace_flag) polar_real(); + time1 = platform::walltime(); // compute the reciprocal space part of dipole interactions if (polar_kspace_flag) polar_kspace(); + time2 = platform::walltime(); // compute the Ewald self-energy torque and virial terms @@ -133,6 +141,11 @@ void PairAmoeba::polar() virpolar[4] -= vxz; virpolar[5] -= vyz; } + + // accumulate timing information + + time_polar_rspace += time1 - time0; + time_polar_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -382,7 +395,7 @@ void PairAmoeba::polar_real() factor_uscale = 1.0; } } - + //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale); r = sqrt(r2); ck = rpole[j][0]; dkx = rpole[j][1]; @@ -597,7 +610,6 @@ void PairAmoeba::polar_real() dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir; dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir; dufld[i][5] += zr*tiz5 + zr*zr*tuir; - dufld[j][0] -= xr*tkx5 + xr*xr*tukr; dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr; dufld[j][2] -= yr*tky5 + yr*yr*tukr; @@ -855,6 +867,7 @@ void PairAmoeba::polar_real() frcx = -2.0 * depx; frcy = -2.0 * depy; frcz = -2.0 * depz; + } // get the dtau/dr terms used for mutual polarization force @@ -1327,7 +1340,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1336,7 +1349,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1386,7 +1399,7 @@ void PairAmoeba::polar_kspace() // post-convolution 
operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) p_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); // get potential @@ -1419,7 +1432,7 @@ void PairAmoeba::polar_kspace() // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre2 = (double ****) pc_kspace->zero(); + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); // map 2 values to grid @@ -1428,7 +1441,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = pc_kspace->pre_convolution(); + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1451,7 +1464,7 @@ void PairAmoeba::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) pc_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); // get potential @@ -1857,7 +1870,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1887,7 +1900,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1896,7 +1909,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = 
p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1953,7 +1966,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1962,12 +1975,12 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // gridfft1 = copy of first FFT int nfft_owned = p_kspace->nfft_owned; - memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double)); + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); // assign ??? to the PME grid @@ -1982,7 +1995,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1991,7 +2004,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/fix_amoeba_bitorsion.cpp b/src/AMOEBA/fix_amoeba_bitorsion.cpp index aeba26fb4d..cb8c62819d 100644 --- a/src/AMOEBA/fix_amoeba_bitorsion.cpp +++ b/src/AMOEBA/fix_amoeba_bitorsion.cpp @@ -194,8 +194,8 @@ void FixAmoebaBiTorsion::init() // error check that PairAmoeba or PairHiippo exist pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); if (!pair) error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out 
pair amoeba/hippo"); diff --git a/src/AMOEBA/improper_amoeba.cpp b/src/AMOEBA/improper_amoeba.cpp index b1e403da78..cb9db01b59 100644 --- a/src/AMOEBA/improper_amoeba.cpp +++ b/src/AMOEBA/improper_amoeba.cpp @@ -285,8 +285,9 @@ void ImproperAmoeba::init_style() // check if PairAmoeba disabled improper terms Pair *pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); + if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo"); int tmp; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index e8b7a18dba..0812fe43f0 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -29,6 +29,7 @@ #include "my_page.h" #include "neigh_list.h" #include "neighbor.h" +#include "timer.h" #include "update.h" #include @@ -47,6 +48,7 @@ enum{MUTUAL,OPT,TCG,DIRECT}; enum{GEAR,ASPC,LSQR}; #define DELTASTACK 16 +#define DEBUG_AMOEBA 0 /* ---------------------------------------------------------------------- */ @@ -85,6 +87,10 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp) cmp = fmp = nullptr; cphi = fphi = nullptr; + _moduli_array = nullptr; + _moduli_bsarray = nullptr; + _nfft_max = 0; + poli = nullptr; conj = conjp = nullptr; vec = vecp = nullptr; @@ -227,6 +233,9 @@ PairAmoeba::~PairAmoeba() memory->destroy(fphidp); memory->destroy(cphidp); + memory->destroy(_moduli_array); + memory->destroy(_moduli_bsarray); + memory->destroy(thetai1); memory->destroy(thetai2); memory->destroy(thetai3); @@ -349,12 +358,22 @@ void PairAmoeba::compute(int eflag, int vflag) if (update->ntimestep <= update->beginstep+1) { time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0; time_induce = time_polar = time_qxfer = 0.0; + + time_mpole_rspace = time_mpole_kspace = 0.0; + time_direct_rspace = time_direct_kspace = 0.0; + time_mutual_rspace = time_mutual_kspace = 0.0; + 
time_polar_rspace = time_polar_kspace = 0.0; + + time_grid_uind = time_fphi_uind = 0.0; + if (ic_kspace) { + ic_kspace->time_fft = 0.0; + } } double time0,time1,time2,time3,time4,time5,time6,time7,time8; - MPI_Barrier(world); - time0 = MPI_Wtime(); + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); // if reneighboring step: // augment neighbor list to include 1-5 neighbor flags @@ -410,8 +429,7 @@ void PairAmoeba::compute(int eflag, int vflag) comm->forward_comm(this); if (amoeba) pbc_xred(); - - time1 = MPI_Wtime(); + time1 = platform::walltime(); // ---------------------------------------- // compute components of force field @@ -420,22 +438,22 @@ void PairAmoeba::compute(int eflag, int vflag) // buffered 14-7 Vdwl, pairwise if (amoeba && hal_flag) hal(); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // Pauli repulsion, pairwise if (!amoeba && repulse_flag) repulsion(); - time3 = MPI_Wtime(); + time3 = platform::walltime(); // Ewald dispersion, pairwise and long range if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion(); - time4 = MPI_Wtime(); + time4 = platform::walltime(); // multipole, pairwise and long range if (mpole_rspace_flag || mpole_kspace_flag) multipole(); - time5 = MPI_Wtime(); + time5 = platform::walltime(); // induced dipoles, interative CG relaxation // communicate induce() output values needed by ghost atoms @@ -445,17 +463,17 @@ void PairAmoeba::compute(int eflag, int vflag) cfstyle = INDUCE; comm->forward_comm(this); } - time6 = MPI_Wtime(); + time6 = platform::walltime(); // dipoles, pairwise and long range if (polar_rspace_flag || polar_kspace_flag) polar(); - time7 = MPI_Wtime(); + time7 = platform::walltime(); // charge transfer, pairwise if (!amoeba && qxfer_flag) charge_transfer(); - time8 = MPI_Wtime(); + time8 = platform::walltime(); // store energy components for output by compute pair command @@ -518,6 +536,44 @@ void PairAmoeba::finish() 
MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_qxfer = ave/comm->nprocs; + #if DEBUG_AMOEBA + // real-space/kspace breakdown + MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_polar_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_polar_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_grid_uind = ave/comm->nprocs; + + MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_fphi_uind = ave/comm->nprocs; + + double time_mutual_fft = 0; + if (ic_kspace) time_mutual_fft = ic_kspace->time_fft; + MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_fft = ave/comm->nprocs; + #endif // DEBUG_AMOEBA + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -534,8 +590,27 @@ void PairAmoeba::finish() utils::logmesg(lmp," Induce time: {:<12.6g} {:6.2f}%\n", time_induce, time_induce/time_total); utils::logmesg(lmp," Polar time: {:<12.6g} {:6.2f}%\n", time_polar, time_polar/time_total); if (!amoeba) - utils::logmesg(lmp," Qxfer time: {:<12.6g} {:6.2f}%\n", time_qxfer, time_qxfer/time_total); - utils::logmesg(lmp," Total time: {:<12.6g}\n",time_total 
* 100.0); + utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); + utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); + + #if DEBUG_AMOEBA + double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace; + double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace; + + utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); + utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total); + utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); + utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); + #endif } } @@ -2320,6 +2395,8 @@ void PairAmoeba::grow_local() firstneigh_pcpc = (double **) memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc"); } + + memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array"); } /* 
---------------------------------------------------------------------- diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 847764244b..cdeee6c95f 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -82,6 +82,12 @@ class PairAmoeba : public Pair { double time_init, time_hal, time_repulse, time_disp; double time_mpole, time_induce, time_polar, time_qxfer; + double time_mpole_rspace, time_mpole_kspace; + double time_direct_rspace, time_direct_kspace; + double time_mutual_rspace, time_mutual_kspace; + double time_polar_rspace, time_polar_kspace; + double time_grid_uind, time_fphi_uind; + // energy/virial components double ehal, erepulse, edisp, epolar, empole, eqxfer; @@ -324,8 +330,12 @@ class PairAmoeba : public Pair { double *qfac; // convoulution pre-factors double *gridfft1; // copy of p_kspace FFT grid - double **cmp, **fmp; // Cartesian and fractional multipoles - double **cphi, **fphi; + double **cmp,**fmp; // Cartesian and fractional multipoles + double **cphi,**fphi; + + double *_moduli_array; // buffers for moduli + double *_moduli_bsarray; + int _nfft_max; // params for current KSpace solve and FFT being worked on @@ -335,8 +345,12 @@ class PairAmoeba : public Pair { double ctf[10][10]; // indices NOT flipped vs Fortran double ftc[10][10]; // indices NOT flipped vs Fortran - class AmoebaConvolution *m_kspace, *p_kspace, *pc_kspace, *d_kspace; - class AmoebaConvolution *i_kspace, *ic_kspace; + class AmoebaConvolution *m_kspace; // multipole KSpace + class AmoebaConvolution *p_kspace; // polar KSpace + class AmoebaConvolution *pc_kspace; + class AmoebaConvolution *d_kspace; // dispersion KSpace + class AmoebaConvolution *i_kspace; // induce KSpace + class AmoebaConvolution *ic_kspace; // FFT grid size factors @@ -347,33 +361,33 @@ class PairAmoeba : public Pair { void hal(); - void repulsion(); - void damprep(double, double, double, double, double, double, double, double, int, double, double, - double *); + virtual void 
repulsion(); + void damprep(double, double, double, double, double, double, double, double, + int, double, double, double *); void dispersion(); - void dispersion_real(); + virtual void dispersion_real(); void dispersion_kspace(); void multipole(); - void multipole_real(); + virtual void multipole_real(); void multipole_kspace(); void polar(); void polar_energy(); - void polar_real(); - void polar_kspace(); + virtual void polar_real(); + virtual void polar_kspace(); void damppole(double, int, double, double, double *, double *, double *); - void induce(); + virtual void induce(); void ulspred(); - void ufield0c(double **, double **); + virtual void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); void dfield0c(double **, double **); - void umutual1(double **, double **); - void umutual2b(double **, double **); + virtual void umutual1(double **, double **); + virtual void umutual2b(double **, double **); void udirect1(double **); - void udirect2b(double **, double **); + virtual void udirect2b(double **, double **); void dampmut(double, double, double, double *); void dampdir(double, double, double, double *, double *); void cholesky(int, double *, double *); @@ -393,11 +407,11 @@ class PairAmoeba : public Pair { void fphi_to_cphi(double **, double **); void frac_to_cart(); - void grid_mpole(double **, double ***); - void fphi_mpole(double ***, double **); - void grid_uind(double **, double **, double ****); - void fphi_uind(double ****, double **, double **, double **); - void grid_disp(double ***); + void grid_mpole(double **, FFT_SCALAR ***); + void fphi_mpole(FFT_SCALAR ***, double **); + void grid_uind(double **, double **, FFT_SCALAR ****); + virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **); + void grid_disp(FFT_SCALAR ***); void kewald(); void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &, diff --git a/src/Depend.sh b/src/Depend.sh index 
10d612f490..470a0a2a2b 100755 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -45,6 +45,10 @@ depend () { # add one if statement per parent package # add one depend() call per child package that depends on that parent +if (test $1 = "AMOEBA") then + depend GPU +fi + if (test $1 = "ASPHERE") then depend GPU depend OPENMP diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index d28e6260f8..19e89498fc 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -28,6 +28,8 @@ action () { # list of files with optional dependcies +action amoeba_convolution_gpu.cpp amoeba_convolution.cpp +action amoeba_convolution_gpu.h amoeba_convolution.cpp action fix_gpu.cpp action fix_gpu.h action fix_nve_gpu.h @@ -41,6 +43,8 @@ action fix_npt_gpu.cpp action fix_nve_asphere_gpu.h fix_nve_asphere.h action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp action gpu_extra.h +action pair_amoeba_gpu.cpp pair_amoeba.cpp +action pair_amoeba_gpu.h pair_amoeba.h action pair_beck_gpu.cpp pair_beck.cpp action pair_beck_gpu.h pair_beck.h action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp @@ -89,6 +93,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp action pair_gauss_gpu.h pair_gauss.h action pair_gayberne_gpu.cpp pair_gayberne.cpp action pair_gayberne_gpu.h pair_gayberne.cpp +action pair_hippo_gpu.cpp pair_hippo.cpp +action pair_hippo_gpu.h pair_hippo.cpp action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp action pair_lj96_cut_gpu.h pair_lj96_cut.h action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp @@ -113,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h action pair_lj_cut_gpu.cpp action pair_lj_cut_gpu.h +action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp +action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp +action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp +action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp action pair_lj_smooth_gpu.cpp 
pair_lj_smooth.cpp action pair_lj_smooth_gpu.h pair_lj_smooth.cpp action pair_lj_expand_gpu.cpp @@ -155,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp action pppm_gpu.h pppm.cpp action pair_ufm_gpu.cpp pair_ufm.cpp action pair_ufm_gpu.h pair_ufm.h -action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp -action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp -action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp -action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp # edit 2 Makefile.package files to include/exclude package info diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp new file mode 100644 index 0000000000..908c9e409c --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -0,0 +1,181 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "amoeba_convolution_gpu.h" +#include "comm.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "grid3d.h" + +using namespace LAMMPS_NS; + +// DEBUG + +#define DEBUG_AMOEBA 0 +#if DEBUG_AMOEBA +char *labels[7] = + {(char *) "MPOLE_GRID", (char *) "POLAR_GRID", + (char *) "POLAR_GRIDC", (char *) "DISP_GRID", + (char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"}; + +enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2}; +#endif +// END DEBUG + +#define SCALE 0 + +//#define USE_AMOEBA_FFT +#ifdef USE_AMOEBA_FFT +// External functions from GPU library +int amoeba_setup_fft(const int size, const int numel, const int element_type); +int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode); +#endif + +/* ---------------------------------------------------------------------- + partition an FFT grid across processors + both for a brick and FFT x pencil decomposition + nx,nz,nz = global FFT grid size + order = size of stencil in each dimension that maps atoms to grid + adapted from PPPM::set_grid_local() +------------------------------------------------------------------------- */ + +AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, + int nx_caller, int ny_caller, int nz_caller, + int order_caller, int which_caller) : + AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller, + which_caller) +{ + +} + +/* ---------------------------------------------------------------------- + perform pre-convolution grid operations for 4d cgrid_brick array +------------------------------------------------------------------------- */ + +FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() +{ + int ix,iy,iz,n; + + // reverse comm for 4d brick grid + ghosts + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d"); +#endif + + gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR), + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + 
+#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d"); + debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d"); +#endif + // copy owned 4d brick grid values to FFT grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cfft[n++] = cgrid_brick[iz][iy][ix][0]; + cfft[n++] = cgrid_brick[iz][iy][ix][1]; + } + + // remap FFT grid from brick to x pencil partitioning + // NOTE: could just setup FFT to start from brick decomp and skip remap + + remap->perform(cfft,cfft,remap_buf); + +#if DEBUG_AMOEBA + debug_scalar(FFT,"PRE Convo / POST Remap"); + debug_file(FFT,"pre.convo.post.remap"); +#endif + + double time0,time1; + + MPI_Barrier(world); + time0 = platform::walltime(); + + // perform forward FFT + + #ifdef USE_AMOEBA_FFT + amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD); + #else + fft1->compute(cfft,cfft,FFT3d::FORWARD); + #endif + + time1 = platform::walltime(); + + time_fft += time1 - time0; + + if (SCALE) { + double scale = 1.0/nfft_global; + for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; + } + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"PRE Convo / POST FFT"); + debug_file(CFFT1,"pre.convo.post.fft"); +#endif + return cfft; +} + +/* ---------------------------------------------------------------------- + perform post-convolution grid operations for 4d cgrid_brick array +------------------------------------------------------------------------- */ + +void *AmoebaConvolutionGPU::post_convolution_4d() +{ + int ix,iy,iz,n; + + // perform backward FFT + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"POST Convo / PRE FFT"); + debug_file(CFFT1,"post.convo.pre.fft"); +#endif + + double time0,time1; + + MPI_Barrier(world); + time0 = platform::walltime(); + + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + + time1 = platform::walltime(); + + time_fft += time1 - time0; + +#if DEBUG_AMOEBA + debug_scalar(CFFT2,"POST Convo / POST FFT"); + 
debug_file(CFFT2,"post.convo.post.fft"); +#endif + // copy 1d complex values into 4d complex grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cgrid_brick[iz][iy][ix][0] = cfft[n++]; + cgrid_brick[iz][iy][ix][1] = cfft[n++]; + } + + // forward comm to populate ghost grid values + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d"); + debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d"); +#endif + gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR), + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + + return (void *) cgrid_brick; +} diff --git a/src/GPU/amoeba_convolution_gpu.h b/src/GPU/amoeba_convolution_gpu.h new file mode 100644 index 0000000000..4286f2155f --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.h @@ -0,0 +1,32 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H +#define LMP_AMOEBA_CONVOLUTION_GPU_H + +#include "amoeba_convolution.h" + + +namespace LAMMPS_NS { + +class AmoebaConvolutionGPU : public AmoebaConvolution { + public: + AmoebaConvolutionGPU(class LAMMPS *, class Pair *, int, int, int, int, int); + + FFT_SCALAR *pre_convolution_4d() override; + void *post_convolution_4d() override; + +}; + +} // namespace LAMMPS_NS +#endif diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 97f22da0a7..23191c12c8 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -131,7 +131,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : _gpu_mode = GPU_NEIGH; _particle_split = 1.0; int nthreads = 0; - int newtonflag = 0; + int newtonflag = force->newton_pair; int threads_per_atom = -1; double binsize = 0.0; char *opencl_args = nullptr; @@ -360,6 +360,8 @@ double FixGPU::memory_usage() return bytes; } +/* ---------------------------------------------------------------------- */ + double FixGPU::binsize(const double subx, const double suby, const double subz, const int nlocal, const double cut) { diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp new file mode 100644 index 0000000000..fd423486fd --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -0,0 +1,2067 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern/UChicago) +------------------------------------------------------------------------- */ + +#include "pair_amoeba_gpu.h" + +#include "amoeba_convolution_gpu.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "fix_store_peratom.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "math_const.h" +#include "memory.h" +#include "my_page.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" +#include + +using namespace LAMMPS_NS; +using namespace MathConst; + +// same as in amoeba_induce.cpp +enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; +enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +// same as in pair_amoeba.cpp +enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye + +// External functions from cuda library for atom decomposition + +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int* host_amtype2class, + const double *host_special_hal, const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const 
double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale); +void amoeba_gpu_clear(); + +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr); + +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr); + +void amoeba_gpu_update_fieldp(void **fieldp_ptr); + +void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int 
nxlo_out, const int nxhi_out); + +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); + +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi, + const double felec); + +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tq_ptr); + +double amoeba_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tq_pinned = nullptr; + + gpu_hal_ready = false; // true for AMOEBA when ready + gpu_repulsion_ready = false; // always false for AMOEBA + gpu_dispersion_real_ready = false; // always false for AMOEBA + gpu_multipole_real_ready = true; // need to be true for precompute() + gpu_udirect2b_ready = true; + gpu_umutual1_ready = true; + gpu_fphi_uind_ready = true; + gpu_umutual2b_ready = true; + gpu_polar_real_ready = true; // need to be true for copying data from device back to host + + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairAmoebaGPU::~PairAmoebaGPU() +{ + amoeba_gpu_clear(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::init_style() +{ + PairAmoeba::init_style(); + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + 
double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int mnf = 5e-2 * neighbor->oneatom; + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + pdamp, thole, dirdamp, amtype2class, special_hal, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, csix, adisp, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + maxspecial15, cell_size, gpu_mode, screen, + polar_dscale, polar_uscale); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + // replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + } +} + +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh; + + double 
sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + + amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } else { + auto *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } +} + +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to call reverse_comm() for crstyle = FIELD; 
+------------------------------------------------------------------------- */ + +void PairAmoebaGPU::induce() +{ + bool done; + int i,j,m,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // owned atoms + + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // need reverse_comm if dfield0c (i.e. udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } + + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + // NOTE: this is for ic_kspace, and thetai[1-3] + + if (ic_kspace) + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
+ // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < 
nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += conjp[i][j]*vecp[i][j]; + sum += 
rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; j++) { + uind[i][j] += 
term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (comm->me == 0) + error->warning(FLERR,"AMOEBA induced dipoles did not converge"); + } + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] = upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b(double **field, double **fieldp) +{ + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + + int inum; + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + amoeba_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom 
at a time in real-space double loop over atoms & neighs + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). + // udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + if (acc_float) { + auto field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } + + +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b_cpu() +{ + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti; + double pgamma; + double damp,expdamp; + double scale3,scale5; + double scalek; + double bn[4],bcn[3]; + double factor_uscale; + + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + + 
// neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? + + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - 
(1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } else { + if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::ufield0c(double **field, double **fieldp) +{ + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + + // get the real space portion of the mutual field first + + double time0, time1, time2; + + MPI_Barrier(world); + time0 = platform::walltime(); + + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = platform::walltime(); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + time2 = platform::walltime(); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } + + // accumulate the field and 
fieldp values from the real-space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + amoeba_gpu_update_fieldp(&fieldp_pinned); + + int inum = atom->nlocal; + if (acc_float) { + auto field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } + + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; +} + +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual1(double **field, double **fieldp) +{ + int m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (int j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + for (int i = 0; i 
< nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } + + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); + + // map 2 values to grid + + double time0, time1; + MPI_Barrier(world); + time0 = platform::walltime(); + + grid_uind(fuind,fuinp,gridpre); + + time1 = platform::walltime(); + time_grid_uind += (time1 - time0); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); + + // get potential + + MPI_Barrier(world); + time0 = platform::walltime(); + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + time1 = platform::walltime(); + time_fphi_uind += (time1 - 
time0); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } + +} + +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + amoeba_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += 
nlocal; + } + } + + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + } + +} + +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + 
amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); +} + +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff and aewald for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + + amoeba_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } else { + auto *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } +} + +/* ---------------------------------------------------------------------- + polar_kspace = KSpace portion of induced dipole polarization + adapted from Tinker eprecip1() routine + same as PairAmoeba, except that fphi_uind() is reimplemented here + ------------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_kspace() +{ + int i,j,k,m,n; + int nhalf1,nhalf2,nhalf3; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + int 
j1,j2,j3; + int ix,iy,iz; + double eterm,felec; + double r1,r2,r3; + double h1,h2,h3; + double f1,f2,f3; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double volterm,denom; + double hsq,expterm; + double term,pterm; + double vterm,struc2; + double tep[3]; + double fix[3],fiy[3],fiz[3]; + double cphid[4],cphip[4]; + double a[3][3]; // indices not flipped vs Fortran + + bool gpu_fphi_mpole_ready = true; + + // indices into the electrostatic field array + // decremented by 1 versus Fortran + + int deriv1[10] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; + int deriv2[10] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; + int deriv3[10] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // owned atoms + + double **x = atom->x; + double **f = atom->f; + int nlocal = atom->nlocal; + + double volbox = domain->prd[0] * domain->prd[1] * domain->prd[2]; + pterm = pow((MY_PI/aewald),2.0); + volterm = MY_PI * volbox; + + // initialize variables required for the scalar summation + + felec = electric / am_dielectric; + + // remove scalar sum virial from prior multipole FFT + // can only do this if multipoles were computed with same aeewald = apewald + // else need to re-compute it via new long-range solve + + nfft1 = p_kspace->nx; + nfft2 = p_kspace->ny; + nfft3 = p_kspace->nz; + bsorder = p_kspace->order; + + nhalf1 = (nfft1+1) / 2; + nhalf2 = (nfft2+1) / 2; + nhalf3 = (nfft3+1) / 2; + + nxlo = p_kspace->nxlo_fft; + nxhi = p_kspace->nxhi_fft; + nylo = p_kspace->nylo_fft; + nyhi = p_kspace->nyhi_fft; + nzlo = p_kspace->nzlo_fft; + nzhi = p_kspace->nzhi_fft; + + // use previous results or compute new qfac and convolution + + if (aewald == aeewald) { + vxx = -vmsave[0]; + vyy = -vmsave[1]; + vzz = -vmsave[2]; + vxy = -vmsave[3]; + vxz = -vmsave[4]; + vyz = -vmsave[5]; + + } else { + + // setup stencil size and B-spline coefficients + + moduli(); + 
bspline_fill(); + + // allocate memory and make early host-device transfers + + // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill + if (gpu_fphi_mpole_ready) { + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, + thetai1, thetai2, thetai3, igrid, + p_kspace->nzlo_out, p_kspace->nzhi_out, + p_kspace->nylo_out, p_kspace->nyhi_out, + p_kspace->nxlo_out, p_kspace->nxhi_out); + } + + + // convert Cartesian multipoles to fractional coordinates + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // zero virial accumulation variables + + vxx = vyy = vzz = vxy = vxz = vyz = 0.0; + + // perform convolution on K-space points I own + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? 
k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + if (hsq) expterm = exp(term) / denom; + struc2 = gridfft[n]*gridfft[n] + gridfft[n+1]*gridfft[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx -= h1*h1*vterm - eterm; + vyy -= h2*h2*vterm - eterm; + vzz -= h3*h3*vterm - eterm; + vxy -= h1*h2*vterm; + vxz -= h1*h3*vterm; + vyz -= h2*h3*vterm; + } + + expterm = qfac[m++]; + gridfft[n] *= expterm; + gridfft[n+1] *= expterm; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 3d grid in brick decomp w/ ghost values + + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); + + // get potential + + if (!gpu_fphi_mpole_ready) { + fphi_mpole(gridpost,fphi); + + for (i = 0; i < nlocal; i++) { + for (k = 0; k < 20; k++) + fphi[i][k] *= felec; + } + + } else { + void* fphi_pinned = nullptr; + amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); + if (acc_float) { + auto _fphi_ptr = (float *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + } else { + auto _fphi_ptr = (double *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + } + } + + // convert field from fractional to Cartesian + + fphi_to_cphi(fphi,cphi); + } + + // convert Cartesian induced dipoles to fractional coordinates + + for (i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (i = 0; i < nlocal; i++) { + 
for (j = 0; j < 3; j++) { + fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; + fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; + } + } + + // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); + + // map 2 values to grid + + grid_uind(fuind,fuinp,gridpre2); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // use qfac values from above or from induce() + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); + + // get potential + + fphi_uind(gridpost,fphid,fphip,fphidp); + + // TODO: port the remaining loops to the GPU + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fphid[i][j]; + fphip[i][j] = felec * fphip[i][j]; + } + for (j = 0; j < 20; j++) + fphidp[i][j] = felec * fphidp[i][j]; + } + + // increment the dipole polarization gradient contributions + + for (i = 0; i < nlocal; i++) { + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + for (k = 0; k < 3; k++) { + j1 = deriv1[k+1]; + j2 = deriv2[k+1]; + j3 = deriv3[k+1]; + f1 += (fuind[i][k]+fuinp[i][k])*fphi[i][j1]; + f2 += (fuind[i][k]+fuinp[i][k])*fphi[i][j2]; + f3 += (fuind[i][k]+fuinp[i][k])*fphi[i][j3]; + if (poltyp == MUTUAL) { + f1 += fuind[i][k]*fphip[i][j1] + fuinp[i][k]*fphid[i][j1]; + f2 += fuind[i][k]*fphip[i][j2] + fuinp[i][k]*fphid[i][j2]; + f3 += fuind[i][k]*fphip[i][j3] + 
fuinp[i][k]*fphid[i][j3]; + } + } + for (k = 0; k < 10; k++) { + f1 += fmp[i][k]*fphidp[i][deriv1[k]]; + f2 += fmp[i][k]*fphidp[i][deriv2[k]]; + f3 += fmp[i][k]*fphidp[i][deriv3[k]]; + } + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + f[i][0] -= h1; + f[i][1] -= h2; + f[i][2] -= h3; + } + + // set the potential to be the induced dipole average + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + fphidp[i][j] *= 0.5; + } + + fphi_to_cphi(fphidp,cphidp); + + // get the fractional to Cartesian transformation matrix + + //frac_to_cart(); + + // increment the dipole polarization virial contributions + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (k = 1; k < 4; k++) { + cphid[j] += ftc[j][k]*fphid[i][k]; + cphip[j] += ftc[j][k]*fphip[i][k]; + } + } + + vxx -= cmp[i][1]*cphidp[i][1] + + 0.5*((uind[i][0]+uinp[i][0])*cphi[i][1]); + vyy -= cmp[i][2]*cphidp[i][2] + + 0.5*((uind[i][1]+uinp[i][1])*cphi[i][2]); + vzz -= cmp[i][3]*cphidp[i][3] + + 0.5*((uind[i][2]+uinp[i][2])*cphi[i][3]); + vxy -= 0.5*(cphidp[i][1]*cmp[i][2]+cphidp[i][2]*cmp[i][1]) + + 0.25*((uind[i][1]+uinp[i][1])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][2]); + vyz -= 0.5*(cphidp[i][2]*cmp[i][3]+cphidp[i][3]*cmp[i][2]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][2] + + (uind[i][1]+uinp[i][1])*cphi[i][3]); + vxz -= 0.5*(cphidp[i][1]*cmp[i][3]+cphidp[i][3]*cmp[i][1]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][3]); + + vxx -= 2.0*cmp[i][4]*cphidp[i][4] + cmp[i][7]*cphidp[i][7] + + cmp[i][8]*cphidp[i][8]; + vyy -= 2.0*cmp[i][5]*cphidp[i][5] + cmp[i][7]*cphidp[i][7] + + cmp[i][9]*cphidp[i][9]; + vzz -= 2.0*cmp[i][6]*cphidp[i][6] + cmp[i][8]*cphidp[i][8] + + cmp[i][9]*cphidp[i][9]; + vxy -= (cmp[i][4]+cmp[i][5])*cphidp[i][7] 
+ + 0.5*(cmp[i][7]*(cphidp[i][5]+cphidp[i][4]) + + cmp[i][8]*cphidp[i][9]+cmp[i][9]*cphidp[i][8]); + vyz -= (cmp[i][5]+cmp[i][6])*cphidp[i][9] + + 0.5*(cmp[i][9]*(cphidp[i][5]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][8]+cmp[i][8]*cphidp[i][7]); + vxz -= (cmp[i][4]+cmp[i][6])*cphidp[i][8] + + 0.5*(cmp[i][8]*(cphidp[i][4]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][9]+cmp[i][9]*cphidp[i][7]); + + if (poltyp == MUTUAL) { + vxx -= 0.5 * (cphid[1]*uinp[i][0]+cphip[1]*uind[i][0]); + vyy -= 0.5 * (cphid[2]*uinp[i][1]+cphip[2]*uind[i][1]); + vzz -= 0.5 * (cphid[3]*uinp[i][2]+cphip[3]*uind[i][2]); + vxy -= 0.25 * (cphid[1]*uinp[i][1]+cphip[1]*uind[i][1] + + cphid[2]*uinp[i][0]+cphip[2]*uind[i][0]); + vyz -= 0.25 * (cphid[2]*uinp[i][2]+cphip[2]*uind[i][2] + + cphid[3]*uinp[i][1]+cphip[3]*uind[i][1]); + vxz -= 0.25 * (cphid[1]*uinp[i][2]+cphip[1]*uind[i][2] + + cphid[3]*uinp[i][0]+cphip[3]*uind[i][0]); + } + } + + + // resolve site torques then increment forces and virial + + for (i = 0; i < nlocal; i++) { + tep[0] = cmp[i][3]*cphidp[i][2] - cmp[i][2]*cphidp[i][3] + + 2.0*(cmp[i][6]-cmp[i][5])*cphidp[i][9] + cmp[i][8]*cphidp[i][7] + + cmp[i][9]*cphidp[i][5]- cmp[i][7]*cphidp[i][8] - cmp[i][9]*cphidp[i][6]; + tep[1] = cmp[i][1]*cphidp[i][3] - cmp[i][3]*cphidp[i][1] + + 2.0*(cmp[i][4]-cmp[i][6])*cphidp[i][8] + cmp[i][7]*cphidp[i][9] + + cmp[i][8]*cphidp[i][6] - cmp[i][8]*cphidp[i][4] - cmp[i][9]*cphidp[i][7]; + tep[2] = cmp[i][2]*cphidp[i][1] - cmp[i][1]*cphidp[i][2] + + 2.0*(cmp[i][5]-cmp[i][4])*cphidp[i][7] + cmp[i][7]*cphidp[i][4] + + cmp[i][9]*cphidp[i][8] - cmp[i][7]*cphidp[i][5] - cmp[i][8]*cphidp[i][9]; + + torque2force(i,tep,fix,fiy,fiz,f); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx += 
xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy += yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz += zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy += 0.5*(yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vyz += 0.5*(zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + vxz += 0.5*(zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + } + + // account for dipole response terms in the OPT method + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (k = 0; k < optorder; k++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fopt[i][k][j]; + fphip[i][j] = felec * foptp[i][k][j]; + } + + for (m = 0; m < optorder-k; m++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[0][j]*uopt[i][m][0] + a[1][j]*uopt[i][m][1] + + a[2][j]*uopt[i][m][2]; + fuinp[i][j] = a[0][j]*uoptp[i][m][0] + a[1][j]*uoptp[i][m][1] + + a[2][j]*uoptp[i][m][2]; + } + + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + + for (j = 0; j < 3; j++) { + j1 = deriv1[j+1]; + j2 = deriv2[j+1]; + j3 = deriv3[j+1]; + f1 += fuind[i][j]*fphip[i][j1] + fuinp[i][j]*fphid[i][j1]; + f2 += fuind[i][j]*fphip[i][j2] + fuinp[i][j]*fphid[i][j2]; + f3 += fuind[i][j]*fphip[i][j3] + fuinp[i][j]*fphid[i][j3]; + } + + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + + f[i][0] -= copm[k+m+1]*h1; + f[i][1] -= copm[k+m+1]*h2; + f[i][2] -= copm[k+m+1]*h3; + + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (j1 = 1; j1 < 4; j1++) { + cphid[j] += ftc[j][j1]*fphid[i][j1]; + cphip[j] += ftc[j][j1]*fphip[i][j1]; + } + } + + vxx -= 0.5*copm[k+m+1] * + (cphid[1]*uoptp[i][m][0] + cphip[1]*uopt[i][m][0]); + vyy -= 0.5*copm[k+m+1] * + (cphid[2]*uoptp[i][m][1]+ cphip[2]*uopt[i][m][1]); + vzz -= 0.5*copm[k+m+1] * + (cphid[3]*uoptp[i][m][2]+ cphip[3]*uopt[i][m][2]); 
+ vxy -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][1]+ cphip[1]*uopt[i][m][1]+ + cphid[2]*uoptp[i][m][0]+ cphip[2]*uopt[i][m][0]); + vyz -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][2]+ cphip[1]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][0]+ cphip[3]*uopt[i][m][0]); + vxz -= 0.25*copm[k+m+1] * + (cphid[2]*uoptp[i][m][2]+ cphip[2]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][1]+ cphip[3]*uopt[i][m][1]); + } + } + } + } + + // assign permanent and induced multipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] += uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); + + // assign induced dipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] += uind[i][j-1] - uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors + + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; 
i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + + // assign only the induced dipoles to the PME grid + // and perform the 3-D FFT forward transformation + // NOTE: why is there no inverse FFT in this section? + + if (poltyp == DIRECT || poltyp == TCG) { + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + cmp[i][j] = 0.0; + for (j = 1; j < 4; j++) + cmp[i][j] = uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); + + // assign ??? 
to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] = uind[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + } + + // increment the total internal virial tensor components + + if (vflag_global) { + virpolar[0] -= vxx; + virpolar[1] -= vyy; + virpolar[2] -= vzz; + virpolar[3] -= vxy; + virpolar[4] -= vxz; + virpolar[5] -= vyz; + } +} + +/* ---------------------------------------------------------------------- + compute atom forces from torques +------------------------------------------------------------------------- */ + +template 
+void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],_tq[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairAmoebaGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + amoeba_gpu_bytes(); +} diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h new file mode 100644 index 0000000000..be53f7ef50 --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.h @@ -0,0 +1,72 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS 
Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(amoeba/gpu,PairAmoebaGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_AMOEBA_GPU_H +#define LMP_PAIR_AMOEBA_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairAmoebaGPU : public PairAmoeba { + public: + PairAmoebaGPU(LAMMPS *lmp); + ~PairAmoebaGPU() override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void induce() override; + + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; + void polar_kspace() override; + + private: + int gpu_mode; + double cpu_time; + void *tq_pinned; + void *fieldp_pinned; + bool acc_float; + + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; + bool gpu_multipole_real_ready; + bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; + + void udirect2b_cpu(); + + template + void compute_force_from_torque(const numtyp*, double**, double*); +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp new file mode 100644 index 0000000000..9d286d5db7 --- /dev/null +++ b/src/GPU/pair_hippo_gpu.cpp @@ -0,0 +1,1494 @@ +// clang-format off 
+/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern/UChicago) +------------------------------------------------------------------------- */ + +#include "pair_hippo_gpu.h" + +#include "amoeba_convolution_gpu.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "fix_store_peratom.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "math_const.h" +#include "memory.h" +#include "my_page.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" +#include + +using namespace LAMMPS_NS; +using namespace MathConst; + +enum{INDUCE,RSD,SETUP_hippo,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; +enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +// same as in pair_amoeba.cpp +enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye + +// External functions from cuda library for atom decomposition + +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const 
double *host_thole, + const double *host_dirdamp, const int* host_amtype2class, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale); +void hippo_gpu_clear(); + +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr); + +void 
hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, + const double aewald, const double off2); + +void hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *host_pval, const double aewald, const double off2, void **fieldp_ptr); + +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr); + +void hippo_gpu_update_fieldp(void **fieldp_ptr); + +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); + +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tq_ptr); + +double hippo_gpu_bytes(); + +/* 
---------------------------------------------------------------------- */ + +PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + amoeba = false; + mystyle = "hippo"; + + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tq_pinned = nullptr; + + gpu_hal_ready = false; // always false for HIPPO + gpu_repulsion_ready = true; + gpu_dispersion_real_ready = true; + gpu_multipole_real_ready = true; + gpu_udirect2b_ready = true; + gpu_umutual1_ready = true; + gpu_fphi_uind_ready = true; + gpu_umutual2b_ready = true; + gpu_polar_real_ready = true; // need to be true for copying data from device back to host + + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairHippoGPU::~PairHippoGPU() +{ + hippo_gpu_clear(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairHippoGPU::init_style() +{ + PairAmoeba::init_style(); + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int mnf = 5e-2 * neighbor->oneatom; + int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + 
pdamp, thole, dirdamp, amtype2class, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, sizpr, dmppr, elepr, + csix, adisp, pcore, palpha, + atom->nlocal, atom->nlocal+atom->nghost, mnf, + maxspecial, maxspecial15, cell_size, gpu_mode, + screen, polar_dscale, polar_uscale); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + // replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + } +} + +/* ---------------------------------------------------------------------- + repulsion = Pauli repulsion interactions + adapted from Tinker erepel1b() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::repulsion() +{ + if (!gpu_repulsion_ready) { + PairAmoeba::repulsion(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, 
domain->boxlo, domain->prd); + + // select the correct cutoff for the term + + choose(REPULSE); + + hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, cut2, + c0, c1, c2, c3, c4, c5, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse + } else { + auto *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse + } +} + +/* ---------------------------------------------------------------------- + dispersion_real = real-space portion of Ewald dispersion + adapted from Tinker edreal1d() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::dispersion_real() +{ + if (!gpu_dispersion_real_ready) { + PairAmoeba::dispersion_real(); + return; + } + + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff for the term + + if (use_dewald) choose(DISP_LONG); + else choose(DISP); + + hippo_gpu_compute_dispersion_real(amtype, amgroup, rpole, aewald, off2); +} + +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine 
+------------------------------------------------------------------------- */ + +void PairHippoGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + double *pval = atom->dvector[index_pval]; + + hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, pval, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } else { + auto *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } +} + +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to 
call reverse_comm() for crstyle = FIELD; +------------------------------------------------------------------------- */ + +void PairHippoGPU::induce() +{ + bool done; + int i,j,m,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // owned atoms + + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // need reverse_comm if dfield0c (i.e. udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } + + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + if (ic_kspace) + hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
+ // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < 
nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + //error->all(FLERR,"STOP"); + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += 
conjp[i][j]*vecp[i][j]; + sum += rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; 
j++) { + uind[i][j] += term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (comm->me == 0) + error->warning(FLERR,"HIPPO induced dipoles did not converge"); + } + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] = upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b(double **field, double **fieldp) +{ + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + + int inum; + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + double *pval = atom->dvector[index_pval]; + hippo_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); + + // rebuild 
dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). + // udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + if (acc_float) { + auto field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + + } else { + + auto field_ptr = (double *)fieldp_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b_cpu() +{ + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti; + double pgamma; + double damp,expdamp; + double scale3,scale5,scalek; + double bn[4],bcn[3]; + double factor_uscale; + + int inum,jnum; + int 
*ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + + // neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? + + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - 
expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } else { + + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairHippoGPU::ufield0c(double **field, double **fieldp) +{ + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + + // get the real space portion of the mutual field first + + double time0, time1, time2; + MPI_Barrier(world); + time0 = platform::walltime(); + + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = platform::walltime(); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + time2 = platform::walltime(); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } + + // accumulate the field and 
fieldp values from the real-space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + hippo_gpu_update_fieldp(&fieldp_pinned); + int inum = atom->nlocal; + + if (acc_float) { + auto *field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + + } else { + auto *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; +} + +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual1(double **field, double **fieldp) +{ + int m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (int j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + + for (int i = 0; 
i < nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } + + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); + + // map 2 values to grid + + double time0, time1; + MPI_Barrier(world); + time0 = platform::walltime(); + + grid_uind(fuind,fuinp,gridpre); + + time1 = platform::walltime(); + time_grid_uind += (time1 - time0); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); + + // get potential + + MPI_Barrier(world); + time0 = platform::walltime(); + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + time1 = platform::walltime(); + time_fphi_uind += (time1 - 
time0); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + // convert the dipole fields from fractional to Cartesian + + for (int i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } +} + +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairHippoGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + hippo_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); + + int nlocal = atom->nlocal; + 
if (acc_float) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + } +} + +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + 
domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + double *pval = atom->dvector[index_pval]; + hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); +} + +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff and aewald for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + double *pval = atom->dvector[index_pval]; + + hippo_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, pval, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } else { + auto *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } +} + +/* ---------------------------------------------------------------------- + compute atom forces from torques used by various terms 
+------------------------------------------------------------------------- */ + +template +void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],_tq[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairHippoGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + hippo_gpu_bytes(); +} diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h new file mode 100644 index 0000000000..d160446d77 --- /dev/null +++ b/src/GPU/pair_hippo_gpu.h @@ -0,0 +1,73 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively 
Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(hippo/gpu,PairHippoGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_HIPPO_GPU_H +#define LMP_PAIR_HIPPO_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairHippoGPU : public PairAmoeba { + public: + PairHippoGPU(LAMMPS *lmp); + ~PairHippoGPU() override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void induce() override; + + void repulsion() override; + void dispersion_real() override; + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; + + private: + int gpu_mode; + double cpu_time; + void *tq_pinned; + void *fieldp_pinned; + bool acc_float; + + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; + bool gpu_multipole_real_ready; + bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; + + void udirect2b_cpu(); + + template + void compute_force_from_torque(const numtyp*, double**, double*); +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index e7472f4e88..f493b5438a 100755 
--- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -204,6 +204,8 @@ action mliap_model_linear_kokkos.h mliap_model_linear.h action mliap_model_python_kokkos.cpp mliap_model_linear.cpp action mliap_model_python_kokkos.h mliap_model_linear.h action mliap_model_kokkos.h mliap_model.h +action mliap_unified_kokkos.cpp mliap_unified.cpp +action mliap_unified_kokkos.h mliap_unified.h action mliap_so3_kokkos.cpp mliap_so3.cpp action mliap_so3_kokkos.h mliap_so3.h action modify_kokkos.cpp @@ -314,6 +316,8 @@ action pair_lj_spica_kokkos.cpp pair_lj_spica.cpp action pair_lj_spica_kokkos.h pair_lj_spica.h action pair_meam_kokkos.cpp pair_meam.cpp action pair_meam_kokkos.h pair_meam.h +action pair_meam_ms_kokkos.cpp pair_meam_ms.cpp +action pair_meam_ms_kokkos.h pair_meam_ms.h action pair_mliap_kokkos.cpp pair_mliap.cpp action pair_mliap_kokkos.h pair_mliap.h action pair_morse_kokkos.cpp @@ -365,6 +369,7 @@ action verlet_kokkos.h # Install cython pyx file only if non-KOKKOS version is present action mliap_model_python_couple_kokkos.pyx mliap_model_python_couple.pyx +action mliap_unified_couple_kokkos.pyx mliap_unified_couple.pyx # edit 2 Makefile.package files to include/exclude package info @@ -423,15 +428,19 @@ fi if (test $1 = 1) then if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then cythonize -3 ../mliap_model_python_couple_kokkos.pyx + cythonize -3 ../mliap_unified_couple_kokkos.pyx fi elif (test $1 = 0) then rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h + rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h elif (test $1 = 2) then if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then cythonize -3 ../mliap_model_python_couple_kokkos.pyx + cythonize -3 ../mliap_unified_couple_kokkos.pyx else rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h + rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h fi fi diff --git 
a/src/KOKKOS/fix_nvt_kokkos.cpp b/src/KOKKOS/fix_nvt_kokkos.cpp index d98ba5c163..16328c5e3a 100644 --- a/src/KOKKOS/fix_nvt_kokkos.cpp +++ b/src/KOKKOS/fix_nvt_kokkos.cpp @@ -39,7 +39,7 @@ FixNVTKokkos::FixNVTKokkos(LAMMPS *lmp, int narg, char **arg) : // id = fix-ID + temp this->id_temp = utils::strdup(std::string(this->id)+"_temp"); - this->modify->add_compute(fmt::format("{} all temp/kk",this->id_temp)); + this->modify->add_compute(fmt::format("{} {} temp/kk",this->id_temp,this->group->names[this->igroup])); this->tcomputeflag = 1; } diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp index 12b1e8f322..69ffdcd684 100644 --- a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp +++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp @@ -67,7 +67,7 @@ FixNVTSllodKokkos::FixNVTSllodKokkos(LAMMPS *lmp, int narg, char **a } this->id_temp = utils::strdup(std::string(this->id)+"_temp"); - this->modify->add_compute(fmt::format("{} all temp/deform/kk",this->id_temp)); + this->modify->add_compute(fmt::format("{} {} temp/deform/kk",this->id_temp,this->group->names[this->igroup])); this->tcomputeflag = 1; this->nondeformbias = 0; } diff --git a/src/KOKKOS/fix_setforce_kokkos.cpp b/src/KOKKOS/fix_setforce_kokkos.cpp index 4b1c31bec0..9f193bc6e4 100644 --- a/src/KOKKOS/fix_setforce_kokkos.cpp +++ b/src/KOKKOS/fix_setforce_kokkos.cpp @@ -77,9 +77,8 @@ void FixSetForceKokkos::init() template void FixSetForceKokkos::post_force(int /*vflag*/) { - atomKK->sync(execution_space, X_MASK | F_MASK | MASK_MASK); + atomKK->sync(execution_space, F_MASK | MASK_MASK); - x = atomKK->k_x.view(); f = atomKK->k_f.view(); mask = atomKK->k_mask.view(); @@ -88,6 +87,8 @@ void FixSetForceKokkos::post_force(int /*vflag*/) // update region if necessary if (region) { + if (!utils::strmatch(region->style, "^block")) + error->all(FLERR,"Cannot (yet) use {}-style region with fix setforce/kk",region->style); region->prematch(); DAT::tdual_int_1d k_match = 
DAT::tdual_int_1d("setforce:k_match",nlocal); KokkosBase* regionKKBase = dynamic_cast(region); diff --git a/src/KOKKOS/meam_dens_final_kokkos.h b/src/KOKKOS/meam_dens_final_kokkos.h index bcc7b558dc..5e7ffdec20 100644 --- a/src/KOKKOS/meam_dens_final_kokkos.h +++ b/src/KOKKOS/meam_dens_final_kokkos.h @@ -61,34 +61,61 @@ void MEAMKokkos::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT if (elti >= 0) { scaleii = d_scale(type[i],type[i]); d_rho1[i] = 0.0; - d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i]; + if (msmeamflag) { + d_rho2[i] = -1.0 / 3.0 * (d_arho2b[i] * d_arho2b[i] + - d_arho2mb[i] * d_arho2mb[i]); + } else{ + d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i]; + } d_rho3[i] = 0.0; for (int m = 0; m < 3; m++) { - d_rho1[i] += d_arho1(i,m) * d_arho1(i,m); - d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m); - } - for (int m = 0; m < 6; m++) - d_rho2[i] += v2D[m] * d_arho2(i,m) * d_arho2(i,m); - for (int m = 0; m < 10; m++) - d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m); - - if (d_rho0[i] > 0.0) { - if (ialloy == 1) { - d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0)); - d_t_ave(i,1) = fdiv_zero_kk(d_t_ave(i,1), d_tsq_ave(i,1)); - d_t_ave(i,2) = fdiv_zero_kk(d_t_ave(i,2), d_tsq_ave(i,2)); - } else if (ialloy == 2) { - d_t_ave(i,0) = t1_meam[elti]; - d_t_ave(i,1) = t2_meam[elti]; - d_t_ave(i,2) = t3_meam[elti]; - } else { - d_t_ave(i,0) /= d_rho0[i]; - d_t_ave(i,1) /= d_rho0[i]; - d_t_ave(i,2) /= d_rho0[i]; + if (msmeamflag) { + d_rho1[i] = d_rho1[i] + d_arho1(i, m) * d_arho1(i, m) + - d_arho1m(i, m) * d_arho1m(i, m); + d_rho3[i] = d_rho3[i] - 3.0 / 5.0 * (d_arho3b(i, m) * d_arho3b(i, m) + - d_arho3mb(i, m) * d_arho3mb(i, m)); + } else{ + d_rho1[i] += d_arho1(i,m) * d_arho1(i,m); + d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m); } } + for (int m = 0; m < 6; m++){ + if (msmeamflag) { + d_rho2[i] = d_rho2[i] + v2D[m] * (d_arho2(i, m) * d_arho2(i, m) + - d_arho2m(i, m) * d_arho2m(i, m)); + } else{ + d_rho2[i] += v2D[m] * 
d_arho2(i,m) * d_arho2(i,m); + } + } + for (int m = 0; m < 10; m++) + if (msmeamflag) { + d_rho3[i] = d_rho3[i] + v3D[m] * (d_arho3(i, m) * d_arho3(i, m) + - d_arho3m(i, m) * d_arho3m(i, m)); + } else{ + d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m); + } - d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i]; + if (msmeamflag) { + // with msmeam all t weights are already accounted for in rho + d_gamma[i] = d_rho1[i] + d_rho2[i] + d_rho3[i]; + } else{ + if (d_rho0[i] > 0.0) { + if (ialloy == 1) { + d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0)); + d_t_ave(i,1) = fdiv_zero_kk(d_t_ave(i,1), d_tsq_ave(i,1)); + d_t_ave(i,2) = fdiv_zero_kk(d_t_ave(i,2), d_tsq_ave(i,2)); + } else if (ialloy == 2) { + d_t_ave(i,0) = t1_meam[elti]; + d_t_ave(i,1) = t2_meam[elti]; + d_t_ave(i,2) = t3_meam[elti]; + } else { + d_t_ave(i,0) /= d_rho0[i]; + d_t_ave(i,1) /= d_rho0[i]; + d_t_ave(i,2) /= d_rho0[i]; + } + } + d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i]; + } if (d_rho0[i] > 0.0) d_gamma[i] /= (d_rho0[i] * d_rho0[i]); diff --git a/src/KOKKOS/meam_dens_init_kokkos.h b/src/KOKKOS/meam_dens_init_kokkos.h index 31ac046dcf..60bb6553d8 100644 --- a/src/KOKKOS/meam_dens_init_kokkos.h +++ b/src/KOKKOS/meam_dens_init_kokkos.h @@ -43,11 +43,23 @@ void MEAMKokkos::operator()(TagMEAMZero, const int &i) const { d_rho0[i] = 0.0; d_arho2b[i] = 0.0; d_arho1(i,0) = d_arho1(i,1) = d_arho1(i,2) = 0.0; - for (int j = 0; j < 6; j++) + if (msmeamflag) { + d_arho2mb[i] = 0.0; + d_arho1m(i,0) = d_arho1m(i,1) = d_arho1m(i,2) = 0.0; + } + for (int j = 0; j < 6; j++) { d_arho2(i,j) = 0.0; - for (int j = 0; j < 10; j++) + if (msmeamflag) + d_arho2m(i,j) = 0.0; + } + for (int j = 0; j < 10; j++) { d_arho3(i,j) = 0.0; + if (msmeamflag) + d_arho3m(i,j) = 0.0; + } d_arho3b(i,0) = d_arho3b(i,1) = d_arho3b(i,2) = 0.0; + if (msmeamflag) + d_arho3mb(i,0) = d_arho3mb(i,1) = d_arho3mb(i,2) = 0.0; d_t_ave(i,0) = 
d_t_ave(i,1) = d_t_ave(i,2) = 0.0; d_tsq_ave(i,0) = d_tsq_ave(i,1) = d_tsq_ave(i,2) = 0.0; } @@ -80,13 +92,20 @@ MEAMKokkos::meam_dens_setup(int atom_nmax, int nall, int n_neigh) memoryKK->destroy_kokkos(k_arho3b,arho3b); memoryKK->destroy_kokkos(k_t_ave,t_ave); memoryKK->destroy_kokkos(k_tsq_ave,tsq_ave); + // msmeam + memoryKK->destroy_kokkos(k_arho2mb, arho2mb); + memoryKK->destroy_kokkos(k_arho1m, arho1m); + memoryKK->destroy_kokkos(k_arho2m, arho2m); + memoryKK->destroy_kokkos(k_arho3m, arho3m); + memoryKK->destroy_kokkos(k_arho3mb, arho3mb); nmax = atom_nmax; -// memory->create(rho, nmax, "pair:rho"); + + //memory->create(rho, nmax, "pair:rho"); k_rho = DAT::tdual_ffloat_1d("pair:rho",nmax); d_rho = k_rho.template view(); h_rho = k_rho.h_view; - // memory->create(rho0, nmax, "pair:rho0"); + //memory->create(rho0, nmax, "pair:rho0"); k_rho0 = DAT::tdual_ffloat_1d("pair:rho0",nmax); d_rho0 = k_rho0.template view(); h_rho0 = k_rho0.h_view; @@ -150,6 +169,28 @@ MEAMKokkos::meam_dens_setup(int atom_nmax, int nall, int n_neigh) k_tsq_ave = DAT::tdual_ffloat_2d("pair:tsq_ave",nmax, 3); d_tsq_ave = k_tsq_ave.template view(); h_tsq_ave = k_tsq_ave.h_view; + + // msmeam + //memory->create(arho2mb, nmax, "pair:arho2mb"); + k_arho2mb = DAT::tdual_ffloat_1d("pair:arho2mb",nmax); + d_arho2mb = k_arho2mb.template view(); + h_arho2mb = k_arho2mb.h_view; + //memory->create(arho1m, nmax, 3, "pair:arho1m"); + k_arho1m = DAT::tdual_ffloat_2d("pair:arho1m", nmax, 3); + d_arho1m = k_arho1m.template view(); + h_arho1m = k_arho1m.h_view; + //memory->create(arho2m, nmax, 6, "pair:arho2m"); + k_arho2m = DAT::tdual_ffloat_2d("pair:arho2m", nmax, 6); + d_arho2m = k_arho2m.template view(); + h_arho2m = k_arho2m.h_view; + //memory->create(arho3m, nmax, 10, "pair:arho3m"); + k_arho3m = DAT::tdual_ffloat_2d("pair:arho3m", nmax, 10); + d_arho3m = k_arho3m.template view(); + h_arho3m = k_arho3m.h_view; + //memory->create(arho3mb, nmax, 3, "pair:arho3mb"); + k_arho3mb = 
DAT::tdual_ffloat_2d("pair:arho3mb", nmax, 3); + d_arho3mb = k_arho3mb.template view(); + h_arho3mb = k_arho3mb.h_view; } if (n_neigh > maxneigh) { @@ -206,6 +247,12 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ dup_arho3b = Kokkos::Experimental::create_scatter_view(d_arho3b); dup_t_ave = Kokkos::Experimental::create_scatter_view(d_t_ave); dup_tsq_ave = Kokkos::Experimental::create_scatter_view(d_tsq_ave); + // msmeam + dup_arho2mb = Kokkos::Experimental::create_scatter_view(d_arho2mb); + dup_arho1m = Kokkos::Experimental::create_scatter_view(d_arho1m); + dup_arho2m = Kokkos::Experimental::create_scatter_view(d_arho2m); + dup_arho3m = Kokkos::Experimental::create_scatter_view(d_arho3m); + dup_arho3mb = Kokkos::Experimental::create_scatter_view(d_arho3mb); } else { ndup_rho0 = Kokkos::Experimental::create_scatter_view(d_rho0); ndup_arho2b = Kokkos::Experimental::create_scatter_view(d_arho2b); @@ -215,6 +262,12 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ ndup_arho3b = Kokkos::Experimental::create_scatter_view(d_arho3b); ndup_t_ave = Kokkos::Experimental::create_scatter_view(d_t_ave); ndup_tsq_ave = Kokkos::Experimental::create_scatter_view(d_tsq_ave); + // msmeam + ndup_arho2mb = Kokkos::Experimental::create_scatter_view(d_arho2mb); + ndup_arho1m = Kokkos::Experimental::create_scatter_view(d_arho1m); + ndup_arho2m = Kokkos::Experimental::create_scatter_view(d_arho2m); + ndup_arho3m = Kokkos::Experimental::create_scatter_view(d_arho3m); + ndup_arho3mb = Kokkos::Experimental::create_scatter_view(d_arho3mb); } copymode = 1; @@ -233,6 +286,12 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ Kokkos::Experimental::contribute(d_arho3b, dup_arho3b); Kokkos::Experimental::contribute(d_t_ave, dup_t_ave); Kokkos::Experimental::contribute(d_tsq_ave, dup_tsq_ave); + // msmeam + Kokkos::Experimental::contribute(d_arho2mb, dup_arho2mb); + Kokkos::Experimental::contribute(d_arho1m, dup_arho1m); + 
Kokkos::Experimental::contribute(d_arho2m, dup_arho2m); + Kokkos::Experimental::contribute(d_arho3m, dup_arho3m); + Kokkos::Experimental::contribute(d_arho3mb, dup_arho3mb); // free duplicated memory dup_rho0 = decltype(dup_rho0)(); @@ -243,6 +302,12 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ dup_arho3b = decltype(dup_arho3b)(); dup_t_ave = decltype(dup_t_ave)(); dup_tsq_ave = decltype(dup_tsq_ave)(); + // msmeam + dup_arho2mb = decltype(dup_arho2mb)(); + dup_arho1m = decltype(dup_arho1m)(); + dup_arho2m = decltype(dup_arho2m)(); + dup_arho3m = decltype(dup_arho3m)(); + dup_arho3mb = decltype(dup_arho3mb)(); } } @@ -417,7 +482,6 @@ MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d ty int offset) const { // The rho0, etc. arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial - auto v_rho0 = ScatterViewHelper,decltype(dup_rho0),decltype(ndup_rho0)>::get(dup_rho0,ndup_rho0); auto a_rho0 = v_rho0.template access>(); auto v_arho2b = ScatterViewHelper,decltype(dup_arho2b),decltype(ndup_arho2b)>::get(dup_arho2b,ndup_arho2b); @@ -434,6 +498,17 @@ MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d ty auto a_t_ave = v_t_ave.template access>(); auto v_tsq_ave = ScatterViewHelper,decltype(dup_tsq_ave),decltype(ndup_tsq_ave)>::get(dup_tsq_ave,ndup_tsq_ave); auto a_tsq_ave = v_tsq_ave.template access>(); + // msmeam + auto v_arho2mb = ScatterViewHelper,decltype(dup_arho2mb),decltype(ndup_arho2mb)>::get(dup_arho2mb,ndup_arho2mb); + auto a_arho2mb = v_arho2mb.template access>(); + auto v_arho1m = ScatterViewHelper,decltype(dup_arho1m),decltype(ndup_arho1m)>::get(dup_arho1m,ndup_arho1m); + auto a_arho1m = v_arho1m.template access>(); + auto v_arho2m = ScatterViewHelper,decltype(dup_arho2m),decltype(ndup_arho2m)>::get(dup_arho2m,ndup_arho2m); + auto a_arho2m = v_arho2m.template access>(); + auto v_arho3m = ScatterViewHelper,decltype(dup_arho3m),decltype(ndup_arho3m)>::get(dup_arho3m,ndup_arho3m); + auto 
a_arho3m = v_arho3m.template access>(); + auto v_arho3mb = ScatterViewHelper,decltype(dup_arho3mb),decltype(ndup_arho3mb)>::get(dup_arho3mb,ndup_arho3mb); + auto a_arho3mb = v_arho3mb.template access>(); const int elti = d_map[type[i]]; const double xtmp = x(i,0); @@ -463,6 +538,16 @@ MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d ty double rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-beta1_meam[elti] * ai) * sij; double rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-beta2_meam[elti] * ai) * sij; double rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-beta3_meam[elti] * ai) * sij; + // msmeam + double rhoa1mj, rhoa2mj, rhoa3mj, rhoa1mi, rhoa2mi, rhoa3mi; + if (msmeamflag) { + rhoa1mj = ro0j * t1m_meam[eltj] * MathSpecialKokkos::fm_exp(-beta1m_meam[eltj] * aj) * sij; + rhoa2mj = ro0j * t2m_meam[eltj] * MathSpecialKokkos::fm_exp(-beta2m_meam[eltj] * aj) * sij; + rhoa3mj = ro0j * t3m_meam[eltj] * MathSpecialKokkos::fm_exp(-beta3m_meam[eltj] * aj) * sij; + rhoa1mi = ro0i * t1m_meam[elti] * MathSpecialKokkos::fm_exp(-beta1m_meam[elti] * ai) * sij; + rhoa2mi = ro0i * t2m_meam[elti] * MathSpecialKokkos::fm_exp(-beta2m_meam[elti] * ai) * sij; + rhoa3mi = ro0i * t3m_meam[elti] * MathSpecialKokkos::fm_exp(-beta3m_meam[elti] * ai) * sij; + } if (ialloy == 1) { rhoa1j *= t1_meam[eltj]; rhoa2j *= t2_meam[eltj]; @@ -499,20 +584,45 @@ MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d ty const double A1i = rhoa1i / rij; const double A2i = rhoa2i / rij2; const double A3i = rhoa3i / (rij2 * rij); + double A1mj, A2mj, A3mj, A1mi, A2mi, A3mi; + if (msmeamflag) { + a_arho2mb[i] += rhoa2mj; + a_arho2mb[j] += rhoa2mi; + A1mj = rhoa1mj / rij; + A2mj = rhoa2mj / rij2; + A3mj = rhoa3mj / (rij2 * rij); + A1mi = rhoa1mi / rij; + A2mi = rhoa2mi / rij2; + A3mi = rhoa3mi / (rij2 * rij); + } int nv2 = 0; int nv3 = 0; for (int m = 0; m < 3; m++) { - a_arho1(i,m) += A1j * delij[m]; + a_arho1(i,m) += A1j * delij[m]; a_arho1(j,m) += -A1i * delij[m]; - a_arho3b(i,m) += rhoa3j * 
delij[m] / rij; + a_arho3b(i,m) += rhoa3j * delij[m] / rij; a_arho3b(j,m) += -rhoa3i * delij[m] / rij; + if (msmeamflag) { + a_arho1m(i,m) += A1mj * delij[m]; + a_arho1m(j,m) += -A1mi * delij[m]; + a_arho3mb(i,m) += rhoa3mj * delij[m] / rij; + a_arho3mb(j,m) += -rhoa3mi * delij[m] / rij; + } for (int n = m; n < 3; n++) { a_arho2(i,nv2) += A2j * delij[m] * delij[n]; a_arho2(j,nv2) += A2i * delij[m] * delij[n]; + if (msmeamflag) { + a_arho2m(i,nv2) += A2mj * delij[m] * delij[n]; + a_arho2m(j,nv2) += A2mi * delij[m] * delij[n]; + } nv2++; for (int p = n; p < 3; p++) { - a_arho3(i,nv3) += A3j * delij[m] * delij[n] * delij[p]; + a_arho3(i,nv3) += A3j * delij[m] * delij[n] * delij[p]; a_arho3(j,nv3) += -A3i * delij[m] * delij[n] * delij[p]; + if (msmeamflag) { + a_arho3m(i,nv3) += A3mj * delij[m] * delij[n] * delij[p]; + a_arho3m(j,nv3) += -A3mi * delij[m] * delij[n] * delij[p]; + } nv3++; } } diff --git a/src/KOKKOS/meam_force_kokkos.h b/src/KOKKOS/meam_force_kokkos.h index e7e6c64231..5c4244e99b 100644 --- a/src/KOKKOS/meam_force_kokkos.h +++ b/src/KOKKOS/meam_force_kokkos.h @@ -119,6 +119,17 @@ KOKKOS_INLINE_FUNCTION void MEAMKokkos::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForcev2D[nv2]; + drho3mdrm1[m] = drho3mdrm1[m] + d_arho3m(i, vind3D[m][n][p]) * arg; + drho3mdrm2[m] = drho3mdrm2[m] + d_arho3m(j, vind3D[m][n][p]) * arg; + nv2 = nv2 + 1; + } + } + drho3mdrm1[m] = (a3 * drho3mdrm1[m] - a3a * d_arho3mb(i, m)) * rhoa3mj; + drho3mdrm2[m] = (-a3 * drho3mdrm2[m] + a3a * d_arho3mb(j, m)) * rhoa3mi; + } + } else { + for (m = 0; m < 3; m++) { + drho1mdrm1[m] = 0.0; + drho1mdrm2[m] = 0.0; + drho2mdrm1[m] = 0.0; + drho2mdrm2[m] = 0.0; + drho3mdrm1[m] = 0.0; + drho3mdrm2[m] = 0.0; + } + } + // Compute derivatives of weighting functions t wrt rij - t1i = d_t_ave(i, 0); - t2i = d_t_ave(i, 1); - t3i = d_t_ave(i, 2); - t1j = d_t_ave(j, 0); - t2j = 
d_t_ave(j, 1); - t3j = d_t_ave(j, 2); - - if (ialloy == 1) { - - a1i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 0)); - a1j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 0)); - a2i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 1)); - a2j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 1)); - a3i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 2)); - a3j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 2)); - - dt1dr1 = a1i * (t1mj - t1i * MathSpecialKokkos::square(t1mj)); - dt1dr2 = a1j * (t1mi - t1j * MathSpecialKokkos::square(t1mi)); - dt2dr1 = a2i * (t2mj - t2i * MathSpecialKokkos::square(t2mj)); - dt2dr2 = a2j * (t2mi - t2j * MathSpecialKokkos::square(t2mi)); - dt3dr1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj)); - dt3dr2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi)); - - } else if (ialloy == 2) { + // Weighting functions t set to unity for msmeam + if (msmeamflag) { + t1i = 1.0; + t2i = 1.0; + t3i = 1.0; + t1j = 1.0; + t2j = 1.0; + t3j = 1.0; dt1dr1 = 0.0; dt1dr2 = 0.0; dt2dr1 = 0.0; dt2dr2 = 0.0; dt3dr1 = 0.0; dt3dr2 = 0.0; - } else { - ai = 0.0; - if (!iszero_kk(d_rho0[i])) ai = drhoa0j * sij / d_rho0[i]; - aj = 0.0; - if (!iszero_kk(d_rho0[j])) aj = drhoa0i * sij / d_rho0[j]; + t1i = d_t_ave(i, 0); + t2i = d_t_ave(i, 1); + t3i = d_t_ave(i, 2); + t1j = d_t_ave(j, 0); + t2j = d_t_ave(j, 1); + t3j = d_t_ave(j, 2); - dt1dr1 = ai * (t1mj - t1i); - dt1dr2 = aj * (t1mi - t1j); - dt2dr1 = ai * (t2mj - t2i); - dt2dr2 = aj * (t2mi - t2j); - dt3dr1 = ai * (t3mj - t3i); - dt3dr2 = aj * (t3mi - t3j); + if (ialloy == 1) { + + a1i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 0)); + a1j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 0)); + a2i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 1)); + a2j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 1)); + a3i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 2)); + a3j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 2)); + + dt1dr1 = a1i * (t1mj - t1i * MathSpecialKokkos::square(t1mj)); + dt1dr2 = a1j * (t1mi - t1j * MathSpecialKokkos::square(t1mi)); + 
dt2dr1 = a2i * (t2mj - t2i * MathSpecialKokkos::square(t2mj)); + dt2dr2 = a2j * (t2mi - t2j * MathSpecialKokkos::square(t2mi)); + dt3dr1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj)); + dt3dr2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi)); + + } else if (ialloy == 2) { + + dt1dr1 = 0.0; + dt1dr2 = 0.0; + dt2dr1 = 0.0; + dt2dr2 = 0.0; + dt3dr1 = 0.0; + dt3dr2 = 0.0; + + } else { + + ai = 0.0; + if (!iszero_kk(d_rho0[i])) ai = drhoa0j * sij / d_rho0[i]; + aj = 0.0; + if (!iszero_kk(d_rho0[j])) aj = drhoa0i * sij / d_rho0[j]; + + dt1dr1 = ai * (t1mj - t1i); + dt1dr2 = aj * (t1mi - t1j); + dt2dr1 = ai * (t2mj - t2i); + dt2dr2 = aj * (t2mi - t2j); + dt3dr1 = ai * (t3mj - t3i); + dt3dr2 = aj * (t3mi - t3j); + } } // Compute derivatives of total density wrt rij, sij and rij(3) get_shpfcn(lattce_meam[elti][elti], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpi); get_shpfcn(lattce_meam[eltj][eltj], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpj); - drhodr1 = d_dgamma1[i] * drho0dr1 + - d_dgamma2[i] * - (dt1dr1 * d_rho1[i] + t1i * drho1dr1 + dt2dr1 * d_rho2[i] + t2i * drho2dr1 + - dt3dr1 * d_rho3[i] + t3i * drho3dr1) - - d_dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); - drhodr2 = d_dgamma1[j] * drho0dr2 + - d_dgamma2[j] * - (dt1dr2 * d_rho1[j] + t1j * drho1dr2 + dt2dr2 * d_rho2[j] + t2j * drho2dr2 + - dt3dr2 * d_rho3[j] + t3j * drho3dr2) - - d_dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); - for (m = 0; m < 3; m++) { - drhodrm1[m] = 0.0; - drhodrm2[m] = 0.0; - drhodrm1[m] = - d_dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); - drhodrm2[m] = - d_dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + + if (msmeamflag) { + drhodr1 = d_dgamma1[i] * drho0dr1 + + d_dgamma2[i] * (dt1dr1 * d_rho1[i] + t1i * (drho1dr1 - drho1mdr1) + + dt2dr1 * d_rho2[i] + t2i * (drho2dr1 - drho2mdr1) + + dt3dr1 * d_rho3[i] + t3i * (drho3dr1 - drho3mdr1)) - + 
d_dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); + drhodr2 = d_dgamma1[j] * drho0dr2 + + d_dgamma2[j] * (dt1dr2 * d_rho1[j] + t1j * (drho1dr2 - drho1mdr2) + + dt2dr2 * d_rho2[j] + t2j * (drho2dr2 - drho2mdr2) + + dt3dr2 * d_rho3[j] + t3j * (drho3dr2 - drho3mdr2)) - + d_dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); + for (m = 0; m < 3; m++) { + drhodrm1[m] = 0.0; + drhodrm2[m] = 0.0; + drhodrm1[m] = d_dgamma2[i] * (t1i * (drho1drm1[m] - drho1mdrm1[m]) + + t2i * (drho2drm1[m] - drho2mdrm1[m]) + + t3i * (drho3drm1[m] - drho3mdrm1[m]) ); + drhodrm2[m] = d_dgamma2[j] * (t1j * (drho1drm2[m] - drho1mdrm2[m]) + + t2j * (drho2drm2[m] - drho2mdrm2[m]) + + t3j * (drho3drm2[m] - drho3mdrm2[m]) ); + } + } else { + drhodr1 = d_dgamma1[i] * drho0dr1 + + d_dgamma2[i] * + (dt1dr1 * d_rho1[i] + t1i * drho1dr1 + dt2dr1 * d_rho2[i] + t2i * drho2dr1 + + dt3dr1 * d_rho3[i] + t3i * drho3dr1) - + d_dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); + drhodr2 = d_dgamma1[j] * drho0dr2 + + d_dgamma2[j] * + (dt1dr2 * d_rho1[j] + t1j * drho1dr2 + dt2dr2 * d_rho2[j] + t2j * drho2dr2 + + dt3dr2 * d_rho3[j] + t3j * drho3dr2) - + d_dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); + for (m = 0; m < 3; m++) { + drhodrm1[m] = 0.0; + drhodrm2[m] = 0.0; + drhodrm1[m] = + d_dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); + drhodrm2[m] = + d_dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + } } // Compute derivatives wrt sij, but only if necessary @@ -416,6 +594,24 @@ KOKKOS_INLINE_FUNCTION void MEAMKokkos::operator()(TagMEAMForce::operator()(TagMEAMForce::~MEAMKokkos() memoryKK->destroy_kokkos(k_scrfcn,scrfcn); memoryKK->destroy_kokkos(k_dscrfcn,dscrfcn); memoryKK->destroy_kokkos(k_fcpair,fcpair); + + // msmeam + + memoryKK->destroy_kokkos(k_arho2mb, arho2mb); + memoryKK->destroy_kokkos(k_arho1m, arho1m); + memoryKK->destroy_kokkos(k_arho2m, arho2m); + 
memoryKK->destroy_kokkos(k_arho3m, arho3m); + memoryKK->destroy_kokkos(k_arho3mb, arho3mb); } #include "meam_setup_done_kokkos.h" diff --git a/src/KOKKOS/meam_kokkos.h b/src/KOKKOS/meam_kokkos.h index cc75023810..2203355641 100644 --- a/src/KOKKOS/meam_kokkos.h +++ b/src/KOKKOS/meam_kokkos.h @@ -136,6 +136,13 @@ template class MEAMKokkos : public MEAM { DAT::tdual_ffloat_1d k_scrfcn, k_dscrfcn, k_fcpair; typename ArrayTypes::t_ffloat_1d d_scrfcn, d_dscrfcn, d_fcpair; HAT::t_ffloat_1d h_scrfcn, h_dscrfcn, h_fcpair; + // msmeam + DAT::tdual_ffloat_2d k_arho1m, k_arho2m, k_arho3m, k_arho3mb; + typename ArrayTypes::t_ffloat_2d d_arho1m, d_arho2m, d_arho3m, d_arho3mb; + HAT::t_ffloat_2d h_arho1m, h_arho2m, h_arho3m, h_arho3mb; + DAT::tdual_ffloat_1d k_arho2mb; + typename ArrayTypes::t_ffloat_1d d_arho2mb; + HAT::t_ffloat_1d h_arho2mb; protected: int need_dup; @@ -195,6 +202,31 @@ template class MEAMKokkos : public MEAM { dup_vatom; NonDupScatterView ndup_vatom; + + // msmeam + + DupScatterView + dup_arho1m; + NonDupScatterView + ndup_arho1m; + DupScatterView + dup_arho2m; + NonDupScatterView + ndup_arho2m; + DupScatterView + dup_arho3m; + NonDupScatterView + ndup_arho3m; + DupScatterView + dup_arho2mb; + NonDupScatterView + ndup_arho2mb; + DupScatterView + dup_arho3mb; + NonDupScatterView + ndup_arho3mb; }; KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/min_kokkos.cpp b/src/KOKKOS/min_kokkos.cpp index 4e9c9db4e2..4e1c3967ff 100644 --- a/src/KOKKOS/min_kokkos.cpp +++ b/src/KOKKOS/min_kokkos.cpp @@ -513,6 +513,7 @@ double MinKokkos::energy_force(int resetflag) if (modify->n_min_post_force) { timer->stamp(); modify->min_post_force(vflag); + atomKK->sync(Device,F_MASK); timer->stamp(Timer::MODIFY); } diff --git a/src/KOKKOS/mliap_data_kokkos.cpp b/src/KOKKOS/mliap_data_kokkos.cpp index f9453301c7..993272771d 100644 --- a/src/KOKKOS/mliap_data_kokkos.cpp +++ b/src/KOKKOS/mliap_data_kokkos.cpp @@ -56,7 +56,9 @@ MLIAPDataKokkos::~MLIAPDataKokkos() { 
memoryKK->destroy_kokkos(k_ielems,ielems); memoryKK->destroy_kokkos(k_numneighs,numneighs); memoryKK->destroy_kokkos(k_jatoms,jatoms); + memoryKK->destroy_kokkos(k_pair_i,pair_i); memoryKK->destroy_kokkos(k_jelems,jelems); + memoryKK->destroy_kokkos(k_elems,elems); memoryKK->destroy_kokkos(k_ij); memoryKK->destroy_kokkos(k_rij,rij); memoryKK->destroy_kokkos(k_graddesc,graddesc); @@ -75,13 +77,17 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i nmax = atom->nmax; memoryKK->destroy_kokkos(k_gradforce,gradforce); memoryKK->create_kokkos(k_gradforce, gradforce, nmax, size_gradforce, "mliap_data:gradforce"); - } + memoryKK->destroy_kokkos(k_elems,elems); + memoryKK->create_kokkos(k_elems, elems, nmax, "mliap_data:elems"); } // clear gradforce array + int nall = atom->nlocal + atom->nghost; + ntotal = nall; auto d_gradforce = k_gradforce.template view(); Kokkos::deep_copy(d_gradforce, 0.); - + auto d_elems = k_elems.template view(); + Kokkos::deep_copy(d_elems, 0.); // grow arrays if necessary nlistatoms = list->inum; @@ -122,6 +128,7 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i auto d_ij = k_ij.template view(); auto d_numneighs = k_numneighs.template view(); auto d_jatoms = k_jatoms.template view(); + auto d_pair_i= k_pair_i.template view(); auto d_jelems= k_jelems.template view(); auto d_rij= k_rij.template view(); @@ -162,6 +169,7 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i const int jelem = map(jtype); if (rsq < d_cutsq(itype,jtype)) { d_jatoms(ij) = j; + d_pair_i(ij) = i; d_jelems(ij) = jelem; d_rij(ij, 0) = delx; d_rij(ij, 1) = dely; @@ -172,8 +180,11 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i d_iatoms[ii] = i; d_ielems[ii] = ielem; }); - - modified(execution_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | JATOMS_MASK | JELEMS_MASK | RIJ_MASK | IJ_MASK ); + Kokkos::parallel_for(nmax, KOKKOS_LAMBDA (int i) { + const int itype = type(i); + d_elems(i) = 
map(itype); + }); + modified(execution_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | ELEMS_MASK | JATOMS_MASK | PAIR_I_MASK | JELEMS_MASK | RIJ_MASK | IJ_MASK ); eflag = eflag_in; vflag = vflag_in; } @@ -183,7 +194,8 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i template void MLIAPDataKokkos::grow_neigharrays() { AtomKokkos *atomKK = (AtomKokkos *) atom; - + f = atom->f; + f_device = atomKK->k_f.view().data(); // grow neighbor arrays if necessary if (natomneigh_max < nlistatoms) { @@ -207,6 +219,7 @@ void MLIAPDataKokkos::grow_neigharrays() { auto x = atomKK->k_x.view(); auto type = atomKK->k_type.view(); auto d_cutsq=k_pairmliap->k_cutsq.template view(); + auto h_cutsq=k_pairmliap->k_cutsq.template view(); auto d_numneighs = k_numneighs.template view(); Kokkos::parallel_reduce(nlistatoms, KOKKOS_LAMBDA (int ii, int &contrib) { const int i = d_ilist[ii]; @@ -229,22 +242,24 @@ void MLIAPDataKokkos::grow_neigharrays() { } d_numneighs(ii) = count; contrib += count; - }, nij_total); + }, npairs); modified(execution_space, NUMNEIGHS_MASK); - if (nneigh_max < nij_total) { + if (nneigh_max < npairs) { memoryKK->destroy_kokkos(k_jatoms,jatoms); - memoryKK->create_kokkos(k_jatoms, jatoms, nij_total, "mliap_data:jatoms"); + memoryKK->create_kokkos(k_jatoms, jatoms, npairs, "mliap_data:jatoms"); + memoryKK->destroy_kokkos(k_pair_i,pair_i); + memoryKK->create_kokkos(k_pair_i, pair_i, npairs, "mliap_data:pair_i"); memoryKK->destroy_kokkos(k_jelems,jelems); - memoryKK->create_kokkos(k_jelems, jelems, nij_total, "mliap_data:jelems"); + memoryKK->create_kokkos(k_jelems, jelems, npairs, "mliap_data:jelems"); memoryKK->destroy_kokkos(k_rij,rij); - memoryKK->create_kokkos(k_rij, rij, nij_total, 3, "mliap_data:rij"); + memoryKK->create_kokkos(k_rij, rij, npairs, 3, "mliap_data:rij"); if (gradgradflag == 0){ memoryKK->destroy_kokkos(k_graddesc,graddesc); - memoryKK->create_kokkos(k_graddesc, graddesc, nij_total, ndescriptors,3, "mliap_data:graddesc"); + 
memoryKK->create_kokkos(k_graddesc, graddesc, npairs, ndescriptors,3, "mliap_data:graddesc"); } - nneigh_max = nij_total; + nneigh_max = npairs; } } @@ -256,7 +271,9 @@ void MLIAPDataKokkos::modified(ExecutionSpace space, unsigned int ma if (mask & IATOMS_MASK ) k_iatoms .modify(); if (mask & IELEMS_MASK ) k_ielems .modify(); if (mask & JATOMS_MASK ) k_jatoms .modify(); + if (mask & PAIR_I_MASK ) k_pair_i .modify(); if (mask & JELEMS_MASK ) k_jelems .modify(); + if (mask & ELEMS_MASK ) k_elems .modify(); if (mask & IJ_MASK ) k_ij .modify(); if (mask & BETAS_MASK ) k_betas .modify(); if (mask & DESCRIPTORS_MASK ) k_descriptors .modify(); @@ -274,7 +291,9 @@ void MLIAPDataKokkos::modified(ExecutionSpace space, unsigned int ma if (mask & IATOMS_MASK ) k_iatoms .modify(); if (mask & IELEMS_MASK ) k_ielems .modify(); if (mask & JATOMS_MASK ) k_jatoms .modify(); + if (mask & PAIR_I_MASK ) k_pair_i .modify(); if (mask & JELEMS_MASK ) k_jelems .modify(); + if (mask & ELEMS_MASK ) k_elems .modify(); if (mask & IJ_MASK ) k_ij .modify(); if (mask & BETAS_MASK ) k_betas .modify(); if (mask & DESCRIPTORS_MASK ) k_descriptors .modify(); @@ -300,7 +319,9 @@ void MLIAPDataKokkos::sync(ExecutionSpace space, unsigned int mask, if (mask & IATOMS_MASK ) k_iatoms .sync(); if (mask & IELEMS_MASK ) k_ielems .sync(); if (mask & JATOMS_MASK ) k_jatoms .sync(); + if (mask & PAIR_I_MASK ) k_pair_i .sync(); if (mask & JELEMS_MASK ) k_jelems .sync(); + if (mask & ELEMS_MASK ) k_elems .sync(); if (mask & IJ_MASK ) k_ij .sync(); if (mask & BETAS_MASK ) k_betas .sync(); if (mask & DESCRIPTORS_MASK ) k_descriptors .sync(); @@ -317,7 +338,9 @@ void MLIAPDataKokkos::sync(ExecutionSpace space, unsigned int mask, if (mask & IATOMS_MASK ) k_iatoms .sync(); if (mask & IELEMS_MASK ) k_ielems .sync(); if (mask & JATOMS_MASK ) k_jatoms .sync(); + if (mask & PAIR_I_MASK ) k_pair_i .sync(); if (mask & JELEMS_MASK ) k_jelems .sync(); + if (mask & ELEMS_MASK ) k_elems .sync(); if (mask & IJ_MASK ) k_ij 
.sync(); if (mask & BETAS_MASK ) k_betas .sync(); if (mask & DESCRIPTORS_MASK ) k_descriptors .sync(); diff --git a/src/KOKKOS/mliap_data_kokkos.h b/src/KOKKOS/mliap_data_kokkos.h index ba81e2a226..f641085c6a 100644 --- a/src/KOKKOS/mliap_data_kokkos.h +++ b/src/KOKKOS/mliap_data_kokkos.h @@ -43,6 +43,8 @@ enum { GAMMA_MASK_MASK = 0x00001000, GAMMA_ROW_MASK = 0x00002000, GAMMA_COL_MASK = 0x00004000, + PAIR_I_MASK = 0x00008000, + ELEMS_MASK = 0x00010000, }; // clang-format on @@ -65,6 +67,8 @@ template class MLIAPDataKokkos : public MLIAPData { DAT::tdual_int_1d k_iatoms; // index of each atom DAT::tdual_int_1d k_ielems; // element of each atom DAT::tdual_int_1d k_jatoms; // index of each neighbor + DAT::tdual_int_1d k_elems; // element of each atom in or not in the neighborlist + DAT::tdual_int_1d k_pair_i; // index of each i atom for each ij pair DAT::tdual_int_1d k_jelems; // element of each neighbor DAT::tdual_int_1d k_ij; // Start location for each particle DAT::tdual_float_2d k_betas; // betas for all atoms in list @@ -78,10 +82,123 @@ template class MLIAPDataKokkos : public MLIAPData { DAT::tdual_int_2d k_gamma_row_index; // row (parameter) index DAT::tdual_int_2d k_gamma_col_index; // column (descriptor) index - int nij_total; + // Just cached for python interface + double *f_device; protected: class LAMMPS *lmp; }; + +// Now we need a specific device version for communication with python +class MLIAPDataKokkosDevice { +public: + + MLIAPDataKokkosDevice(MLIAPDataKokkos &base) : + size_array_rows(base.size_array_rows), + size_array_cols(base.size_array_cols), + natoms(base.natoms), + yoffset(base.yoffset), + zoffset(base.zoffset), + ndims_force(base.ndims_force), + ndims_virial(base.ndims_virial), + size_gradforce(base.size_gradforce), + f(base.f_device), + gradforce(base.k_gradforce.d_view.data()), + betas(base.k_betas.d_view.data()), + descriptors(base.k_descriptors.d_view.data()), + eatoms(base.k_eatoms.d_view.data()), + energy(&base.energy), + 
ndescriptors(base.ndescriptors), + nparams(base.nparams), + nelements(base.nelements), + gamma_nnz(base.gamma_nnz), + gamma(base.k_gamma.d_view.data()), + gamma_row_index(base.k_gamma_row_index.d_view.data()), + gamma_col_index(base.k_gamma_col_index.d_view.data()), + egradient(nullptr), + ntotal(base.ntotal), + nlistatoms(base.nlistatoms), + natomneigh(base.natomneigh), + numneighs(base.numneighs), + iatoms(base.k_iatoms.d_view.data()), + pair_i(base.k_pair_i.d_view.data()), + ielems(base.k_ielems.d_view.data()), + nneigh_max(base.nneigh_max), + npairs(base.npairs), + jatoms(base.k_jatoms.d_view.data()), + jelems(base.k_jelems.d_view.data()), + elems(base.k_elems.d_view.data()), + rij(base.k_rij.d_view.data()), + graddesc(base.k_graddesc.d_view.data()), + eflag(base.eflag), + vflag(base.vflag), + pairmliap(dynamic_cast *>(base.pairmliap)), +#if defined(KOKKOS_ENABLE_CUDA) + dev(1) +#else + dev(0) +#endif + { } + int size_array_rows; + int size_array_cols; + int natoms; + int yoffset; + int zoffset; + int ndims_force; + int ndims_virial; + int size_gradforce; + + //Write only + double *f; + double *gradforce; + double *betas; + double *descriptors; + double *eatoms; + double *energy; + + // sizing + const int ndescriptors; + const int nparams; + const int nelements; + + //Ignored for now + int gamma_nnz; + double *gamma; + int *gamma_row_index; + int *gamma_col_index; + double *egradient; + + // Neighborlist stuff + const int ntotal; + const int nlistatoms; + const int natomneigh; + int *numneighs; + int *iatoms; + int *pair_i; + int *ielems; + const int nneigh_max; + const int npairs; + int *jatoms; + int *jelems; + int *elems; + double *rij; + double *graddesc; + int eflag; + int vflag; + + class PairMLIAPKokkos *pairmliap; // access to pair tally functions + + int dev; + +#ifdef LMP_KOKKOS_GPU + MLIAPDataKokkosDevice(MLIAPDataKokkos &base) : ndescriptors(-1),nparams(-1),nelements(-1),ntotal(-1),nlistatoms(-1),natomneigh(-1), + nneigh_max(-1),npairs(-1) + { + // 
It cannot get here, but needed for compilation + } +#endif +}; + + } // namespace LAMMPS_NS #endif diff --git a/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp b/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp index f0122bca11..6518cccaa8 100644 --- a/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp +++ b/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp @@ -58,7 +58,7 @@ void MLIAPDescriptorSO3Kokkos::compute_descriptors(class MLIAPData * { auto data = static_cast*>(data_); so3ptr_kokkos->spectrum(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij, - nmax, lmax, rcutfac, alpha, data->nij_total, data->ndescriptors); + nmax, lmax, rcutfac, alpha, data->npairs, data->ndescriptors); Kokkos::deep_copy(data->k_descriptors.template view(), so3ptr_kokkos->m_plist_r); Kokkos::deep_copy(data->k_descriptors.h_view, so3ptr_kokkos->m_plist_r); @@ -70,7 +70,7 @@ template void MLIAPDescriptorSO3Kokkos::compute_forces(class MLIAPData *data_) { auto data = static_cast*>(data_); - int npairs = data->nij_total; + int npairs = data->npairs; auto d_numneighs = data->k_numneighs.template view(); so3ptr_kokkos->spectrum_dxdr(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij, nmax, lmax, rcutfac, alpha, npairs, data->ndescriptors); @@ -186,7 +186,7 @@ void MLIAPDescriptorSO3Kokkos::compute_force_gradients(class MLIAPDa error->all(FLERR,"This has not been tested in cuda/kokkos"); auto data = static_cast*>(data_); - int npairs = data->nij_total; + int npairs = data->npairs; so3ptr_kokkos->spectrum_dxdr(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij, nmax, lmax, rcutfac, alpha, npairs, data->ndescriptors); auto d_dplist_r = so3ptr_kokkos->k_dplist_r; @@ -239,7 +239,7 @@ template void MLIAPDescriptorSO3Kokkos::compute_descriptor_gradients(class MLIAPData *data_) { auto data = static_cast*>(data_); - bigint npairs = data->nij_total; + bigint npairs = data->npairs; 
so3ptr_kokkos->spectrum_dxdr(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij, nmax, lmax, rcutfac, alpha, npairs, data->ndescriptors); auto graddesc = data->k_graddesc.template view(); diff --git a/src/KOKKOS/mliap_model_python_kokkos.h b/src/KOKKOS/mliap_model_python_kokkos.h index e8c9909b88..a223cafd9d 100644 --- a/src/KOKKOS/mliap_model_python_kokkos.h +++ b/src/KOKKOS/mliap_model_python_kokkos.h @@ -36,51 +36,11 @@ class MLIAPModelPythonKokkos : public MLIAPModelPython, public MLIAPModelKokkos< void compute_force_gradients(class MLIAPData *) override; void connect_param_counts(); }; -} // namespace LAMMPS_NS - - - -#include "mliap_data_kokkos.h" - -namespace LAMMPS_NS { class MLIAPModelPythonKokkosDevice: public MLIAPModelPythonKokkos { }; -class MLIAPDataKokkosDevice { -public: +} // namespace LAMMPS_NS - MLIAPDataKokkosDevice(MLIAPDataKokkos &base) : - ndescriptors(base.ndescriptors), - nlistatoms(base.nlistatoms), - ielems(base.k_ielems.d_view.data()), - descriptors(base.k_descriptors.d_view.data()), - betas(base.k_betas.d_view.data()), - eatoms(base.k_eatoms.d_view.data()), - energy(&base.energy), -#if defined(KOKKOS_ENABLE_CUDA) - dev(1) -#else - dev(0) -#endif - { } - - const int ndescriptors; - const int nlistatoms; - int *ielems; - double *descriptors; - double *betas; - double *eatoms; - double *energy; - int dev; - -#ifdef LMP_KOKKOS_GPU - MLIAPDataKokkosDevice(MLIAPDataKokkos &base) : ndescriptors(-1),nlistatoms(-1) - { - // It cannot get here, but needed for compilation - } -#endif -}; -} #endif diff --git a/src/KOKKOS/mliap_unified_couple_kokkos.pyx b/src/KOKKOS/mliap_unified_couple_kokkos.pyx new file mode 100644 index 0000000000..37326263d3 --- /dev/null +++ b/src/KOKKOS/mliap_unified_couple_kokkos.pyx @@ -0,0 +1,445 @@ +# cython: language_level=3 +# distutils: language = c++ + +import pickle +import numpy as np +import lammps.mliap +try: + import cupy +except ImportError: + pass +from libc.stdint 
cimport uintptr_t + +cimport cython +from cpython.ref cimport PyObject +from libc.stdlib cimport malloc, free + + +cdef extern from "lammps.h" namespace "LAMMPS_NS": + cdef cppclass LAMMPS: + pass + + +cdef extern from "mliap_data_kokkos.h" namespace "LAMMPS_NS": + cdef cppclass MLIAPDataKokkosDevice: + # ----- may not need ----- + int size_array_rows + int size_array_cols + int natoms + int yoffset + int zoffset + int ndims_force + int ndims_virial + # -END- may not need -END- + int size_gradforce + # ----- write only ----- + double * f + double * gradforce + double * betas # betas for all atoms in list + double * descriptors # descriptors for all atoms in list + double * eatoms # energies for all atoms in list + double * energy + # -END- write only -END- + int ndescriptors # number of descriptors + int nparams # number of model parameters per element + int nelements # number of elements + + # data structures for grad-grad list (gamma) + + # ----- ignore for now ----- + int gamma_nnz # number of non-zero entries in gamma + double * gamma # gamma element + int * gamma_row_index # row (parameter) index + int * gamma_col_index # column (descriptor) index + double * egradient # energy gradient w.r.t. 
parameters + # -END- ignore for now -END- + + # data structures for mliap neighbor list + # only neighbors strictly inside descriptor cutoff + + int ntotal # total number of owned and ghost atoms on this proc + int nlistatoms # current number of atoms in local atom lists + int natomneigh # current number of atoms and ghosts in atom neighbor arrays + int * numneighs # neighbors count for each atom + int * iatoms # index of each atom + int * pair_i # index of each i atom for each ij pair + int * ielems # element of each atom + int nneigh_max # number of ij neighbors allocated + int npairs # number of ij neighbor pairs + int * jatoms # index of each neighbor + int * jelems # element of each neighbor + int * elems # element of each atom in or not in the neighborlist + double * rij # distance vector of each neighbor + # ----- write only ----- + double * graddesc # descriptor gradient w.r.t. each neighbor + # -END- write only -END- + int eflag # indicates if energy is needed + int vflag # indicates if virial is needed + void * pairmliap # pointer to base class + int dev + +cdef extern from "mliap_unified_kokkos.h" namespace "LAMMPS_NS": + cdef cppclass MLIAPDummyDescriptor: + MLIAPDummyDescriptor(PyObject *, LAMMPS *) except + + int ndescriptors # number of descriptors + int nelements # # of unique elements + char *elements # names of unique elements + double cutmax # maximum cutoff needed + double rcutfac + double *radelem # element radii + + void compute_descriptors(MLIAPDataKokkosDevice *) + void compute_forces(MLIAPDataKokkosDevice *) + void set_elements(char **, int) + + cdef cppclass MLIAPDummyModel: + MLIAPDummyModel(PyObject *, LAMMPS *, char * = NULL) except + + int ndescriptors # number of descriptors + int nparams # number of parameters per element + int nelements; # # of unique elements + + void compute_gradients(MLIAPDataKokkosDevice *) + + cdef void update_pair_energy(MLIAPDataKokkosDevice *, double *) except + + cdef void 
update_pair_forces(MLIAPDataKokkosDevice *, double *) except + + + +LOADED_MODEL = None + + +# @property sans getter +def write_only_property(fset): + return property(fget=None, fset=fset) + +cdef create_array(device, void *pointer, shape,is_int): + size=1 + for i in shape: + size = size*i + + if ( device == 1): + mem = cupy.cuda.UnownedMemory(ptr=int( pointer), owner=None, size=size) + memptr = cupy.cuda.MemoryPointer(mem, 0) + type=cupy.double + if (is_int): + type=cupy.int32 + return cupy.ndarray(shape, type, memptr=memptr) + else: + if (len(shape) == 1 ): + if (is_int): + return np.asarray(pointer) + else: + return np.asarray(pointer) + else: + if (is_int): + return np.asarray(pointer) + else: + return np.asarray(pointer) + + + +# Cython implementation of MLIAPData +# Automatically converts between C arrays and numpy when needed +cdef class MLIAPDataPy: + cdef MLIAPDataKokkosDevice * data + + def __cinit__(self): + self.data = NULL + + def update_pair_energy_cpu(self, eij): + cdef double[:] eij_arr = eij + update_pair_energy(self.data, &eij_arr[0]) + def update_pair_energy_gpu(self, eij): + cdef uintptr_t ptr = eij.data.ptr + update_pair_energy(self.data, ptr) + def update_pair_energy(self, eij): + if self.data.dev==0: + self.update_pair_energy_cpu(eij) + else: + self.update_pair_energy_gpu(eij) + + def update_pair_forces_cpu(self, fij): + cdef double[:, ::1] fij_arr = fij + update_pair_forces(self.data, &fij_arr[0][0]) + def update_pair_forces_gpu(self, fij): + cdef uintptr_t ptr = fij.data.ptr + update_pair_forces(self.data, ptr) + def update_pair_forces(self, fij): + if self.data.dev==0: + self.update_pair_forces_cpu(fij) + else: + self.update_pair_forces_gpu(fij) + @property + def f(self): + if self.data.f is NULL: + return None + return cupy.asarray( self.data.f) + + @property + def size_gradforce(self): + return self.data.size_gradforce + + @write_only_property + def gradforce(self, value): + if self.data.gradforce is NULL: + raise ValueError("attempt to 
set NULL gradforce") + cdef double[:, :] gradforce_view = &self.data.gradforce[0] + cdef double[:, :] value_view = value + gradforce_view[:] = value_view + print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize gradforce") + + @write_only_property + def betas(self, value): + if self.data.betas is NULL: + raise ValueError("attempt to set NULL betas") + cdef double[:, :] betas_view = &self.data.betas[0] + cdef double[:, :] value_view = value + betas_view[:] = value_view + print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize ") + + @write_only_property + def descriptors(self, value): + if self.data.descriptors is NULL: + raise ValueError("attempt to set NULL descriptors") + cdef double[:, :] descriptors_view = &self.data.descriptors[0] + cdef double[:, :] value_view = value + descriptors_view[:] = value_view + print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize descriptors") + + @write_only_property + def eatoms(self, value): + if self.data.eatoms is NULL: + raise ValueError("attempt to set NULL eatoms") + cdef double[:] eatoms_view = &self.data.eatoms[0] + cdef double[:] value_view = value + eatoms_view[:] = value_view + print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize eatoms") + + + @write_only_property + def energy(self, value): + self.data.energy[0] = value + + @property + def ndescriptors(self): + return self.data.ndescriptors + + @property + def nparams(self): + return self.data.nparams + + @property + def nelements(self): + return self.data.nelements + + # data structures for grad-grad list (gamma) + + @property + def gamma_nnz(self): + return self.data.gamma_nnz + + @property + def gamma(self): + if self.data.gamma is NULL: + return None + return create_array(self.data.dev, self.data.gamma, [self.nlistatoms, self.gama_nnz],False) + + @property + def 
gamma_row_index(self): + if self.data.gamma_row_index is NULL: + return None + return create_array(self.data.dev, self.data.gamma_row_index, [self.nlistatoms, self.gamma_nnz],True) + + @property + def gamma_col_index(self): + if self.data.gamma_col_index is NULL: + return None + return create_array(self.data.dev, self.data.gamma_col_index, [self.nlistatoms, self.gamma_nnz],True) + + @property + def egradient(self): + if self.data.egradient is NULL: + return None + return create_array(self.data.dev, self.data.egradient, [self.nelements*self.nparams],False) + + # data structures for mliap neighbor list + # only neighbors strictly inside descriptor cutoff + + @property + def ntotal(self): + return self.data.ntotal + + @property + def elems(self): + if self.data.elems is NULL: + return None + return create_array(self.data.dev, self.data.elems, [self.ntotal],True) + + @property + def nlistatoms(self): + return self.data.nlistatoms + + @property + def natomneigh(self): + return self.data.natomneigh + + @property + def numneighs(self): + if self.data.numneighs is NULL: + return None + return create_array(self.data.dev, self.data.numneighs, [self.natomneigh],False) + + @property + def iatoms(self): + if self.data.iatoms is NULL: + return None + return create_array(self.data.dev, self.data.iatoms, [self.natomneigh],True) + + @property + def ielems(self): + if self.data.ielems is NULL: + return None + return create_array(self.data.dev, self.data.ielems, [self.natomneigh],True) + + @property + def npairs(self): + return self.data.npairs + + @property + def pair_i(self): + if self.data.pair_i is NULL: + return None + return create_array(self.data.dev, self.data.pair_i, [self.npairs],True) + + @property + def pair_j(self): + return self.jatoms + + @property + def jatoms(self): + if self.data.jatoms is NULL: + return None + return create_array(self.data.dev, self.data.jatoms, [self.npairs],True) + + @property + def jelems(self): + if self.data.jelems is NULL: + return None + 
return create_array(self.data.dev, self.data.jelems, [self.npairs],True) + + + @property + def rij(self): + if self.data.rij is NULL: + return None + return create_array(self.data.dev, self.data.rij, [self.npairs,3],False) + + @write_only_property + def graddesc(self, value): + if self.data.graddesc is NULL: + raise ValueError("attempt to set NULL graddesc") + cdef double[:, :, :] graddesc_view = &self.data.graddesc[0] + cdef double[:, :, :] value_view = value + graddesc_view[:] = value_view + + @property + def eflag(self): + return self.data.eflag + + @property + def vflag(self): + return self.data.vflag + + +# Interface between C and Python compute functions +cdef class MLIAPUnifiedInterface: + cdef MLIAPDummyModel * model + cdef MLIAPDummyDescriptor * descriptor + cdef unified_impl + + def __init__(self, unified_impl): + self.model = NULL + self.descriptor = NULL + self.unified_impl = unified_impl + + def compute_gradients(self, data): + self.unified_impl.compute_gradients(data) + + def compute_descriptors(self, data): + self.unified_impl.compute_descriptors(data) + + def compute_forces(self, data): + self.unified_impl.compute_forces(data) + + +cdef public void compute_gradients_python_kokkos(unified_int, MLIAPDataKokkosDevice *data) except * with gil: + pydata = MLIAPDataPy() + pydata.data = data + unified_int.compute_gradients(pydata) + + +cdef public void compute_descriptors_python_kokkos(unified_int, MLIAPDataKokkosDevice *data) except * with gil: + pydata = MLIAPDataPy() + pydata.data = data + unified_int.compute_descriptors(pydata) + + +cdef public void compute_forces_python_kokkos(unified_int, MLIAPDataKokkosDevice *data) except * with gil: + pydata = MLIAPDataPy() + pydata.data = data + unified_int.compute_forces(pydata) + + +# Create a MLIAPUnifiedInterface and connect it to the dummy model, descriptor +cdef public object mliap_unified_connect_kokkos(char *fname, MLIAPDummyModel * model, + MLIAPDummyDescriptor * descriptor) with gil: + str_fname = 
fname.decode('utf-8') + if str_fname == 'EXISTS': + if LOADED_MODEL is None: + raise ValueError("No unified model loaded") + unified = LOADED_MODEL + elif str_fname.endswith(".pt") or str_fname.endswith('.pth'): + import torch + unified = torch.load(str_fname) + else: + with open(str_fname, 'rb') as pfile: + unified = pickle.load(pfile) + + unified_int = MLIAPUnifiedInterface(unified) + unified_int.model = model + unified_int.descriptor = descriptor + + unified.interface = unified_int + + if unified.ndescriptors is None: + raise ValueError("no descriptors set") + + unified_int.descriptor.ndescriptors = unified.ndescriptors + unified_int.descriptor.rcutfac = unified.rcutfac + unified_int.model.ndescriptors = unified.ndescriptors + unified_int.model.nparams = unified.nparams + + if unified.element_types is None: + raise ValueError("no element type set") + + cdef int nelements = len(unified.element_types) + cdef char **elements = malloc(nelements * sizeof(char*)) + + if not elements: + raise MemoryError("failed to allocate memory for element names") + + cdef char *elem_name + for i, elem in enumerate(unified.element_types): + elem_name_bytes = elem.encode('UTF-8') + elem_name = elem_name_bytes + elements[i] = &elem_name[0] + unified_int.descriptor.set_elements(elements, nelements) + unified_int.model.nelements = nelements + + free(elements) + return unified_int + + +# For pre-loading a Python model +def load_from_python(unified): + global LOADED_MODEL + LOADED_MODEL = unified diff --git a/src/KOKKOS/mliap_unified_kokkos.cpp b/src/KOKKOS/mliap_unified_kokkos.cpp new file mode 100644 index 0000000000..bfb9193df6 --- /dev/null +++ b/src/KOKKOS/mliap_unified_kokkos.cpp @@ -0,0 +1,388 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Matt Bettencourt (NVIDIA) +------------------------------------------------------------------------- */ + +#ifdef MLIAP_PYTHON + +#include "mliap_unified_kokkos.h" +#include + +#include "error.h" +#include "lmppython.h" +#include "memory.h" +#include "mliap_data.h" +#include "mliap_unified_couple_kokkos.h" +#include "pair_mliap.h" +#include "python_compat.h" +#include "utils.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +template +MLIAPDummyDescriptorKokkos::MLIAPDummyDescriptorKokkos(LAMMPS *_lmp) : + Pointers(_lmp), MLIAPDummyDescriptor(_lmp), MLIAPDescriptorKokkos(lmp, this) {} + +template +MLIAPDummyDescriptorKokkos::~MLIAPDummyDescriptorKokkos() +{ + // done in base class + // Py_DECREF(unified_interface); +} + +/* ---------------------------------------------------------------------- + invoke compute_descriptors from Cython interface + ---------------------------------------------------------------------- */ + +template +void MLIAPDummyDescriptorKokkos::compute_descriptors(class MLIAPData *data) +{ + PyGILState_STATE gstate = PyGILState_Ensure(); + auto *kokkos_data = dynamic_cast*>(data); + MLIAPDataKokkosDevice raw_data(*kokkos_data); + compute_descriptors_python_kokkos(unified_interface, &raw_data); + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Running mliappy unified compute_descriptors failure."); + } + PyGILState_Release(gstate); +} + +/* 
---------------------------------------------------------------------- + invoke compute_forces from Cython interface + ---------------------------------------------------------------------- */ + +template +void MLIAPDummyDescriptorKokkos::compute_forces(class MLIAPData *data) +{ + PyGILState_STATE gstate = PyGILState_Ensure(); + auto *kokkos_data = dynamic_cast*>(data); + MLIAPDataKokkosDevice raw_data(*kokkos_data); + compute_forces_python_kokkos(unified_interface, &raw_data); + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Running mliappy unified compute_forces failure."); + } + PyGILState_Release(gstate); +} + +// not implemented +template +void MLIAPDummyDescriptorKokkos::compute_force_gradients(class MLIAPData *) +{ + error->all(FLERR, "compute_force_gradients not implemented"); +} + +// not implemented +template +void MLIAPDummyDescriptorKokkos::compute_descriptor_gradients(class MLIAPData *) +{ + error->all(FLERR, "compute_descriptor_gradients not implemented"); +} + +template +void MLIAPDummyDescriptorKokkos::init() +{ + memory->create(radelem, nelements, "mliap_dummy_descriptor:radelem"); + for (int ielem = 0; ielem < nelements; ielem++) { radelem[ielem] = 1; } + + double cut; + cutmax = 0.0; + memory->create(cutsq, nelements, nelements, "mliap/descriptor/dummy:cutsq"); + memory->create(cutghost, nelements, nelements, "mliap/descriptor/dummy:cutghost"); + for (int ielem = 0; ielem < nelements; ielem++) { + // rcutfac set from python, is global cutoff for all elements + cut = 2.0 * radelem[ielem] * rcutfac; + if (cut > cutmax) cutmax = cut; + cutsq[ielem][ielem] = cut * cut; + cutghost[ielem][ielem] = cut * cut; + for (int jelem = ielem + 1; jelem < nelements; jelem++) { + cut = (radelem[ielem] + radelem[jelem]) * rcutfac; + cutsq[ielem][jelem] = cutsq[jelem][ielem] = cut * cut; + cutghost[ielem][jelem] = cutghost[jelem][ielem] = cut * cut; + } + } +} + +template +void 
MLIAPDummyDescriptorKokkos::set_elements(char **elems, int nelems) +{ + nelements = nelems; + elements = new char *[nelems]; + for (int i = 0; i < nelems; i++) { elements[i] = utils::strdup(elems[i]); } +} + +/* ---------------------------------------------------------------------- */ + +template +MLIAPDummyModelKokkos::MLIAPDummyModelKokkos(LAMMPS *lmp, char *coefffilename) : +MLIAPDummyModel(lmp,coefffilename), +MLIAPModelKokkos(lmp, this) +{ + nonlinearflag = 1; +} + +template +MLIAPDummyModelKokkos::~MLIAPDummyModelKokkos() +{ + // manually decrement borrowed reference from Python + Py_DECREF(unified_interface); +} + +template +int MLIAPDummyModelKokkos::get_nparams() +{ + return nparams; +} + +template +int MLIAPDummyModelKokkos::get_gamma_nnz(class MLIAPData *) +{ + // TODO: get_gamma_nnz + return 0; +} + +/* ---------------------------------------------------------------------- + invoke compute_gradients from Cython interface + ---------------------------------------------------------------------- */ + +template +void MLIAPDummyModelKokkos::compute_gradients(class MLIAPData *data) +{ + PyGILState_STATE gstate = PyGILState_Ensure(); + auto *kokkos_data = dynamic_cast*>(data); + MLIAPDataKokkosDevice raw_data(*kokkos_data); + compute_gradients_python_kokkos(unified_interface, &raw_data); + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + MLIAPModelKokkos::error->all(FLERR, "Running mliappy unified compute_gradients failure."); + } + PyGILState_Release(gstate); +} + +// not implemented +template +void MLIAPDummyModelKokkos::compute_gradgrads(class MLIAPData *) +{ + MLIAPModelKokkos::error->all(FLERR, "compute_gradgrads not implemented"); +} + +// not implemented +template +void MLIAPDummyModelKokkos::compute_force_gradients(class MLIAPData *) +{ + MLIAPModelKokkos::error->all(FLERR, "compute_force_gradients not implemented"); +} + +/* ---------------------------------------------------------------------- + memory usage 
unclear due to Cython/Python implementation + ---------------------------------------------------------------------- */ + +template +double MLIAPDummyModelKokkos::memory_usage() +{ + // TODO: implement memory usage in Cython(?) + return 0; +} + +// not implemented +template +void MLIAPDummyModelKokkos::read_coeffs(char *) +{ + MLIAPModelKokkos::error->all(FLERR, "read_coeffs not implemented"); +} + +/* ---------------------------------------------------------------------- + build the unified interface object, connect to dummy model and descriptor + ---------------------------------------------------------------------- */ + +template +MLIAPBuildUnifiedKokkos_t LAMMPS_NS::build_unified(char *unified_fname, MLIAPDataKokkos *data, LAMMPS *lmp, + char *coefffilename) +{ + lmp->python->init(); + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject *pyMain = PyImport_AddModule("__main__"); + + if (!pyMain) { + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Could not initialize embedded Python"); + } + + PyImport_ImportModule("mliap_unified_couple_kokkos"); + + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Loading mliappy unified module failure."); + } + + // Connect dummy model, dummy descriptor, data to Python unified + MLIAPDummyModelKokkos *model = new MLIAPDummyModelKokkos(lmp, coefffilename); + MLIAPDummyDescriptorKokkos *descriptor = new MLIAPDummyDescriptorKokkos(lmp); + + PyObject *unified_interface = mliap_unified_connect_kokkos(unified_fname, model, descriptor); + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Running mliappy unified module failure."); + } + + // Borrowed references must be manually incremented + model->unified_interface = unified_interface; + Py_INCREF(unified_interface); + descriptor->unified_interface = unified_interface; + Py_INCREF(unified_interface); + + PyGILState_Release(gstate); + + 
MLIAPBuildUnifiedKokkos_t build = {data, descriptor, model}; + return build; +} + +/* ---------------------------------------------------------------------- + set energy for ij atom pairs + ---------------------------------------------------------------------- */ + +void LAMMPS_NS::update_pair_energy(MLIAPDataKokkosDevice *data, double *eij) +{ + double e_total = 0.0; + auto d_eatoms = data->eatoms; + auto d_pair_i= data->pair_i; + const auto nlistatoms = data->nlistatoms; + Kokkos::parallel_for(nlistatoms, KOKKOS_LAMBDA(int ii){ + d_eatoms[ii] = 0; + }); + + Kokkos::parallel_reduce(data->npairs, KOKKOS_LAMBDA(int ii, double &local_sum){ + int i = d_pair_i[ii]; + double e = 0.5 * eij[ii]; + + // must not count any contribution where i is not a local atom + if (i < nlistatoms) { + Kokkos::atomic_add(&d_eatoms[i], e); + local_sum += e; + } + },*data->energy); +} + +/* ---------------------------------------------------------------------- + set forces for ij atom pairs + ---------------------------------------------------------------------- */ + +void LAMMPS_NS::update_pair_forces(MLIAPDataKokkosDevice *data, double *fij) +{ + const auto nlistatoms = data->nlistatoms; + auto *f = data->f; + auto pair_i = data->pair_i; + auto j_atoms = data->jatoms; + auto vflag = data->vflag; + auto rij = data->rij; + int vflag_either=data->pairmliap->vflag_either, vflag_global=data->pairmliap->vflag_global, vflag_atom=data->pairmliap->vflag_atom; + auto d_vatom = data->pairmliap->k_vatom.template view(); + Kokkos::View virial("virial"); + + Kokkos::parallel_for(data->npairs,KOKKOS_LAMBDA (int ii) { + + int ii3 = ii * 3; + int i = pair_i[ii]; + int j = j_atoms[ii]; + + // must not count any contribution where i is not a local atom + if (i < nlistatoms) { + Kokkos::atomic_add(&f[i*3+0], fij[ii3+0]); + Kokkos::atomic_add(&f[i*3+1], fij[ii3+1]); + Kokkos::atomic_add(&f[i*3+2], fij[ii3+2]); + Kokkos::atomic_add(&f[j*3+0],-fij[ii3+0]); + Kokkos::atomic_add(&f[j*3+1],-fij[ii3+1]); + 
Kokkos::atomic_add(&f[j*3+2],-fij[ii3+2]); + if (vflag) { + double v[6]; + v[0] = -rij[ii3+0]*fij[ii3+0]; + v[1] = -rij[ii3+1]*fij[ii3+1]; + v[2] = -rij[ii3+2]*fij[ii3+2]; + v[3] = -rij[ii3+0]*fij[ii3+1]; + v[4] = -rij[ii3+0]*fij[ii3+2]; + v[5] = -rij[ii3+1]*fij[ii3+2]; + if (vflag_global) { + Kokkos::atomic_add(&virial[0], v[0]); + Kokkos::atomic_add(&virial[1], v[1]); + Kokkos::atomic_add(&virial[2], v[2]); + Kokkos::atomic_add(&virial[3], v[3]); + Kokkos::atomic_add(&virial[4], v[4]); + Kokkos::atomic_add(&virial[5], v[5]); + } + if (vflag_atom) { + Kokkos::atomic_add(&d_vatom(i,0), 0.5*v[0]); + Kokkos::atomic_add(&d_vatom(i,1), 0.5*v[1]); + Kokkos::atomic_add(&d_vatom(i,2), 0.5*v[2]); + Kokkos::atomic_add(&d_vatom(i,3), 0.5*v[3]); + Kokkos::atomic_add(&d_vatom(i,4), 0.5*v[4]); + Kokkos::atomic_add(&d_vatom(i,5), 0.5*v[5]); + + Kokkos::atomic_add(&d_vatom(j,0), 0.5*v[0]); + Kokkos::atomic_add(&d_vatom(j,1), 0.5*v[1]); + Kokkos::atomic_add(&d_vatom(j,2), 0.5*v[2]); + Kokkos::atomic_add(&d_vatom(j,3), 0.5*v[3]); + Kokkos::atomic_add(&d_vatom(j,4), 0.5*v[4]); + Kokkos::atomic_add(&d_vatom(j,5), 0.5*v[5]); + } + } + } + }); + + if (vflag) { + if (vflag_global) { + Kokkos::View h_virial("h_virial"); + Kokkos::deep_copy(h_virial,virial); + for (int i=0;i<6;++i) + data->pairmliap->virial[i]+=h_virial[i]; + } + if (vflag_atom) { + data->pairmliap->k_vatom.template modify(); + data->pairmliap->k_vatom.template sync(); + } + } +} + +namespace LAMMPS_NS { +template class MLIAPDummyModelKokkos; +template class MLIAPDummyDescriptorKokkos; +template MLIAPBuildUnifiedKokkos_t LAMMPS_NS::build_unified(char *unified_fname, MLIAPDataKokkos *data, LAMMPS *lmp, + char *coefffilename); +//template void LAMMPS_NS::update_pair_energy(MLIAPDataKokkos *data, double *eij); +//template void LAMMPS_NS::update_pair_forces(MLIAPDataKokkos *data, double *fij); +#ifdef LMP_KOKKOS_GPU +template class MLIAPDummyModelKokkos; +template class MLIAPDummyDescriptorKokkos; +template 
MLIAPBuildUnifiedKokkos_t LAMMPS_NS::build_unified(char *unified_fname, MLIAPDataKokkos *data, LAMMPS *lmp, + char *coefffilename); +//template void LAMMPS_NS::update_pair_energy(MLIAPDataKokkos *data, double *eij); +//template void LAMMPS_NS::update_pair_forces(MLIAPDataKokkos *data, double *fij); +#endif +} +#endif + diff --git a/src/KOKKOS/mliap_unified_kokkos.h b/src/KOKKOS/mliap_unified_kokkos.h new file mode 100644 index 0000000000..aad25891b0 --- /dev/null +++ b/src/KOKKOS/mliap_unified_kokkos.h @@ -0,0 +1,66 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_MLIAP_UNIFIED_KOKKOS_H +#define LMP_MLIAP_UNIFIED_KOKKOS_H + +#include "mliap_unified.h" +#include "mliap_descriptor_kokkos.h" +#include "mliap_model_kokkos.h" +#include "mliap_data_kokkos.h" + +#include + +namespace LAMMPS_NS { +template +class MLIAPDummyDescriptorKokkos : public MLIAPDummyDescriptor, public MLIAPDescriptorKokkos{ + public: + MLIAPDummyDescriptorKokkos(LAMMPS *); + ~MLIAPDummyDescriptorKokkos() override; + void compute_descriptors(class MLIAPData *) override; + void compute_forces(class MLIAPData *) override; + void compute_force_gradients(class MLIAPData *) override; + void compute_descriptor_gradients(class MLIAPData *) override; + void init() override; + void set_elements(char **, int); +}; +template +class MLIAPDummyModelKokkos : public MLIAPDummyModel, public MLIAPModelKokkos { + public: + MLIAPDummyModelKokkos(LAMMPS *, char * = nullptr); + ~MLIAPDummyModelKokkos() override; + int get_nparams() override; + int get_gamma_nnz(class MLIAPData *) override; + void compute_gradients(class MLIAPData *) override; + void compute_gradgrads(class MLIAPData *) override; + void compute_force_gradients(class MLIAPData *) override; + double memory_usage() override; + + protected: + void read_coeffs(char *) override; +}; + +template +struct MLIAPBuildUnifiedKokkos_t { + MLIAPDataKokkos *data; + MLIAPDummyDescriptorKokkos *descriptor; + MLIAPDummyModelKokkos *model; +}; +template +MLIAPBuildUnifiedKokkos_t build_unified(char *, MLIAPDataKokkos *, LAMMPS *, char * = NULL); +void update_pair_energy(MLIAPDataKokkosDevice *, double *); +void update_pair_forces(MLIAPDataKokkosDevice *, double *); + +} // namespace LAMMPS_NS + +#endif diff --git a/src/KOKKOS/pair_meam_kokkos.cpp b/src/KOKKOS/pair_meam_kokkos.cpp index 90e714cefe..c2b03c2054 100644 --- a/src/KOKKOS/pair_meam_kokkos.cpp +++ b/src/KOKKOS/pair_meam_kokkos.cpp @@ -51,6 +51,7 @@ 
PairMEAMKokkos::PairMEAMKokkos(LAMMPS *lmp) : PairMEAM(lmp) delete meam_inst; meam_inst_kk = new MEAMKokkos(memory); meam_inst = meam_inst_kk; + myname = "meam/kk"; } /* ---------------------------------------------------------------------- */ @@ -156,7 +157,8 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) int need_dup = lmp->kokkos->need_dup(); - meam_inst_kk->meam_dens_init(inum_half,ntype,type,d_map,x,d_numneigh_half,d_numneigh_full,d_ilist_half,d_neighbors_half, d_neighbors_full, d_offset, neighflag, need_dup); + meam_inst_kk->meam_dens_init(inum_half,ntype,type,d_map,x,d_numneigh_half,d_numneigh_full, + d_ilist_half,d_neighbors_half, d_neighbors_full, d_offset, neighflag, need_dup); meam_inst_kk->k_rho0.template modify(); meam_inst_kk->k_arho2b.template modify(); @@ -166,6 +168,13 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) meam_inst_kk->k_arho3b.template modify(); meam_inst_kk->k_t_ave.template modify(); meam_inst_kk->k_tsq_ave.template modify(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.template modify(); + meam_inst_kk->k_arho1m.template modify(); + meam_inst_kk->k_arho2m.template modify(); + meam_inst_kk->k_arho3m.template modify(); + meam_inst_kk->k_arho3mb.template modify(); + } comm->reverse_comm(this); @@ -177,6 +186,13 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) meam_inst_kk->k_arho3b.template sync(); meam_inst_kk->k_t_ave.template sync(); meam_inst_kk->k_tsq_ave.template sync(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.template sync(); + meam_inst_kk->k_arho1m.template sync(); + meam_inst_kk->k_arho2m.template sync(); + meam_inst_kk->k_arho3m.template sync(); + meam_inst_kk->k_arho3mb.template sync(); + } meam_inst_kk->meam_dens_final(nlocal,eflag_either,eflag_global,eflag_atom, d_eatom,ntype,type,d_map,d_scale,errorflag,ev); @@ -200,6 +216,13 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) meam_inst_kk->k_arho3b.template modify(); meam_inst_kk->k_t_ave.template modify(); 
meam_inst_kk->k_tsq_ave.template modify(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.template modify(); + meam_inst_kk->k_arho1m.template modify(); + meam_inst_kk->k_arho2m.template modify(); + meam_inst_kk->k_arho3m.template modify(); + meam_inst_kk->k_arho3mb.template modify(); + } comm->forward_comm(this); @@ -219,6 +242,13 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) meam_inst_kk->k_arho3b.template sync(); meam_inst_kk->k_t_ave.template sync(); meam_inst_kk->k_tsq_ave.template sync(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.template sync(); + meam_inst_kk->k_arho1m.template sync(); + meam_inst_kk->k_arho2m.template sync(); + meam_inst_kk->k_arho3m.template sync(); + meam_inst_kk->k_arho3mb.template sync(); + } meam_inst_kk->meam_force(inum_half,eflag_global,eflag_atom,vflag_global, vflag_atom,d_eatom,ntype,type,d_map,x, @@ -315,7 +345,7 @@ int PairMEAMKokkos::pack_forward_comm_kokkos(int n, DAT::tdual_int_2 iswap = iswap_in; v_buf = buf.view(); Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); - return n*38; + return n*comm_forward; } /* ---------------------------------------------------------------------- */ @@ -324,7 +354,7 @@ template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMPackForwardComm, const int &i) const { int j = d_sendlist(iswap, i); - int m = i*38; + int m = i*comm_forward; v_buf[m++] = d_rho0[j]; v_buf[m++] = d_rho1[j]; v_buf[m++] = d_rho2[j]; @@ -354,6 +384,22 @@ void PairMEAMKokkos::operator()(TagPairMEAMPackForwardComm, const in v_buf[m++] = d_tsq_ave(j,0); v_buf[m++] = d_tsq_ave(j,1); v_buf[m++] = d_tsq_ave(j,2); + if (msmeamflag) { + v_buf[m++] = d_arho2mb[j]; + v_buf[m++] = d_arho1m(j,0); + v_buf[m++] = d_arho1m(j,1); + v_buf[m++] = d_arho1m(j,2); + v_buf[m++] = d_arho2m(j,0); + v_buf[m++] = d_arho2m(j,1); + v_buf[m++] = d_arho2m(j,2); + v_buf[m++] = d_arho2m(j,3); + v_buf[m++] = d_arho2m(j,4); + v_buf[m++] = d_arho2m(j,5); + for (int k = 0; k < 10; k++) v_buf[m++] = d_arho3m(j,k); + 
v_buf[m++] = d_arho3mb(j,0); + v_buf[m++] = d_arho3mb(j,1); + v_buf[m++] = d_arho3mb(j,2); + } } /* ---------------------------------------------------------------------- */ @@ -371,7 +417,8 @@ void PairMEAMKokkos::unpack_forward_comm_kokkos(int n, int first_in, template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMUnpackForwardComm, const int &i) const{ - int m = i*38; + //int m = i*38; + int m = i*comm_forward; d_rho0[i+first] = v_buf[m++]; d_rho1[i+first] = v_buf[m++]; @@ -402,6 +449,22 @@ void PairMEAMKokkos::operator()(TagPairMEAMUnpackForwardComm, const d_tsq_ave(i+first,0) = v_buf[m++]; d_tsq_ave(i+first,1) = v_buf[m++]; d_tsq_ave(i+first,2) = v_buf[m++]; + if (msmeamflag) { + d_arho2mb[i+first] = v_buf[m++]; + d_arho1m(i+first,0) = v_buf[m++]; + d_arho1m(i+first,1) = v_buf[m++]; + d_arho1m(i+first,2) = v_buf[m++]; + d_arho2m(i+first,0) = v_buf[m++]; + d_arho2m(i+first,1) = v_buf[m++]; + d_arho2m(i+first,2) = v_buf[m++]; + d_arho2m(i+first,3) = v_buf[m++]; + d_arho2m(i+first,4) = v_buf[m++]; + d_arho2m(i+first,5) = v_buf[m++]; + for (int k = 0; k < 10; k++) d_arho3m(i+first,k) = v_buf[m++]; + d_arho3mb(i+first,0) = v_buf[m++]; + d_arho3mb(i+first,1) = v_buf[m++]; + d_arho3mb(i+first,2) = v_buf[m++]; + } } /* ---------------------------------------------------------------------- */ @@ -426,6 +489,13 @@ int PairMEAMKokkos::pack_forward_comm(int n, int *list, double *buf, meam_inst_kk->k_arho3b.sync_host(); meam_inst_kk->k_t_ave.sync_host(); meam_inst_kk->k_tsq_ave.sync_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.sync_host(); + meam_inst_kk->k_arho1m.sync_host(); + meam_inst_kk->k_arho2m.sync_host(); + meam_inst_kk->k_arho3m.sync_host(); + meam_inst_kk->k_arho3mb.sync_host(); + } int m = 0; for (int i = 0; i < n; i++) { @@ -459,6 +529,22 @@ int PairMEAMKokkos::pack_forward_comm(int n, int *list, double *buf, buf[m++] = meam_inst_kk->h_tsq_ave(j,0); buf[m++] = meam_inst_kk->h_tsq_ave(j,1); buf[m++] = meam_inst_kk->h_tsq_ave(j,2); + 
if (msmeamflag) { + buf[m++] = meam_inst_kk->h_arho2mb[j]; + buf[m++] = meam_inst_kk->h_arho1m(j,0); + buf[m++] = meam_inst_kk->h_arho1m(j,1); + buf[m++] = meam_inst_kk->h_arho1m(j,2); + buf[m++] = meam_inst_kk->h_arho2m(j,0); + buf[m++] = meam_inst_kk->h_arho2m(j,1); + buf[m++] = meam_inst_kk->h_arho2m(j,2); + buf[m++] = meam_inst_kk->h_arho2m(j,3); + buf[m++] = meam_inst_kk->h_arho2m(j,4); + buf[m++] = meam_inst_kk->h_arho2m(j,5); + for (int k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3m(j,k); + buf[m++] = meam_inst_kk->h_arho3mb(j,0); + buf[m++] = meam_inst_kk->h_arho3mb(j,1); + buf[m++] = meam_inst_kk->h_arho3mb(j,2); + } } return m; @@ -485,6 +571,13 @@ void PairMEAMKokkos::unpack_forward_comm(int n, int first, double *b meam_inst_kk->k_arho3b.sync_host(); meam_inst_kk->k_t_ave.sync_host(); meam_inst_kk->k_tsq_ave.sync_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.sync_host(); + meam_inst_kk->k_arho1m.sync_host(); + meam_inst_kk->k_arho2m.sync_host(); + meam_inst_kk->k_arho3m.sync_host(); + meam_inst_kk->k_arho3mb.sync_host(); + } int m = 0; const int last = first + n; @@ -518,6 +611,22 @@ void PairMEAMKokkos::unpack_forward_comm(int n, int first, double *b meam_inst_kk->h_tsq_ave(i,0) = buf[m++]; meam_inst_kk->h_tsq_ave(i,1) = buf[m++]; meam_inst_kk->h_tsq_ave(i,2) = buf[m++]; + if (msmeamflag) { + meam_inst_kk->h_arho2mb[i] = buf[m++]; + meam_inst_kk->h_arho1m(i,0) = buf[m++]; + meam_inst_kk->h_arho1m(i,1) = buf[m++]; + meam_inst_kk->h_arho1m(i,2) = buf[m++]; + meam_inst_kk->h_arho2m(i,0) = buf[m++]; + meam_inst_kk->h_arho2m(i,1) = buf[m++]; + meam_inst_kk->h_arho2m(i,2) = buf[m++]; + meam_inst_kk->h_arho2m(i,3) = buf[m++]; + meam_inst_kk->h_arho2m(i,4) = buf[m++]; + meam_inst_kk->h_arho2m(i,5) = buf[m++]; + for (int k = 0; k < 10; k++) meam_inst_kk->h_arho3m(i,k) = buf[m++]; + meam_inst_kk->h_arho3mb(i,0) = buf[m++]; + meam_inst_kk->h_arho3mb(i,1) = buf[m++]; + meam_inst_kk->h_arho3mb(i,2) = buf[m++]; + } } meam_inst_kk->k_rho0.modify_host(); 
@@ -536,6 +645,13 @@ void PairMEAMKokkos::unpack_forward_comm(int n, int first, double *b meam_inst_kk->k_arho3b.modify_host(); meam_inst_kk->k_t_ave.modify_host(); meam_inst_kk->k_tsq_ave.modify_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.modify_host(); + meam_inst_kk->k_arho1m.modify_host(); + meam_inst_kk->k_arho2m.modify_host(); + meam_inst_kk->k_arho3m.modify_host(); + meam_inst_kk->k_arho3mb.modify_host(); + } } /* ---------------------------------------------------------------------- */ @@ -546,7 +662,8 @@ int PairMEAMKokkos::pack_reverse_comm_kokkos(int n, int first_in, DA first = first_in; v_buf = buf.view(); Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); - return n*30; + //return n*30; + return n*comm_reverse; } /* ---------------------------------------------------------------------- */ @@ -554,7 +671,8 @@ int PairMEAMKokkos::pack_reverse_comm_kokkos(int n, int first_in, DA template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMPackReverseComm, const int &i) const { - int m = i*30; + //int m = i*30; + int m = i*comm_reverse; v_buf[m++] = d_rho0[i+first]; v_buf[m++] = d_arho2b[i+first]; @@ -577,6 +695,22 @@ void PairMEAMKokkos::operator()(TagPairMEAMPackReverseComm, const in v_buf[m++] = d_tsq_ave(i+first,0); v_buf[m++] = d_tsq_ave(i+first,1); v_buf[m++] = d_tsq_ave(i+first,2); + if (msmeamflag) { + v_buf[m++] = d_arho2mb[i+first]; + v_buf[m++] = d_arho1m(i+first,0); + v_buf[m++] = d_arho1m(i+first,1); + v_buf[m++] = d_arho1m(i+first,2); + v_buf[m++] = d_arho2m(i+first,0); + v_buf[m++] = d_arho2m(i+first,1); + v_buf[m++] = d_arho2m(i+first,2); + v_buf[m++] = d_arho2m(i+first,3); + v_buf[m++] = d_arho2m(i+first,4); + v_buf[m++] = d_arho2m(i+first,5); + for (int k = 0; k < 10; k++) v_buf[m++] = d_arho3m(i+first,k); + v_buf[m++] = d_arho3mb(i+first,0); + v_buf[m++] = d_arho3mb(i+first,1); + v_buf[m++] = d_arho3mb(i+first,2); + } } /* ---------------------------------------------------------------------- */ @@ -592,6 
+726,13 @@ int PairMEAMKokkos::pack_reverse_comm(int n, int first, double *buf) meam_inst_kk->k_arho3b.sync_host(); meam_inst_kk->k_t_ave.sync_host(); meam_inst_kk->k_tsq_ave.sync_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.sync_host(); + meam_inst_kk->k_arho1m.sync_host(); + meam_inst_kk->k_arho2m.sync_host(); + meam_inst_kk->k_arho3m.sync_host(); + meam_inst_kk->k_arho3mb.sync_host(); + } int m = 0; const int last = first + n; @@ -617,6 +758,22 @@ int PairMEAMKokkos::pack_reverse_comm(int n, int first, double *buf) buf[m++] = meam_inst_kk->h_tsq_ave(i,0); buf[m++] = meam_inst_kk->h_tsq_ave(i,1); buf[m++] = meam_inst_kk->h_tsq_ave(i,2); + if (msmeamflag) { + buf[m++] = meam_inst_kk->h_arho2mb[i]; + buf[m++] = meam_inst_kk->h_arho1m(i,0); + buf[m++] = meam_inst_kk->h_arho1m(i,1); + buf[m++] = meam_inst_kk->h_arho1m(i,2); + buf[m++] = meam_inst_kk->h_arho2m(i,0); + buf[m++] = meam_inst_kk->h_arho2m(i,1); + buf[m++] = meam_inst_kk->h_arho2m(i,2); + buf[m++] = meam_inst_kk->h_arho2m(i,3); + buf[m++] = meam_inst_kk->h_arho2m(i,4); + buf[m++] = meam_inst_kk->h_arho2m(i,5); + for (int k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3m(i,k); + buf[m++] = meam_inst_kk->h_arho3mb(i,0); + buf[m++] = meam_inst_kk->h_arho3mb(i,1); + buf[m++] = meam_inst_kk->h_arho3mb(i,2); + } } return m; @@ -639,7 +796,8 @@ template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMUnpackReverseComm, const int &i) const { int j = d_sendlist(iswap, i); - int m = i*30; + //int m = i*30; + int m = i*comm_reverse; d_rho0[j] += v_buf[m++]; d_arho2b[j] += v_buf[m++]; @@ -662,6 +820,22 @@ void PairMEAMKokkos::operator()(TagPairMEAMUnpackReverseComm, const d_tsq_ave(j,0) += v_buf[m++]; d_tsq_ave(j,1) += v_buf[m++]; d_tsq_ave(j,2) += v_buf[m++]; + if (msmeamflag) { + d_arho2mb[j] += v_buf[m++]; + d_arho1m(j,0) += v_buf[m++]; + d_arho1m(j,1) += v_buf[m++]; + d_arho1m(j,2) += v_buf[m++]; + d_arho2m(j,0) += v_buf[m++]; + d_arho2m(j,1) += v_buf[m++]; + d_arho2m(j,2) += 
v_buf[m++]; + d_arho2m(j,3) += v_buf[m++]; + d_arho2m(j,4) += v_buf[m++]; + d_arho2m(j,5) += v_buf[m++]; + for (int k = 0; k < 10; k++) d_arho3m(j,k) += v_buf[m++]; + d_arho3mb(j,0) += v_buf[m++]; + d_arho3mb(j,1) += v_buf[m++]; + d_arho3mb(j,2) += v_buf[m++]; + } } /* ---------------------------------------------------------------------- */ @@ -677,6 +851,13 @@ void PairMEAMKokkos::unpack_reverse_comm(int n, int *list, double *b meam_inst_kk->k_arho3b.sync_host(); meam_inst_kk->k_t_ave.sync_host(); meam_inst_kk->k_tsq_ave.sync_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.sync_host(); + meam_inst_kk->k_arho1m.sync_host(); + meam_inst_kk->k_arho2m.sync_host(); + meam_inst_kk->k_arho3m.sync_host(); + meam_inst_kk->k_arho3mb.sync_host(); + } int m = 0; for (int i = 0; i < n; i++) { @@ -702,6 +883,22 @@ void PairMEAMKokkos::unpack_reverse_comm(int n, int *list, double *b meam_inst_kk->h_tsq_ave(j,0) += buf[m++]; meam_inst_kk->h_tsq_ave(j,1) += buf[m++]; meam_inst_kk->h_tsq_ave(j,2) += buf[m++]; + if (msmeamflag) { + meam_inst_kk->h_arho2mb[j] += buf[m++]; + meam_inst_kk->h_arho1m(j,0) += buf[m++]; + meam_inst_kk->h_arho1m(j,1) += buf[m++]; + meam_inst_kk->h_arho1m(j,2) += buf[m++]; + meam_inst_kk->h_arho2m(j,0) += buf[m++]; + meam_inst_kk->h_arho2m(j,1) += buf[m++]; + meam_inst_kk->h_arho2m(j,2) += buf[m++]; + meam_inst_kk->h_arho2m(j,3) += buf[m++]; + meam_inst_kk->h_arho2m(j,4) += buf[m++]; + meam_inst_kk->h_arho2m(j,5) += buf[m++]; + for (int k = 0; k < 10; k++) meam_inst_kk->h_arho3m(j,k) += buf[m++]; + meam_inst_kk->h_arho3mb(j,0) += buf[m++]; + meam_inst_kk->h_arho3mb(j,1) += buf[m++]; + meam_inst_kk->h_arho3mb(j,2) += buf[m++]; + } } meam_inst_kk->k_rho0.modify_host(); @@ -712,6 +909,13 @@ void PairMEAMKokkos::unpack_reverse_comm(int n, int *list, double *b meam_inst_kk->k_arho3b.modify_host(); meam_inst_kk->k_t_ave.modify_host(); meam_inst_kk->k_tsq_ave.modify_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.modify_host(); + 
meam_inst_kk->k_arho1m.modify_host(); + meam_inst_kk->k_arho2m.modify_host(); + meam_inst_kk->k_arho3m.modify_host(); + meam_inst_kk->k_arho3mb.modify_host(); + } } /* ---------------------------------------------------------------------- @@ -764,6 +968,12 @@ void PairMEAMKokkos::update_meam_views() d_arho3b = meam_inst_kk->d_arho3b; d_t_ave = meam_inst_kk->d_t_ave; d_tsq_ave = meam_inst_kk->d_tsq_ave; + // msmeam + d_arho1m = meam_inst_kk->d_arho1m; + d_arho2m = meam_inst_kk->d_arho2m; + d_arho3m = meam_inst_kk->d_arho3m; + d_arho2mb = meam_inst_kk->d_arho2mb; + d_arho3mb = meam_inst_kk->d_arho3mb; } /* ---------------------------------------------------------------------- */ diff --git a/src/KOKKOS/pair_meam_kokkos.h b/src/KOKKOS/pair_meam_kokkos.h index c5fe82fa79..0d0d7667f3 100644 --- a/src/KOKKOS/pair_meam_kokkos.h +++ b/src/KOKKOS/pair_meam_kokkos.h @@ -13,12 +13,12 @@ #ifdef PAIR_CLASS // clang-format off -PairStyle(meam/c/kk,PairMEAMKokkos) -PairStyle(meam/c/kk/device,PairMEAMKokkos) -PairStyle(meam/c/kk/host,PairMEAMKokkos) -PairStyle(meam/kk,PairMEAMKokkos) -PairStyle(meam/kk/device,PairMEAMKokkos) -PairStyle(meam/kk/host,PairMEAMKokkos) +PairStyle(meam/c/kk,PairMEAMKokkos); +PairStyle(meam/c/kk/device,PairMEAMKokkos); +PairStyle(meam/c/kk/host,PairMEAMKokkos); +PairStyle(meam/kk,PairMEAMKokkos); +PairStyle(meam/kk/device,PairMEAMKokkos); +PairStyle(meam/kk/host,PairMEAMKokkos); // clang-format on #else @@ -117,6 +117,9 @@ class PairMEAMKokkos : public PairMEAM, public KokkosBase { typename ArrayTypes::t_ffloat_1d d_rho, d_rho0, d_rho1, d_rho2, d_rho3, d_frhop; typename ArrayTypes::t_ffloat_1d d_gamma, d_dgamma1, d_dgamma2, d_dgamma3, d_arho2b; typename ArrayTypes::t_ffloat_2d d_arho1, d_arho2, d_arho3, d_arho3b, d_t_ave, d_tsq_ave; + // msmeam params + typename ArrayTypes::t_ffloat_1d d_arho2mb; + typename ArrayTypes::t_ffloat_2d d_arho1m, d_arho2m, d_arho3m, d_arho3mb; void update_meam_views(); diff --git a/src/KOKKOS/pair_meam_ms_kokkos.cpp 
b/src/KOKKOS/pair_meam_ms_kokkos.cpp new file mode 100644 index 0000000000..491fc0273c --- /dev/null +++ b/src/KOKKOS/pair_meam_ms_kokkos.cpp @@ -0,0 +1,32 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "pair_meam_ms_kokkos.h" +#include "meam.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ +template +PairMEAMMSKokkos::PairMEAMMSKokkos(LAMMPS *lmp) : PairMEAMKokkos(lmp) +{ + this->meam_inst->msmeamflag = this->msmeamflag = 1; + this->myname = "meam/ms/kk"; +} + +namespace LAMMPS_NS { +template class PairMEAMMSKokkos; +#ifdef KOKKOS_ENABLE_CUDA +template class PairMEAMMSKokkos; +#endif +} diff --git a/src/KOKKOS/pair_meam_ms_kokkos.h b/src/KOKKOS/pair_meam_ms_kokkos.h new file mode 100644 index 0000000000..a2cefc2c16 --- /dev/null +++ b/src/KOKKOS/pair_meam_ms_kokkos.h @@ -0,0 +1,36 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(meam/ms/kk,PairMEAMMSKokkos); +PairStyle(meam/ms/kk/device,PairMEAMMSKokkos); +PairStyle(meam/ms/kk/host,PairMEAMMSKokkos); +// clang-format on +#else + +#ifndef LMP_PAIR_MEAM_MS_KOKKOS_H +#define LMP_PAIR_MEAM_MS_KOKKOS_H + +#include "pair_meam_kokkos.h" + +namespace LAMMPS_NS { + +template +class PairMEAMMSKokkos : public PairMEAMKokkos { + public: + PairMEAMMSKokkos(class LAMMPS *); +}; +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/KOKKOS/pair_mliap_kokkos.cpp b/src/KOKKOS/pair_mliap_kokkos.cpp index d26b6367f8..71e45085ea 100644 --- a/src/KOKKOS/pair_mliap_kokkos.cpp +++ b/src/KOKKOS/pair_mliap_kokkos.cpp @@ -24,6 +24,7 @@ #include "mliap_model_linear_kokkos.h" #ifdef MLIAP_PYTHON #include "mliap_model_python_kokkos.h" +#include "mliap_unified_kokkos.h" #endif #include "error.h" #include "neigh_request.h" @@ -66,7 +67,6 @@ PairMLIAPKokkos::~PairMLIAPKokkos() template void PairMLIAPKokkos::compute(int eflag, int vflag) { - atomKK->sync(Host,F_MASK | ENERGY_MASK | VIRIAL_MASK); atomKK->sync(execution_space,X_MASK | TYPE_MASK ); MLIAPDataKokkos *k_data = (MLIAPDataKokkos*)(data); @@ -97,7 +97,7 @@ void PairMLIAPKokkos::compute(int eflag, int vflag) // compute descriptors, if needed if (model->nonlinearflag || eflag) { - k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | JATOMS_MASK | JELEMS_MASK | RIJ_MASK ); + k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | ELEMS_MASK | JATOMS_MASK | PAIR_I_MASK | JELEMS_MASK | RIJ_MASK ); descriptor->compute_descriptors(data); if (!is_kokkos_descriptor) k_data->modified(descriptor_space, DESCRIPTORS_MASK); @@ -109,12 +109,13 @@ void PairMLIAPKokkos::compute(int eflag, int vflag) k_data->modified(model_space, BETAS_MASK); if (eflag_atom) k_data->modified(model_space, 
EATOMS_MASK); - e_tally(data); // calculate force contributions beta_i*dB_i/dR_j - k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | BETAS_MASK | JATOMS_MASK | JELEMS_MASK | RIJ_MASK ); + k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | ELEMS_MASK | BETAS_MASK | JATOMS_MASK | PAIR_I_MASK | JELEMS_MASK | RIJ_MASK ); descriptor->compute_forces(data); + e_tally(data); + if (evflag) { atomKK->modified(descriptor_space,F_MASK | ENERGY_MASK | VIRIAL_MASK); atomKK->sync(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK); @@ -181,6 +182,25 @@ void PairMLIAPKokkos::settings(int narg, char ** arg) iarg += 3; } else new_args.push_back(arg[iarg++]); + } else if (strcmp(arg[iarg], "unified") == 0) { +#ifdef MLIAP_PYTHON + printf("IN SETUP UNIFIED\n"); + if (model != nullptr) error->all(FLERR,"Illegal multiple pair_style mliap model definitions"); + if (descriptor != nullptr) error->all(FLERR,"Illegal multiple pair_style mliap descriptor definitions"); + if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "pair_style mliap unified", error); + MLIAPBuildUnifiedKokkos_t build = build_unified(arg[iarg+1], dynamic_cast*>(data), lmp); + if (iarg+3 > narg) { + ghostneigh = 0; + } else { + ghostneigh = utils::logical(FLERR, arg[iarg+2], false, lmp); + } + + iarg += 3; + model = build.model; + descriptor = build.descriptor; +#else + error->all(FLERR,"Using pair_style mliap unified requires ML-IAP with python support"); +#endif } else new_args.push_back(arg[iarg++]); } @@ -226,13 +246,6 @@ void PairMLIAPKokkos::coeff(int narg, char **arg) { k_map.modify(); k_map.sync(); - auto h_cutsq=k_cutsq.template view(); - for (int itype=1; itype <= atom->ntypes; ++itype) - for (int jtype=1; jtype <= atom->ntypes; ++jtype) - h_cutsq(itype,jtype) = descriptor->cutsq[map[itype]][map[jtype]]; - k_cutsq.modify(); - k_cutsq.sync(); - // clear setflag since coeff() called once with I,J = * * int n = atom->ntypes; @@ -257,6 +270,13 @@ void 
PairMLIAPKokkos::coeff(int narg, char **arg) { // set up model, descriptor, and mliap data structures model->init(); descriptor->init(); + + auto h_cutsq=k_cutsq.template view(); + for (int itype=1; itype <= atom->ntypes; ++itype) + for (int jtype=1; jtype <= atom->ntypes; ++jtype) + h_cutsq(itype,jtype) = descriptor->cutsq[map[itype]][map[jtype]]; + k_cutsq.modify(); + k_cutsq.sync(); int gradgradflag = -1; delete data; data = new MLIAPDataKokkos(lmp, gradgradflag, map, model, descriptor, this); diff --git a/src/KOKKOS/region_block_kokkos.cpp b/src/KOKKOS/region_block_kokkos.cpp index cfee46e916..c53fae7b03 100644 --- a/src/KOKKOS/region_block_kokkos.cpp +++ b/src/KOKKOS/region_block_kokkos.cpp @@ -48,7 +48,8 @@ void RegBlockKokkos::match_all_kokkos(int groupbit_in, DAT::tdual_in groupbit = groupbit_in; d_match = k_match_in.template view(); - atomKK->sync(Device, X_MASK | MASK_MASK); + auto execution_space = ExecutionSpaceFromDevice::space; + atomKK->sync(execution_space, X_MASK | MASK_MASK); x = atomKK->k_x.view(); mask = atomKK->k_mask.view(); diff --git a/src/MEAM/meam.h b/src/MEAM/meam.h index 9ec7de3426..5a131bdc34 100644 --- a/src/MEAM/meam.h +++ b/src/MEAM/meam.h @@ -17,7 +17,7 @@ #include #include -#define maxelt 5 +constexpr int maxelt = 5; namespace LAMMPS_NS { class Memory; @@ -30,6 +30,7 @@ class MEAM { virtual ~MEAM(); int copymode; + int msmeamflag; protected: Memory *memory; @@ -74,6 +75,12 @@ class MEAM { // vind[23]D = Voight notation index maps for 2 and 3D // v2D,v3D = array of factors to apply for Voight notation + // MS-MEAM parameters + + // msmeamflag = flag to activate MS-MEAM + // betam[1-3]_meam = MS-MEAM electron density constants + // tm[1-3]_meam = MS-MEAM coefficients on densities in Gamma computation + // nr,dr = pair function discretization parameters // nrar,rdrar = spline coeff array parameters @@ -115,12 +122,22 @@ class MEAM { int nr, nrar; double dr, rdrar; + // MS-MEAM parameters + + double t1m_meam[maxelt], t2m_meam[maxelt], 
t3m_meam[maxelt]; + double beta1m_meam[maxelt], beta2m_meam[maxelt], beta3m_meam[maxelt]; + //int msmeamflag; // made public for pair style settings + public: int nmax; double *rho, *rho0, *rho1, *rho2, *rho3, *frhop; double *gamma, *dgamma1, *dgamma2, *dgamma3, *arho2b; double **arho1, **arho2, **arho3, **arho3b, **t_ave, **tsq_ave; + // MS-MEAM arrays + + double **arho1m, **arho2m, *arho2mb, **arho3m, **arho3mb; + int maxneigh; double *scrfcn, *dscrfcn, *fcpair; @@ -242,7 +259,7 @@ class MEAM { double, double, double, double, double, int, int, lattice_t); void get_sijk(double, int, int, int, double *); void get_densref(double, int, int, double *, double *, double *, double *, double *, double *, - double *, double *); + double *, double *, double *, double *, double *, double *, double *, double *); // last 6 args for msmeam void interpolate_meam(int); public: @@ -282,10 +299,12 @@ class MEAM { } // clang-format on static int get_Zij(const lattice_t latt); + // last 6 args are optional msmeam parameters void meam_setup_global(int nelt, lattice_t *lat, int *ielement, double *atwt, double *alpha, double *b0, double *b1, double *b2, double *b3, double *alat, double *esub, double *asub, double *t0, double *t1, double *t2, double *t3, - double *rozero, int *ibar); + double *rozero, int *ibar, double *b1m, double *b2m, double *b3m, + double *t1m, double *t2m, double *t3m); void meam_setup_param(int which, double value, int nindex, int *index /*index(3)*/, int *errorflag); virtual void meam_setup_done(double *cutmax); diff --git a/src/MEAM/meam_dens_final.cpp b/src/MEAM/meam_dens_final.cpp index cf964a4724..ab0ac8c53f 100644 --- a/src/MEAM/meam_dens_final.cpp +++ b/src/MEAM/meam_dens_final.cpp @@ -27,115 +27,222 @@ MEAM::meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_ // Complete the calculation of density - for (i = 0; i < nlocal; i++) { - elti = fmap[type[i]]; - if (elti >= 0) { - scaleii = scale[type[i]][type[i]]; - rho1[i] = 0.0; - rho2[i] 
= -1.0 / 3.0 * arho2b[i] * arho2b[i]; - rho3[i] = 0.0; - for (m = 0; m < 3; m++) { - rho1[i] = rho1[i] + arho1[i][m] * arho1[i][m]; - rho3[i] = rho3[i] - 3.0 / 5.0 * arho3b[i][m] * arho3b[i][m]; - } - for (m = 0; m < 6; m++) { - rho2[i] = rho2[i] + this->v2D[m] * arho2[i][m] * arho2[i][m]; - } - for (m = 0; m < 10; m++) { - rho3[i] = rho3[i] + this->v3D[m] * arho3[i][m] * arho3[i][m]; - } - - if (rho0[i] > 0.0) { - if (this->ialloy == 1) { - t_ave[i][0] = fdiv_zero(t_ave[i][0], tsq_ave[i][0]); - t_ave[i][1] = fdiv_zero(t_ave[i][1], tsq_ave[i][1]); - t_ave[i][2] = fdiv_zero(t_ave[i][2], tsq_ave[i][2]); - } else if (this->ialloy == 2) { - t_ave[i][0] = this->t1_meam[elti]; - t_ave[i][1] = this->t2_meam[elti]; - t_ave[i][2] = this->t3_meam[elti]; - } else { - t_ave[i][0] = t_ave[i][0] / rho0[i]; - t_ave[i][1] = t_ave[i][1] / rho0[i]; - t_ave[i][2] = t_ave[i][2] / rho0[i]; + if (this->msmeamflag) { + for (i = 0; i < nlocal; i++) { + elti = fmap[type[i]]; + if (elti >= 0) { + scaleii = scale[type[i]][type[i]]; + rho1[i] = 0.0; + rho2[i] = -1.0 / 3.0 * (arho2b[i] * arho2b[i] + - arho2mb[i] * arho2mb[i]); + rho3[i] = 0.0; + for (m = 0; m < 3; m++) { + rho1[i] = rho1[i] + arho1[i][m] * arho1[i][m] + - arho1m[i][m] * arho1m[i][m]; + rho3[i] = rho3[i] - 3.0 / 5.0 * (arho3b[i][m] * arho3b[i][m] + - arho3mb[i][m] * arho3mb[i][m]); } - } - - gamma[i] = t_ave[i][0] * rho1[i] + t_ave[i][1] * rho2[i] + t_ave[i][2] * rho3[i]; - - if (rho0[i] > 0.0) { - gamma[i] = gamma[i] / (rho0[i] * rho0[i]); - } - - Z = get_Zij(this->lattce_meam[elti][elti]); - - G = G_gam(gamma[i], this->ibar_meam[elti], errorflag); - if (errorflag != 0) - return; - - get_shpfcn(this->lattce_meam[elti][elti], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shp); - - if (this->ibar_meam[elti] <= 0) { - Gbar = 1.0; - dGbar = 0.0; - } else { - if (this->mix_ref_t == 1) { - gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); - } else { - gam = (this->t1_meam[elti] * 
shp[0] + this->t2_meam[elti] * shp[1] + this->t3_meam[elti] * shp[2]) / - (Z * Z); + for (m = 0; m < 6; m++) { + rho2[i] = rho2[i] + this->v2D[m] * (arho2[i][m] * arho2[i][m] + - arho2m[i][m] * arho2m[i][m]); } - Gbar = G_gam(gam, this->ibar_meam[elti], errorflag); - } - rho[i] = rho0[i] * G; - if (this->mix_ref_t == 1) { + for (m = 0; m < 10; m++) { + rho3[i] = rho3[i] + this->v3D[m] * (arho3[i][m] * arho3[i][m] + - arho3m[i][m] * arho3m[i][m]); + } + + // all the t weights are already accounted for with msmeam + gamma[i] = rho1[i] + rho2[i] + rho3[i]; + + if (rho0[i] > 0.0) { + gamma[i] = gamma[i] / (rho0[i] * rho0[i]); + } + + Z = get_Zij(this->lattce_meam[elti][elti]); + + G = G_gam(gamma[i], this->ibar_meam[elti], errorflag); + if (errorflag != 0) + return; + + get_shpfcn(this->lattce_meam[elti][elti], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shp); + if (this->ibar_meam[elti] <= 0) { Gbar = 1.0; dGbar = 0.0; } else { - gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); - Gbar = dG_gam(gam, this->ibar_meam[elti], dGbar); + if (this->mix_ref_t == 1) { + gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); + } else { + gam = (this->t1_meam[elti] * shp[0] + this->t2_meam[elti] * shp[1] + this->t3_meam[elti] * shp[2]) / + (Z * Z); + } + Gbar = G_gam(gam, this->ibar_meam[elti], errorflag); } - rho_bkgd = this->rho0_meam[elti] * Z * Gbar; - } else { - if (this->bkgd_dyn == 1) { - rho_bkgd = this->rho0_meam[elti] * Z; + rho[i] = rho0[i] * G; + + if (this->mix_ref_t == 1) { + if (this->ibar_meam[elti] <= 0) { + Gbar = 1.0; + dGbar = 0.0; + } else { + gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); + Gbar = dG_gam(gam, this->ibar_meam[elti], dGbar); + } + rho_bkgd = this->rho0_meam[elti] * Z * Gbar; } else { - rho_bkgd = this->rho_ref_meam[elti]; + if (this->bkgd_dyn == 1) { + rho_bkgd = this->rho0_meam[elti] * Z; + } else { + rho_bkgd = 
this->rho_ref_meam[elti]; + } + } + rhob = rho[i] / rho_bkgd; + denom = 1.0 / rho_bkgd; + + G = dG_gam(gamma[i], this->ibar_meam[elti], dG); + + dgamma1[i] = (G - 2 * dG * gamma[i]) * denom; + + if (!iszero(rho0[i])) { + dgamma2[i] = (dG / rho0[i]) * denom; + } else { + dgamma2[i] = 0.0; + } + + // dgamma3 is nonzero only if we are using the "mixed" rule for + // computing t in the reference system (which is not correct, but + // included for backward compatibility + if (this->mix_ref_t == 1) { + dgamma3[i] = rho0[i] * G * dGbar / (Gbar * Z * Z) * denom; + } else { + dgamma3[i] = 0.0; + } + + Fl = embedding(this->A_meam[elti], this->Ec_meam[elti][elti], rhob, frhop[i]); + if (eflag_either != 0) { + Fl *= scaleii; + if (eflag_global != 0) { + *eng_vdwl = *eng_vdwl + Fl; + } + if (eflag_atom != 0) { + eatom[i] = eatom[i] + Fl; + } } } - rhob = rho[i] / rho_bkgd; - denom = 1.0 / rho_bkgd; - - G = dG_gam(gamma[i], this->ibar_meam[elti], dG); - - dgamma1[i] = (G - 2 * dG * gamma[i]) * denom; - - if (!iszero(rho0[i])) { - dgamma2[i] = (dG / rho0[i]) * denom; - } else { - dgamma2[i] = 0.0; - } - - // dgamma3 is nonzero only if we are using the "mixed" rule for - // computing t in the reference system (which is not correct, but - // included for backward compatibility - if (this->mix_ref_t == 1) { - dgamma3[i] = rho0[i] * G * dGbar / (Gbar * Z * Z) * denom; - } else { - dgamma3[i] = 0.0; - } - - Fl = embedding(this->A_meam[elti], this->Ec_meam[elti][elti], rhob, frhop[i]); - - if (eflag_either != 0) { - Fl *= scaleii; - if (eflag_global != 0) { - *eng_vdwl = *eng_vdwl + Fl; + } + } else { + for (i = 0; i < nlocal; i++) { + elti = fmap[type[i]]; + if (elti >= 0) { + scaleii = scale[type[i]][type[i]]; + rho1[i] = 0.0; + rho2[i] = -1.0 / 3.0 * arho2b[i] * arho2b[i]; + rho3[i] = 0.0; + for (m = 0; m < 3; m++) { + rho1[i] = rho1[i] + arho1[i][m] * arho1[i][m]; + rho3[i] = rho3[i] - 3.0 / 5.0 * arho3b[i][m] * arho3b[i][m]; } - if (eflag_atom != 0) { - eatom[i] = eatom[i] + Fl; + 
for (m = 0; m < 6; m++) { + rho2[i] = rho2[i] + this->v2D[m] * arho2[i][m] * arho2[i][m]; + } + for (m = 0; m < 10; m++) { + rho3[i] = rho3[i] + this->v3D[m] * arho3[i][m] * arho3[i][m]; + } + + if (rho0[i] > 0.0) { + if (this->ialloy == 1) { + t_ave[i][0] = fdiv_zero(t_ave[i][0], tsq_ave[i][0]); + t_ave[i][1] = fdiv_zero(t_ave[i][1], tsq_ave[i][1]); + t_ave[i][2] = fdiv_zero(t_ave[i][2], tsq_ave[i][2]); + } else if (this->ialloy == 2) { + t_ave[i][0] = this->t1_meam[elti]; + t_ave[i][1] = this->t2_meam[elti]; + t_ave[i][2] = this->t3_meam[elti]; + } else { + t_ave[i][0] = t_ave[i][0] / rho0[i]; + t_ave[i][1] = t_ave[i][1] / rho0[i]; + t_ave[i][2] = t_ave[i][2] / rho0[i]; + } + } + + gamma[i] = t_ave[i][0] * rho1[i] + t_ave[i][1] * rho2[i] + t_ave[i][2] * rho3[i]; + + if (rho0[i] > 0.0) { + gamma[i] = gamma[i] / (rho0[i] * rho0[i]); + } + + Z = get_Zij(this->lattce_meam[elti][elti]); + + G = G_gam(gamma[i], this->ibar_meam[elti], errorflag); + if (errorflag != 0) + return; + + get_shpfcn(this->lattce_meam[elti][elti], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shp); + + if (this->ibar_meam[elti] <= 0) { + Gbar = 1.0; + dGbar = 0.0; + } else { + if (this->mix_ref_t == 1) { + gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); + } else { + gam = (this->t1_meam[elti] * shp[0] + this->t2_meam[elti] * shp[1] + this->t3_meam[elti] * shp[2]) / + (Z * Z); + } + Gbar = G_gam(gam, this->ibar_meam[elti], errorflag); + } + rho[i] = rho0[i] * G; + + if (this->mix_ref_t == 1) { + if (this->ibar_meam[elti] <= 0) { + Gbar = 1.0; + dGbar = 0.0; + } else { + gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); + Gbar = dG_gam(gam, this->ibar_meam[elti], dGbar); + } + rho_bkgd = this->rho0_meam[elti] * Z * Gbar; + } else { + if (this->bkgd_dyn == 1) { + rho_bkgd = this->rho0_meam[elti] * Z; + } else { + rho_bkgd = this->rho_ref_meam[elti]; + } + } + rhob = rho[i] / rho_bkgd; + denom = 1.0 / 
rho_bkgd; + + G = dG_gam(gamma[i], this->ibar_meam[elti], dG); + + dgamma1[i] = (G - 2 * dG * gamma[i]) * denom; + + if (!iszero(rho0[i])) { + dgamma2[i] = (dG / rho0[i]) * denom; + } else { + dgamma2[i] = 0.0; + } + + // dgamma3 is nonzero only if we are using the "mixed" rule for + // computing t in the reference system (which is not correct, but + // included for backward compatibility + if (this->mix_ref_t == 1) { + dgamma3[i] = rho0[i] * G * dGbar / (Gbar * Z * Z) * denom; + } else { + dgamma3[i] = 0.0; + } + + Fl = embedding(this->A_meam[elti], this->Ec_meam[elti][elti], rhob, frhop[i]); + + if (eflag_either != 0) { + Fl *= scaleii; + if (eflag_global != 0) { + *eng_vdwl = *eng_vdwl + Fl; + } + if (eflag_atom != 0) { + eatom[i] = eatom[i] + Fl; + + } } } } diff --git a/src/MEAM/meam_dens_init.cpp b/src/MEAM/meam_dens_init.cpp index b60e1a7a17..00ad276ad7 100644 --- a/src/MEAM/meam_dens_init.cpp +++ b/src/MEAM/meam_dens_init.cpp @@ -45,6 +45,14 @@ MEAM::meam_dens_setup(int atom_nmax, int nall, int n_neigh) memory->destroy(arho3b); memory->destroy(t_ave); memory->destroy(tsq_ave); + // msmeam params + if (this->msmeamflag) { + memory->destroy(arho1m); + memory->destroy(arho2m); + memory->destroy(arho3m); + memory->destroy(arho2mb); + memory->destroy(arho3mb); + } nmax = atom_nmax; @@ -65,6 +73,14 @@ MEAM::meam_dens_setup(int atom_nmax, int nall, int n_neigh) memory->create(arho3b, nmax, 3, "pair:arho3b"); memory->create(t_ave, nmax, 3, "pair:t_ave"); memory->create(tsq_ave, nmax, 3, "pair:tsq_ave"); + // msmeam params + if (this->msmeamflag) { + memory->create(arho1m, nmax, 3, "pair:arho1m"); + memory->create(arho2m, nmax, 6, "pair:arho2m"); + memory->create(arho3m, nmax, 10, "pair:arho3m"); + memory->create(arho2mb, nmax, "pair:arho2mb"); + memory->create(arho3mb, nmax, 3, "pair:arho3mb"); + } } if (n_neigh > maxneigh) { @@ -83,14 +99,30 @@ MEAM::meam_dens_setup(int atom_nmax, int nall, int n_neigh) rho0[i] = 0.0; arho2b[i] = 0.0; arho1[i][0] = arho1[i][1] = 
arho1[i][2] = 0.0; - for (j = 0; j < 6; j++) + if (this->msmeamflag) { + arho2mb[i] = 0.0; + arho1m[i][0] = arho1m[i][1] = arho1m[i][2] = 0.0; + } + for (j = 0; j < 6; j++) { arho2[i][j] = 0.0; - for (j = 0; j < 10; j++) + if (this->msmeamflag) { + arho2m[i][j] = 0.0; + } + } + for (j = 0; j < 10; j++) { arho3[i][j] = 0.0; + if (this->msmeamflag) { + arho3m[i][j] = 0.0; + } + } arho3b[i][0] = arho3b[i][1] = arho3b[i][2] = 0.0; + if (this->msmeamflag) { + arho3mb[i][0] = arho3mb[i][1] = arho3mb[i][2] = 0.0; + } t_ave[i][0] = t_ave[i][1] = t_ave[i][2] = 0.0; tsq_ave[i][0] = tsq_ave[i][1] = tsq_ave[i][2] = 0.0; } + } void @@ -282,6 +314,9 @@ MEAM::calc_rho1(int i, int /*ntype*/, int* type, int* fmap, double** x, int numn // double G,Gbar,gam,shp[3+1]; double ro0i, ro0j; double rhoa0i, rhoa1i, rhoa2i, rhoa3i, A1i, A2i, A3i; + // msmeam params + double rhoa1mj, rhoa2mj, rhoa3mj, A1mj, A2mj, A3mj; + double rhoa1mi, rhoa2mi, rhoa3mi, A1mi, A2mi, A3mi; elti = fmap[type[i]]; xtmp = x[i][0]; @@ -306,10 +341,20 @@ MEAM::calc_rho1(int i, int /*ntype*/, int* type, int* fmap, double** x, int numn rhoa1j = ro0j * MathSpecial::fm_exp(-this->beta1_meam[eltj] * aj) * sij; rhoa2j = ro0j * MathSpecial::fm_exp(-this->beta2_meam[eltj] * aj) * sij; rhoa3j = ro0j * MathSpecial::fm_exp(-this->beta3_meam[eltj] * aj) * sij; + if (this->msmeamflag){ + rhoa1mj = ro0j * this->t1m_meam[eltj] * MathSpecial::fm_exp(-this->beta1m_meam[eltj] * aj) * sij; + rhoa2mj = ro0j * this->t2m_meam[eltj] * MathSpecial::fm_exp(-this->beta2m_meam[eltj] * aj) * sij; + rhoa3mj = ro0j * this->t3m_meam[eltj] * MathSpecial::fm_exp(-this->beta3m_meam[eltj] * aj) * sij; + } rhoa0i = ro0i * MathSpecial::fm_exp(-this->beta0_meam[elti] * ai) * sij; rhoa1i = ro0i * MathSpecial::fm_exp(-this->beta1_meam[elti] * ai) * sij; rhoa2i = ro0i * MathSpecial::fm_exp(-this->beta2_meam[elti] * ai) * sij; rhoa3i = ro0i * MathSpecial::fm_exp(-this->beta3_meam[elti] * ai) * sij; + if (this->msmeamflag){ + rhoa1mi = ro0i * 
this->t1m_meam[elti] * MathSpecial::fm_exp(-this->beta1m_meam[elti] * ai) * sij; + rhoa2mi = ro0i * this->t2m_meam[elti] * MathSpecial::fm_exp(-this->beta2m_meam[elti] * ai) * sij; + rhoa3mi = ro0i * this->t3m_meam[elti] * MathSpecial::fm_exp(-this->beta3m_meam[elti] * ai) * sij; + } if (this->ialloy == 1) { rhoa1j = rhoa1j * this->t1_meam[eltj]; rhoa2j = rhoa2j * this->t2_meam[eltj]; @@ -321,6 +366,7 @@ MEAM::calc_rho1(int i, int /*ntype*/, int* type, int* fmap, double** x, int numn rho0[i] = rho0[i] + rhoa0j; rho0[j] = rho0[j] + rhoa0i; // For ialloy = 2, use single-element value (not average) + // For ialloy = 2, use single-element value (not average) if (this->ialloy != 2) { t_ave[i][0] = t_ave[i][0] + this->t1_meam[eltj] * rhoa0j; t_ave[i][1] = t_ave[i][1] + this->t2_meam[eltj] * rhoa0j; @@ -348,18 +394,42 @@ MEAM::calc_rho1(int i, int /*ntype*/, int* type, int* fmap, double** x, int numn A3i = rhoa3i / (rij2 * rij); nv2 = 0; nv3 = 0; + if (this->msmeamflag) { + arho2mb[i] = arho2mb[i] + rhoa2mj; + arho2mb[j] = arho2mb[j] + rhoa2mi; + A1mj = rhoa1mj/rij; + A2mj = rhoa2mj/rij2; + A3mj = rhoa3mj/(rij2*rij); + A1mi = rhoa1mi/rij; + A2mi = rhoa2mi/rij2; + A3mi = rhoa3mi/(rij2*rij); + } for (m = 0; m < 3; m++) { arho1[i][m] = arho1[i][m] + A1j * delij[m]; arho1[j][m] = arho1[j][m] - A1i * delij[m]; arho3b[i][m] = arho3b[i][m] + rhoa3j * delij[m] / rij; arho3b[j][m] = arho3b[j][m] - rhoa3i * delij[m] / rij; + if (this->msmeamflag) { + arho1m[i][m] = arho1m[i][m] + A1mj*delij[m]; + arho1m[j][m] = arho1m[j][m] - A1mi*delij[m]; + arho3mb[i][m] = arho3mb[i][m] + rhoa3mj*delij[m] / rij; + arho3mb[j][m] = arho3mb[j][m] - rhoa3mi*delij[m] / rij; + } for (n = m; n < 3; n++) { arho2[i][nv2] = arho2[i][nv2] + A2j * delij[m] * delij[n]; arho2[j][nv2] = arho2[j][nv2] + A2i * delij[m] * delij[n]; + if (this->msmeamflag) { + arho2m[i][nv2] = arho2m[i][nv2] + A2mj*delij[m] * delij[n]; + arho2m[j][nv2] = arho2m[j][nv2] + A2mi*delij[m] * delij[n]; + } nv2 = nv2 + 1; for (p = n; p < 
3; p++) { arho3[i][nv3] = arho3[i][nv3] + A3j * delij[m] * delij[n] * delij[p]; arho3[j][nv3] = arho3[j][nv3] - A3i * delij[m] * delij[n] * delij[p]; + if (this->msmeamflag) { + arho3m[i][nv3] = arho3m[i][nv3] + A3mj*delij[m]*delij[n]*delij[p]; + arho3m[j][nv3] = arho3m[j][nv3] - A3mi*delij[m]*delij[n]*delij[p]; + } nv3 = nv3 + 1; } } diff --git a/src/MEAM/meam_force.cpp b/src/MEAM/meam_force.cpp index acc3d5672a..4bc7380898 100644 --- a/src/MEAM/meam_force.cpp +++ b/src/MEAM/meam_force.cpp @@ -61,6 +61,17 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int double t1i, t2i, t3i, t1j, t2j, t3j; double scaleij; + double rhoa1mj,drhoa1mj,rhoa1mi,drhoa1mi; + double rhoa2mj,drhoa2mj,rhoa2mi,drhoa2mi; + double rhoa3mj, drhoa3mj, rhoa3mi, drhoa3mi; + double arg1i1m, arg1j1m, arg1i2m, arg1j2m, arg1i3m, arg1j3m, arg3i3m, arg3j3m; + double drho1mdr1, drho1mdr2, drho1mds1, drho1mds2; + double drho1mdrm1[3], drho1mdrm2[3]; + double drho2mdr1, drho2mdr2, drho2mds1, drho2mds2; + double drho2mdrm1[3], drho2mdrm2[3]; + double drho3mdr1, drho3mdr2, drho3mds1, drho3mds2; + double drho3mdrm1[3], drho3mdrm2[3]; + third = 1.0 / 3.0; sixth = 1.0 / 6.0; @@ -74,6 +85,7 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int zitmp = x[i][2]; // Treat each pair + for (jn = 0; jn < numneigh; jn++) { j = firstneigh[jn]; eltj = fmap[type[j]]; @@ -89,7 +101,6 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int if (rij2 < this->cutforcesq) { rij = sqrt(rij2); recip = 1.0 / rij; - // Compute phi and phip ind = this->eltind[elti][eltj]; pp = rij * this->rdrar; @@ -114,6 +125,7 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int // write(1,*) "force_meamf: phip: ",phip // Compute pair densities and derivatives + invrei = 1.0 / this->re_meam[elti][elti]; ai = rij * invrei - 1.0; ro0i = this->rho0_meam[elti]; @@ -126,6 +138,15 @@ MEAM::meam_force(int i, int eflag_global, int 
eflag_atom, int vflag_global, int rhoa3i = ro0i * MathSpecial::fm_exp(-this->beta3_meam[elti] * ai); drhoa3i = -this->beta3_meam[elti] * invrei * rhoa3i; + if (this->msmeamflag) { + rhoa1mi = ro0i * MathSpecial::fm_exp(-this->beta1m_meam[elti] * ai) * t1m_meam[elti]; + drhoa1mi = -this->beta1m_meam[elti] * invrei * rhoa1mi; + rhoa2mi = ro0i * MathSpecial::fm_exp(-this->beta2m_meam[elti] * ai) * t2m_meam[elti]; + drhoa2mi = -this->beta2m_meam[elti] * invrei * rhoa2mi; + rhoa3mi = ro0i * MathSpecial::fm_exp(-this->beta3m_meam[elti] * ai) * t3m_meam[elti]; + drhoa3mi = -this->beta3m_meam[elti] * invrei * rhoa3mi; + } + if (elti != eltj) { invrej = 1.0 / this->re_meam[eltj][eltj]; aj = rij * invrej - 1.0; @@ -138,6 +159,16 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int drhoa2j = -this->beta2_meam[eltj] * invrej * rhoa2j; rhoa3j = ro0j * MathSpecial::fm_exp(-this->beta3_meam[eltj] * aj); drhoa3j = -this->beta3_meam[eltj] * invrej * rhoa3j; + + if (this->msmeamflag) { + rhoa1mj = ro0j * t1m_meam[eltj] * MathSpecial::fm_exp(-this->beta1m_meam[eltj] * aj); + drhoa1mj = -this->beta1m_meam[eltj] * invrej * rhoa1mj; + rhoa2mj = ro0j * t2m_meam[eltj] * MathSpecial::fm_exp(-this->beta2m_meam[eltj] * aj); + drhoa2mj = -this->beta2m_meam[eltj] * invrej * rhoa2mj; + rhoa3mj = ro0j * t3m_meam[eltj] * MathSpecial::fm_exp(-this->beta3m_meam[eltj] * aj); + drhoa3mj = -this->beta3m_meam[eltj] * invrej * rhoa3mj; + } + } else { rhoa0j = rhoa0i; drhoa0j = drhoa0i; @@ -147,6 +178,15 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int drhoa2j = drhoa2i; rhoa3j = rhoa3i; drhoa3j = drhoa3i; + + if (this->msmeamflag) { + rhoa1mj = rhoa1mi; + drhoa1mj = drhoa1mi; + rhoa2mj = rhoa2mi; + drhoa2mj = drhoa2mi; + rhoa3mj = rhoa3mi; + drhoa3mj = drhoa3mi; + } } const double t1mi = this->t1_meam[elti]; @@ -156,7 +196,10 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int const double t2mj = 
this->t2_meam[eltj]; const double t3mj = this->t3_meam[eltj]; - if (this->ialloy == 1) { + // ialloy mod not needed in MS-MEAM, but similarity here is that we multiply rhos by t. + // We did this above with rhoa1mj, rhoa2mj, etc. + + if (this->ialloy == 1 || this->msmeamflag) { rhoa1j *= t1mj; rhoa2j *= t2mj; rhoa3j *= t3mj; @@ -200,6 +243,39 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int arg3j3 = arg3j3 - arho3b[j][n] * delij[n]; } + + // msmeam arhom args + + nv2 = 0; + nv3 = 0; + arg1i1m = 0.0; + arg1j1m = 0.0; + arg1i2m = 0.0; + arg1j2m = 0.0; + arg1i3m = 0.0; + arg1j3m = 0.0; + arg3i3m = 0.0; + arg3j3m = 0.0; + if (this->msmeamflag) { + for (n = 0; n < 3; n++) { + for (p = n; p < 3; p++) { + for (q = p; q < 3; q++) { + arg = delij[n] * delij[p] * delij[q] * this->v3D[nv3]; + arg1i3m = arg1i3m - arho3m[i][nv3] * arg; + arg1j3m = arg1j3m + arho3m[j][nv3] * arg; + nv3 = nv3 + 1; + } + arg = delij[n] * delij[p] * this->v2D[nv2]; + arg1i2m = arg1i2m + arho2m[i][nv2] * arg; + arg1j2m = arg1j2m + arho2m[j][nv2] * arg; + nv2 = nv2 + 1; + } + arg1i1m = arg1i1m - arho1m[i][n] * delij[n]; + arg1j1m = arg1j1m + arho1m[j][n] * delij[n]; + arg3i3m = arg3i3m - arho3mb[i][n] * delij[n]; + arg3j3m = arg3j3m + arho3mb[j][n] * delij[n]; + } + } + // rho0 terms drho0dr1 = drhoa0j * sij; drho0dr2 = drhoa0i * sij; @@ -254,32 +330,83 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int drho3drm2[m] = (-a3 * drho3drm2[m] + a3a * arho3b[j][m]) * rhoa3i; } - // Compute derivatives of weighting functions t wrt rij - t1i = t_ave[i][0]; - t2i = t_ave[i][1]; - t3i = t_ave[i][2]; - t1j = t_ave[j][0]; - t2j = t_ave[j][1]; - t3j = t_ave[j][2]; + if (this->msmeamflag) { + // rho1m terms + a1 = 2 * sij / rij; + drho1mdr1 = a1 * (drhoa1mj - rhoa1mj / rij) * arg1i1m; + drho1mdr2 = a1 * (drhoa1mi - rhoa1mi / rij) * arg1j1m; + drho1mdr1 *= -1.0; + drho1mdr2 *= -1.0; + a1 = 2.0 * sij / rij; + for (m = 0; m < 3; m++) { + drho1mdrm1[m] = 
a1 * rhoa1mj * arho1m[i][m]; + drho1mdrm2[m] = -a1 * rhoa1mi * arho1m[j][m]; + } - if (this->ialloy == 1) { + // rho2m terms + a2 = 2 * sij / rij2; + drho2mdr1 = a2 * (drhoa2mj - 2 * rhoa2mj / rij) * arg1i2m - 2.0 / 3.0 * arho2mb[i] * drhoa2mj * sij; + drho2mdr2 = a2 * (drhoa2mi - 2 * rhoa2mi / rij) * arg1j2m - 2.0 / 3.0 * arho2mb[j] * drhoa2mi * sij; + a2 = 4 * sij / rij2; + for (m = 0; m < 3; m++) { + drho2mdrm1[m] = 0.0; + drho2mdrm2[m] = 0.0; + for (n = 0; n < 3; n++) { + drho2mdrm1[m] += arho2m[i][this->vind2D[m][n]] * delij[n]; + drho2mdrm2[m] -= arho2m[j][this->vind2D[m][n]] * delij[n]; + } + drho2mdrm1[m] = a2 * rhoa2mj * drho2mdrm1[m]; + drho2mdrm2[m] = -a2 * rhoa2mi * drho2mdrm2[m]; + } - a1i = fdiv_zero(drhoa0j * sij, tsq_ave[i][0]); - a1j = fdiv_zero(drhoa0i * sij, tsq_ave[j][0]); - a2i = fdiv_zero(drhoa0j * sij, tsq_ave[i][1]); - a2j = fdiv_zero(drhoa0i * sij, tsq_ave[j][1]); - a3i = fdiv_zero(drhoa0j * sij, tsq_ave[i][2]); - a3j = fdiv_zero(drhoa0i * sij, tsq_ave[j][2]); + // rho3m terms + rij3 = rij * rij2; + a3 = 2 * sij / rij3; + a3a = 6.0 / 5.0 * sij / rij; + drho3mdr1 = a3 * (drhoa3mj - 3 * rhoa3mj / rij) * arg1i3m - a3a * (drhoa3mj - rhoa3mj / rij) * arg3i3m; + drho3mdr2 = a3 * (drhoa3mi - 3 * rhoa3mi / rij) * arg1j3m - a3a * (drhoa3mi - rhoa3mi / rij) * arg3j3m; + drho3mdr1 *= -1.0; + drho3mdr2 *= -1.0; - dt1dr1 = a1i * (t1mj - t1i * MathSpecial::square(t1mj)); - dt1dr2 = a1j * (t1mi - t1j * MathSpecial::square(t1mi)); - dt2dr1 = a2i * (t2mj - t2i * MathSpecial::square(t2mj)); - dt2dr2 = a2j * (t2mi - t2j * MathSpecial::square(t2mi)); - dt3dr1 = a3i * (t3mj - t3i * MathSpecial::square(t3mj)); - dt3dr2 = a3j * (t3mi - t3j * MathSpecial::square(t3mi)); + a3 = 6 * sij / rij3; + a3a = 6 * sij / (5 * rij); + for (m = 0; m < 3; m++) { + drho3mdrm1[m] = 0.0; + drho3mdrm2[m] = 0.0; + nv2 = 0; + for (n = 0; n < 3; n++) { + for (p = n; p < 3; p++) { + arg = delij[n] * delij[p] * this->v2D[nv2]; + drho3mdrm1[m] += arho3m[i][this->vind3D[m][n][p]] * arg; + 
drho3mdrm2[m] += arho3m[j][this->vind3D[m][n][p]] * arg; + nv2 = nv2 + 1; + } + } + drho3mdrm1[m] = (a3 * drho3mdrm1[m] - a3a * arho3mb[i][m]) * rhoa3mj; + drho3mdrm2[m] = (-a3 * drho3mdrm2[m] + a3a * arho3mb[j][m]) * rhoa3mi; + } + } else { + for (m = 0; m < 3; m++) { + drho1mdrm1[m] = 0.0; + drho1mdrm2[m] = 0.0; + drho2mdrm1[m] = 0.0; + drho2mdrm2[m] = 0.0; + drho3mdrm1[m] = 0.0; + drho3mdrm2[m] = 0.0; + } + } - } else if (this->ialloy == 2) { + // compute derivatives of weighting functions t wrt rij + // weighting functions t set to unity for MS-MEAM + if (this->msmeamflag) { + + t1i = 1.0; + t2i = 1.0; + t3i = 1.0; + t1j = 1.0; + t2j = 1.0; + t3j = 1.0; dt1dr1 = 0.0; dt1dr2 = 0.0; dt2dr1 = 0.0; @@ -289,38 +416,98 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int } else { - ai = 0.0; - if (!iszero(rho0[i])) - ai = drhoa0j * sij / rho0[i]; - aj = 0.0; - if (!iszero(rho0[j])) - aj = drhoa0i * sij / rho0[j]; + t1i = t_ave[i][0]; + t2i = t_ave[i][1]; + t3i = t_ave[i][2]; + t1j = t_ave[j][0]; + t2j = t_ave[j][1]; + t3j = t_ave[j][2]; + + if (this->ialloy == 1) { + + a1i = fdiv_zero(drhoa0j * sij, tsq_ave[i][0]); + a1j = fdiv_zero(drhoa0i * sij, tsq_ave[j][0]); + a2i = fdiv_zero(drhoa0j * sij, tsq_ave[i][1]); + a2j = fdiv_zero(drhoa0i * sij, tsq_ave[j][1]); + a3i = fdiv_zero(drhoa0j * sij, tsq_ave[i][2]); + a3j = fdiv_zero(drhoa0i * sij, tsq_ave[j][2]); + + dt1dr1 = a1i * (t1mj - t1i * MathSpecial::square(t1mj)); + dt1dr2 = a1j * (t1mi - t1j * MathSpecial::square(t1mi)); + dt2dr1 = a2i * (t2mj - t2i * MathSpecial::square(t2mj)); + dt2dr2 = a2j * (t2mi - t2j * MathSpecial::square(t2mi)); + dt3dr1 = a3i * (t3mj - t3i * MathSpecial::square(t3mj)); + dt3dr2 = a3j * (t3mi - t3j * MathSpecial::square(t3mi)); + + } else if (this->ialloy == 2) { + + dt1dr1 = 0.0; + dt1dr2 = 0.0; + dt2dr1 = 0.0; + dt2dr2 = 0.0; + dt3dr1 = 0.0; + dt3dr2 = 0.0; + + } else { + + ai = 0.0; + if (!iszero(rho0[i])) + ai = drhoa0j * sij / rho0[i]; + aj = 0.0; + if 
(!iszero(rho0[j])) + aj = drhoa0i * sij / rho0[j]; + + dt1dr1 = ai * (t1mj - t1i); + dt1dr2 = aj * (t1mi - t1j); + dt2dr1 = ai * (t2mj - t2i); + dt2dr2 = aj * (t2mi - t2j); + dt3dr1 = ai * (t3mj - t3i); + dt3dr2 = aj * (t3mi - t3j); + } - dt1dr1 = ai * (t1mj - t1i); - dt1dr2 = aj * (t1mi - t1j); - dt2dr1 = ai * (t2mj - t2i); - dt2dr2 = aj * (t2mi - t2j); - dt3dr1 = ai * (t3mj - t3i); - dt3dr2 = aj * (t3mi - t3j); } // Compute derivatives of total density wrt rij, sij and rij(3) get_shpfcn(this->lattce_meam[elti][elti], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shpi); get_shpfcn(this->lattce_meam[eltj][eltj], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shpj); - drhodr1 = dgamma1[i] * drho0dr1 + - dgamma2[i] * (dt1dr1 * rho1[i] + t1i * drho1dr1 + dt2dr1 * rho2[i] + t2i * drho2dr1 + - dt3dr1 * rho3[i] + t3i * drho3dr1) - - dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); - drhodr2 = dgamma1[j] * drho0dr2 + - dgamma2[j] * (dt1dr2 * rho1[j] + t1j * drho1dr2 + dt2dr2 * rho2[j] + t2j * drho2dr2 + - dt3dr2 * rho3[j] + t3j * drho3dr2) - - dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); - for (m = 0; m < 3; m++) { - drhodrm1[m] = 0.0; - drhodrm2[m] = 0.0; - drhodrm1[m] = dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); - drhodrm2[m] = dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + if (this->msmeamflag) { + drhodr1 = dgamma1[i] * drho0dr1 + + dgamma2[i] * (dt1dr1 * rho1[i] + t1i * (drho1dr1 - drho1mdr1) + + dt2dr1 * rho2[i] + t2i * (drho2dr1 - drho2mdr1) + + dt3dr1 * rho3[i] + t3i * (drho3dr1 - drho3mdr1)) - + dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); + drhodr2 = dgamma1[j] * drho0dr2 + + dgamma2[j] * (dt1dr2 * rho1[j] + t1j * (drho1dr2 - drho1mdr2) + + dt2dr2 * rho2[j] + t2j * (drho2dr2 - drho2mdr2) + + dt3dr2 * rho3[j] + t3j * (drho3dr2 - drho3mdr2)) - + dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + 
shpj[2] * dt3dr2); + for (m = 0; m < 3; m++) { + drhodrm1[m] = 0.0; + drhodrm2[m] = 0.0; + drhodrm1[m] = dgamma2[i] * (t1i * (drho1drm1[m] - drho1mdrm1[m]) + + t2i * (drho2drm1[m] - drho2mdrm1[m]) + + t3i * (drho3drm1[m] - drho3mdrm1[m]) ); + drhodrm2[m] = dgamma2[j] * (t1j * (drho1drm2[m] - drho1mdrm2[m]) + + t2j * (drho2drm2[m] - drho2mdrm2[m]) + + t3j * (drho3drm2[m] - drho3mdrm2[m]) ); + } + } else { + + drhodr1 = dgamma1[i] * drho0dr1 + + dgamma2[i] * (dt1dr1 * rho1[i] + t1i * drho1dr1 + dt2dr1 * rho2[i] + t2i * drho2dr1 + + dt3dr1 * rho3[i] + t3i * drho3dr1) - + dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); + drhodr2 = dgamma1[j] * drho0dr2 + + dgamma2[j] * (dt1dr2 * rho1[j] + t1j * drho1dr2 + dt2dr2 * rho2[j] + t2j * drho2dr2 + + dt3dr2 * rho3[j] + t3j * drho3dr2) - + dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); + for (m = 0; m < 3; m++) { + drhodrm1[m] = 0.0; + drhodrm2[m] = 0.0; + drhodrm1[m] = dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); + drhodrm2[m] = dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + } } // Compute derivatives wrt sij, but only if necessary @@ -328,17 +515,37 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int drho0ds1 = rhoa0j; drho0ds2 = rhoa0i; a1 = 2.0 / rij; - drho1ds1 = a1 * rhoa1j * arg1i1; - drho1ds2 = a1 * rhoa1i * arg1j1; a2 = 2.0 / rij2; - drho2ds1 = a2 * rhoa2j * arg1i2 - 2.0 / 3.0 * arho2b[i] * rhoa2j; - drho2ds2 = a2 * rhoa2i * arg1j2 - 2.0 / 3.0 * arho2b[j] * rhoa2i; a3 = 2.0 / rij3; a3a = 6.0 / (5.0 * rij); + + drho1ds1 = a1 * rhoa1j * arg1i1; + drho1ds2 = a1 * rhoa1i * arg1j1; + drho2ds1 = a2 * rhoa2j * arg1i2 - 2.0 / 3.0 * arho2b[i] * rhoa2j; + drho2ds2 = a2 * rhoa2i * arg1j2 - 2.0 / 3.0 * arho2b[j] * rhoa2i; drho3ds1 = a3 * rhoa3j * arg1i3 - a3a * rhoa3j * arg3i3; drho3ds2 = a3 * rhoa3i * arg1j3 - a3a * rhoa3i * arg3j3; + if (this->msmeamflag) { + drho1mds1 = a1 * rhoa1mj * 
arg1i1m; + drho1mds2 = a1 * rhoa1mi * arg1j1m; + drho2mds1 = a2 * rhoa2mj * arg1i2m - 2.0 / 3.0 * arho2mb[i] * rhoa2mj; + drho2mds2 = a2 * rhoa2mi * arg1j2m - 2.0 / 3.0 * arho2mb[j] * rhoa2mi; + drho3mds1 = a3 * rhoa3mj * arg1i3m - a3a * rhoa3mj * arg3i3m; + drho3mds2 = a3 * rhoa3mi * arg1j3m - a3a * rhoa3mi * arg3j3m; + drho3mds1 *= -1; + drho3mds2 *= -1; + } else { + drho1mds1 = 0.0; + drho1mds2 = 0.0; + drho2mds1 = 0.0; + drho2mds2 = 0.0; + drho3mds1 = 0.0; + drho3mds2 = 0.0; + } + if (this->ialloy == 1) { + a1i = fdiv_zero(rhoa0j, tsq_ave[i][0]); a1j = fdiv_zero(rhoa0i, tsq_ave[j][0]); a2i = fdiv_zero(rhoa0j, tsq_ave[i][1]); @@ -379,19 +586,36 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int dt3ds2 = aj * (t3mi - t3j); } - drhods1 = dgamma1[i] * drho0ds1 + - dgamma2[i] * (dt1ds1 * rho1[i] + t1i * drho1ds1 + dt2ds1 * rho2[i] + t2i * drho2ds1 + - dt3ds1 * rho3[i] + t3i * drho3ds1) - - dgamma3[i] * (shpi[0] * dt1ds1 + shpi[1] * dt2ds1 + shpi[2] * dt3ds1); - drhods2 = dgamma1[j] * drho0ds2 + - dgamma2[j] * (dt1ds2 * rho1[j] + t1j * drho1ds2 + dt2ds2 * rho2[j] + t2j * drho2ds2 + - dt3ds2 * rho3[j] + t3j * drho3ds2) - - dgamma3[j] * (shpj[0] * dt1ds2 + shpj[1] * dt2ds2 + shpj[2] * dt3ds2); + if (this->msmeamflag) { + drhods1 = dgamma1[i] * drho0ds1 + + dgamma2[i] * (dt1ds1 * rho1[i] + t1i * (drho1ds1 - drho1mds1) + + dt2ds1 * rho2[i] + t2i * (drho2ds1 - drho2mds1) + + dt3ds1 * rho3[i] + t3i * (drho3ds1 - drho3mds1)) - + dgamma3[i] * (shpi[0] * dt1ds1 + shpi[1] * dt2ds1 + shpi[2] * dt3ds1); + drhods2 = dgamma1[j] * drho0ds2 + + dgamma2[j] * (dt1ds2 * rho1[j] + t1j * (drho1ds2 - drho1mds2) + + dt2ds2 * rho2[j] + t2j * (drho2ds2 - drho2mds2) + + dt3ds2 * rho3[j] + t3j * (drho3ds2 - drho3mds2)) - + dgamma3[j] * (shpj[0] * dt1ds2 + shpj[1] * dt2ds2 + shpj[2] * dt3ds2); + } + else { + drhods1 = dgamma1[i] * drho0ds1 + + dgamma2[i] * (dt1ds1 * rho1[i] + t1i * drho1ds1 + dt2ds1 * rho2[i] + t2i * drho2ds1 + + dt3ds1 * rho3[i] + t3i * 
drho3ds1) - + dgamma3[i] * (shpi[0] * dt1ds1 + shpi[1] * dt2ds1 + shpi[2] * dt3ds1); + drhods2 = dgamma1[j] * drho0ds2 + + dgamma2[j] * (dt1ds2 * rho1[j] + t1j * drho1ds2 + dt2ds2 * rho2[j] + t2j * drho2ds2 + + dt3ds2 * rho3[j] + t3j * drho3ds2) - + dgamma3[j] * (shpj[0] * dt1ds2 + shpj[1] * dt2ds2 + shpj[2] * dt3ds2); + } } - // Compute derivatives of energy wrt rij, sij and rij[3] + // Compute derivatives of energy wrt rij, sij and rij[3] + // MS-MEAM affects phip + dUdrij = phip * sij + frhop[i] * drhodr1 + frhop[j] * drhodr2; dUdsij = 0.0; + if (!iszero(dscrfcn[fnoffset + jn])) { dUdsij = phi + frhop[i] * drhods1 + frhop[j] * drhods2; } diff --git a/src/MEAM/meam_impl.cpp b/src/MEAM/meam_impl.cpp index bbfb83e94a..5290647b18 100644 --- a/src/MEAM/meam_impl.cpp +++ b/src/MEAM/meam_impl.cpp @@ -34,6 +34,11 @@ MEAM::MEAM(Memory* mem) gamma = dgamma1 = dgamma2 = dgamma3 = arho2b = nullptr; arho1 = arho2 = arho3 = arho3b = t_ave = tsq_ave = nullptr; + // msmeam arrays + msmeamflag = 0; + arho2mb = nullptr; + arho1m = arho2m = arho3m = arho3mb = nullptr; + maxneigh = 0; scrfcn = dscrfcn = fcpair = nullptr; copymode = 0; @@ -43,7 +48,9 @@ MEAM::MEAM(Memory* mem) A_meam[i] = rho0_meam[i] = beta0_meam[i] = beta1_meam[i]= beta2_meam[i] = beta3_meam[i] = t0_meam[i] = t1_meam[i] = t2_meam[i] = t3_meam[i] = - rho_ref_meam[i] = ibar_meam[i] = ielt_meam[i] = 0.0; + rho_ref_meam[i] = ibar_meam[i] = ielt_meam[i] = + t1m_meam[i] = t2m_meam[i] = t3m_meam[i] = + beta1m_meam[i] = beta2m_meam[i] = beta3m_meam[i] = 0.0; for (int j = 0; j < maxelt; j++) { lattce_meam[i][j] = FCC; Ec_meam[i][j] = re_meam[i][j] = alpha_meam[i][j] = delta_meam[i][j] = ebound_meam[i][j] = attrac_meam[i][j] = repuls_meam[i][j] = 0.0; @@ -87,4 +94,13 @@ MEAM::~MEAM() memory->destroy(this->scrfcn); memory->destroy(this->dscrfcn); memory->destroy(this->fcpair); + + // msmeam + if (this->msmeamflag){ + memory->destroy(this->arho1m); + memory->destroy(this->arho2m); + memory->destroy(this->arho3m); + 
memory->destroy(this->arho2mb); + memory->destroy(this->arho3mb); + } } diff --git a/src/MEAM/meam_setup_done.cpp b/src/MEAM/meam_setup_done.cpp index 93f2552465..de1188349c 100644 --- a/src/MEAM/meam_setup_done.cpp +++ b/src/MEAM/meam_setup_done.cpp @@ -220,7 +220,6 @@ void MEAM::compute_pair_meam() // loop over r values and compute for (j = 0; j < this->nr; j++) { r = j * this->dr; - this->phir[nv2][j] = phi_meam(r, a, b); // if using second-nearest neighbor, solve recursive problem @@ -333,9 +332,12 @@ double MEAM::phi_meam(double r, int a, int b) lattice_t latta /*unused:,lattb*/; double rho_bkgd1, rho_bkgd2; double b11s, b22s; + // msmeam + double t1m1av, t2m1av, t3m1av, t1m2av, t2m2av, t3m2av; + double rho1m1, rho2m1, rho3m1; + double rho1m2, rho2m2, rho3m2; double phi_m = 0.0; - // Equation numbers below refer to: // I. Huang et.al., Modelling simul. Mater. Sci. Eng. 3:615 @@ -345,8 +347,16 @@ double MEAM::phi_meam(double r, int a, int b) Z2 = get_Zij(this->lattce_meam[b][b]); Z12 = get_Zij(this->lattce_meam[a][b]); - get_densref(r, a, b, &rho01, &rho11, &rho21, &rho31, &rho02, &rho12, &rho22, &rho32); - + // this function has extra args for msmeam + if (this->msmeamflag) { + get_densref(r, a, b, &rho01, &rho11, &rho21, &rho31, &rho02, &rho12, &rho22, &rho32, + &rho1m1, &rho2m1, &rho3m1, + &rho1m2, &rho2m2, &rho3m2); + } else { + get_densref(r, a, b, &rho01, &rho11, &rho21, &rho31, &rho02, &rho12, &rho22, &rho32, + nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr); + } // if densities are too small, numerical problems may result; just return zero if (rho01 <= 1e-14 && rho02 <= 1e-14) return 0.0; @@ -374,6 +384,12 @@ double MEAM::phi_meam(double r, int a, int b) get_tavref(&t11av, &t21av, &t31av, &t12av, &t22av, &t32av, this->t1_meam[a], this->t2_meam[a], this->t3_meam[a], this->t1_meam[b], this->t2_meam[b], this->t3_meam[b], r, a, b, this->lattce_meam[a][b]); + // with msmeam call twice with different sets of variables + if (this->msmeamflag) { + 
get_tavref(&t1m1av, &t2m1av, &t3m1av, &t1m2av, &t2m2av, &t3m2av, this->t1m_meam[a], this->t2m_meam[a], + this->t3m_meam[a], this->t1m_meam[b], this->t2m_meam[b], this->t3m_meam[b], r, a, b, + this->lattce_meam[a][b]); + } } // for c11b structure, calculate background electron densities @@ -420,17 +436,33 @@ double MEAM::phi_meam(double r, int a, int b) rho0_1 = this->rho0_meam[a] * Z1 * G1; rho0_2 = this->rho0_meam[b] * Z2 * G2; } - Gam1 = (t11av * rho11 + t21av * rho21 + t31av * rho31); - if (rho01 < 1.0e-14) - Gam1 = 0.0; - else - Gam1 = Gam1 / (rho01 * rho01); - Gam2 = (t12av * rho12 + t22av * rho22 + t32av * rho32); - if (rho02 < 1.0e-14) - Gam2 = 0.0; - else - Gam2 = Gam2 / (rho02 * rho02); + if (this->msmeamflag) { + // no additional use of t's here; all included in definitions of rho's for msmeam + Gam1 = rho11 + rho21 + rho31 - (rho1m1 + rho2m1 + rho3m1); + if (rho01 < 1.0e-14) + Gam1 = 0.0; + else + Gam1 = Gam1 / (rho01 * rho01); + Gam2 = rho12 + rho22 + rho32 - (rho1m2 + rho2m2 + rho3m2); + if (rho02 < 1.0e-14) + Gam2 = 0.0; + else + Gam2 = Gam2 / (rho02 * rho02); + + } else { + Gam1 = (t11av * rho11 + t21av * rho21 + t31av * rho31); + if (rho01 < 1.0e-14) + Gam1 = 0.0; + else + Gam1 = Gam1 / (rho01 * rho01); + + Gam2 = (t12av * rho12 + t22av * rho22 + t32av * rho32); + if (rho02 < 1.0e-14) + Gam2 = 0.0; + else + Gam2 = Gam2 / (rho02 * rho02); + } G1 = G_gam(Gam1, this->ibar_meam[a], errorflag); G2 = G_gam(Gam2, this->ibar_meam[b], errorflag); @@ -655,7 +687,9 @@ void MEAM::get_sijk(double C, int i, int j, int k, double* sijk) //------------------------------------------------------------------------------c // Calculate density functions, assuming reference configuration void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, double* rho21, double* rho31, - double* rho02, double* rho12, double* rho22, double* rho32) + double* rho02, double* rho12, double* rho22, double* rho32, + double* rho1m1, double* rho2m1, double* rho3m1, + 
double* rho1m2, double* rho2m2, double* rho3m2) { double a1, a2; double s[3]; @@ -666,18 +700,39 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou double rhoa02, rhoa12, rhoa22, rhoa32; double arat, scrn, denom; double C, s111, s112, s221, S11, S22; + // msmeam + double rhoa1m1, rhoa2m1, rhoa3m1, rhoa1m2, rhoa2m2, rhoa3m2; a1 = r / this->re_meam[a][a] - 1.0; a2 = r / this->re_meam[b][b] - 1.0; rhoa01 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta0_meam[a] * a1); - rhoa11 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta1_meam[a] * a1); - rhoa21 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta2_meam[a] * a1); - rhoa31 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta3_meam[a] * a1); - rhoa02 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta0_meam[b] * a2); - rhoa12 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta1_meam[b] * a2); - rhoa22 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta2_meam[b] * a2); - rhoa32 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta3_meam[b] * a2); + + if (this->msmeamflag) { + // the rho variables are multiplied by t here since ialloy not needed in msmeam + rhoa11 = this->rho0_meam[a] * this->t1_meam[a] * MathSpecial::fm_exp(-this->beta1_meam[a] * a1); + rhoa21 = this->rho0_meam[a] * this->t2_meam[a] * MathSpecial::fm_exp(-this->beta2_meam[a] * a1); + rhoa31 = this->rho0_meam[a] * this->t3_meam[a] * MathSpecial::fm_exp(-this->beta3_meam[a] * a1); + rhoa02 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta0_meam[b] * a2); + rhoa12 = this->rho0_meam[b] * this->t1_meam[b] * MathSpecial::fm_exp(-this->beta1_meam[b] * a2); + rhoa22 = this->rho0_meam[b] * this->t2_meam[b] * MathSpecial::fm_exp(-this->beta2_meam[b] * a2); + rhoa32 = this->rho0_meam[b] * this->t3_meam[b] * MathSpecial::fm_exp(-this->beta3_meam[b] * a2); + // msmeam specific rho vars + rhoa1m1 = this->rho0_meam[a] * this->t1m_meam[a] * MathSpecial::fm_exp(-this->beta1m_meam[a] * a1); + rhoa2m1 = 
this->rho0_meam[a] * this->t2m_meam[a] * MathSpecial::fm_exp(-this->beta2m_meam[a] * a1); + rhoa3m1 = this->rho0_meam[a] * this->t3m_meam[a] * MathSpecial::fm_exp(-this->beta3m_meam[a] * a1); + rhoa1m2 = this->rho0_meam[b] * this->t1m_meam[b] * MathSpecial::fm_exp(-this->beta1m_meam[b] * a2); + rhoa2m2 = this->rho0_meam[b] * this->t2m_meam[b] * MathSpecial::fm_exp(-this->beta2m_meam[b] * a2); + rhoa3m2 = this->rho0_meam[b] * this->t3m_meam[b] * MathSpecial::fm_exp(-this->beta3m_meam[b] * a2); + } else { + rhoa11 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta1_meam[a] * a1); + rhoa21 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta2_meam[a] * a1); + rhoa31 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta3_meam[a] * a1); + rhoa02 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta0_meam[b] * a2); + rhoa12 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta1_meam[b] * a2); + rhoa22 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta2_meam[b] * a2); + rhoa32 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta3_meam[b] * a2); + } lat = this->lattce_meam[a][b]; @@ -689,7 +744,16 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou *rho12 = 0.0; *rho22 = 0.0; *rho32 = 0.0; + if (this->msmeamflag) { + *rho1m1 = 0.0; + *rho2m1 = 0.0; + *rho3m1 = 0.0; + *rho1m2 = 0.0; + *rho2m2 = 0.0; + *rho3m2 = 0.0; + } + // keep track of density components separately; combine in the calling subroutine switch (lat) { case FCC: *rho01 = 12.0 * rhoa02; @@ -710,12 +774,20 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou *rho02 = 4.0 * rhoa01; *rho31 = 32.0 / 9.0 * rhoa32 * rhoa32; *rho32 = 32.0 / 9.0 * rhoa31 * rhoa31; + if (this->msmeamflag) { + *rho3m1 = 32.0 / 9.0 * rhoa3m2 * rhoa3m2; + *rho3m2 = 32.0 / 9.0 * rhoa3m1 * rhoa3m1; + } break; case HCP: *rho01 = 12 * rhoa02; *rho02 = 12 * rhoa01; *rho31 = 1.0 / 3.0 * rhoa32 * rhoa32; *rho32 = 1.0 / 3.0 * rhoa31 * rhoa31; + if (this->msmeamflag) { + 
*rho3m1 = 1.0 / 3.0 * rhoa3m2 * rhoa3m2; + *rho3m2 = 1.0 / 3.0 * rhoa3m1 * rhoa3m1; + } break; case DIM: get_shpfcn(DIM, 0, 0, s); @@ -727,6 +799,14 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou *rho22 = s[1] * rhoa21 * rhoa21; *rho31 = s[2] * rhoa32 * rhoa32; *rho32 = s[2] * rhoa31 * rhoa31; + if (this->msmeamflag) { + *rho1m1 = s[0] * rhoa1m2 * rhoa1m2; + *rho1m2 = s[0] * rhoa1m1 * rhoa1m1; + *rho2m1 = s[1] * rhoa2m2 * rhoa2m2; + *rho2m2 = s[1] * rhoa2m1 * rhoa2m1; + *rho3m1 = s[2] * rhoa3m2 * rhoa3m2; + *rho3m2 = s[2] * rhoa3m1 * rhoa3m1; + } break; case C11: *rho01 = rhoa01; @@ -737,17 +817,28 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou *rho22 = rhoa22; *rho31 = rhoa31; *rho32 = rhoa32; + if (this->msmeamflag) { + *rho1m1 = rhoa1m1; + *rho1m2 = rhoa1m2; + *rho2m1 = rhoa2m1; + *rho2m2 = rhoa2m2; + *rho3m1 = rhoa3m1; + *rho3m2 = rhoa3m2; + } break; case L12: *rho01 = 8 * rhoa01 + 4 * rhoa02; *rho02 = 12 * rhoa01; - if (this->ialloy == 1) { + if (this->ialloy == 1) { *rho21 = 8. / 3. * MathSpecial::square(rhoa21 * this->t2_meam[a] - rhoa22 * this->t2_meam[b]); denom = 8 * rhoa01 * MathSpecial::square(this->t2_meam[a]) + 4 * rhoa02 * MathSpecial::square(this->t2_meam[b]); if (denom > 0.) *rho21 = *rho21 / denom * *rho01; } else *rho21 = 8. / 3. * (rhoa21 - rhoa22) * (rhoa21 - rhoa22); + if (this->msmeamflag) { + *rho2m1 = 8. / 3. 
* (rhoa2m1 - rhoa2m2) * (rhoa2m1 - rhoa2m2); + } break; case B2: *rho01 = 8.0 * rhoa02; @@ -864,6 +955,7 @@ void MEAM::interpolate_meam(int ind) this->rdrar = 1.0 / drar; // phir interp + for (j = 0; j < this->nrar; j++) { this->phirar[ind][j] = this->phir[ind][j]; } diff --git a/src/MEAM/meam_setup_global.cpp b/src/MEAM/meam_setup_global.cpp index 545a2ad3f4..5d35242e7c 100644 --- a/src/MEAM/meam_setup_global.cpp +++ b/src/MEAM/meam_setup_global.cpp @@ -36,7 +36,8 @@ void MEAM::meam_setup_global(int nelt, lattice_t* lat, int* ielement, double* /*atwt*/, double* alpha, double* b0, double* b1, double* b2, double* b3, double* alat, double* esub, double* asub, double* t0, double* t1, double* t2, double* t3, double* rozero, - int* ibar) + int* ibar, double* b1m, double *b2m, double *b3m, double *t1m, double *t2m, + double *t3m) { int i; @@ -53,6 +54,11 @@ MEAM::meam_setup_global(int nelt, lattice_t* lat, int* ielement, double* /*atwt* this->beta1_meam[i] = b1[i]; this->beta2_meam[i] = b2[i]; this->beta3_meam[i] = b3[i]; + if (this->msmeamflag){ + this->beta1m_meam[i] = b1m[i]; + this->beta2m_meam[i] = b2m[i]; + this->beta3m_meam[i] = b3m[i]; + } tmplat[i] = alat[i]; this->Ec_meam[i][i] = esub[i]; this->A_meam[i] = asub[i]; @@ -60,6 +66,11 @@ MEAM::meam_setup_global(int nelt, lattice_t* lat, int* ielement, double* /*atwt* this->t1_meam[i] = t1[i]; this->t2_meam[i] = t2[i]; this->t3_meam[i] = t3[i]; + if (this->msmeamflag){ + this->t1m_meam[i] = t1m[i]; + this->t2m_meam[i] = t2m[i]; + this->t3m_meam[i] = t3m[i]; + } this->rho0_meam[i] = rozero[i]; this->ibar_meam[i] = ibar[i]; diff --git a/src/MEAM/pair_meam.cpp b/src/MEAM/pair_meam.cpp index bcfffbe52b..c4a4cfa1d7 100644 --- a/src/MEAM/pair_meam.cpp +++ b/src/MEAM/pair_meam.cpp @@ -58,13 +58,12 @@ PairMEAM::PairMEAM(LAMMPS *lmp) : Pair(lmp) allocated = 0; nlibelements = 0; + meam_inst = new MEAM(memory); + meam_inst->msmeamflag = msmeamflag = 0; + myname = "meam"; + scale = nullptr; - - // set comm size needed by this 
Pair - - comm_forward = 38; - comm_reverse = 30; } /* ---------------------------------------------------------------------- @@ -93,7 +92,6 @@ void PairMEAM::compute(int eflag, int vflag) int i,ii,n,inum_half,errorflag; int *ilist_half,*numneigh_half,**firstneigh_half; int *numneigh_full,**firstneigh_full; - ev_init(eflag,vflag); // neighbor list info @@ -133,7 +131,6 @@ void PairMEAM::compute(int eflag, int vflag) int offset = 0; errorflag = 0; - for (ii = 0; ii < inum_half; ii++) { i = ilist_half[ii]; meam_inst->meam_dens_init(i,ntype,type,map,x, @@ -142,9 +139,7 @@ void PairMEAM::compute(int eflag, int vflag) offset); offset += numneigh_half[i]; } - comm->reverse_comm(this); - meam_inst->meam_dens_final(nlocal,eflag_either,eflag_global,eflag_atom, &eng_vdwl,eatom,ntype,type,map,scale,errorflag); if (errorflag) @@ -159,7 +154,6 @@ void PairMEAM::compute(int eflag, int vflag) double **vptr = nullptr; if (vflag_atom) vptr = vatom; - for (ii = 0; ii < inum_half; ii++) { i = ilist_half[ii]; meam_inst->meam_force(i,eflag_global,eflag_atom,vflag_global, @@ -169,7 +163,6 @@ void PairMEAM::compute(int eflag, int vflag) offset,f,vptr,virial); offset += numneigh_half[i]; } - if (vflag_fdotr) virial_fdotr_compute(); } @@ -193,7 +186,17 @@ void PairMEAM::allocate() void PairMEAM::settings(int narg, char ** /*arg*/) { - if (narg != 0) error->all(FLERR,"Illegal pair_style command"); + if (narg != 0) error->all(FLERR,"Illegal pair_style {} command", myname); + + // set comm size needed by this Pair + + if (msmeamflag) { + comm_forward = 38+23; // plus 23 for msmeam + comm_reverse = 30+23; // plus 23 for msmeam + } else { + comm_forward = 38; + comm_reverse = 30; + } } /* ---------------------------------------------------------------------- @@ -206,12 +209,7 @@ void PairMEAM::coeff(int narg, char **arg) if (!allocated) allocate(); - if (narg < 6) error->all(FLERR,"Incorrect args for pair coefficients"); - - // ensure I,J args are * * - - if (strcmp(arg[0],"*") != 0 || 
strcmp(arg[1],"*") != 0) - error->all(FLERR,"Incorrect args for pair coefficients"); + if (narg < 6) error->all(FLERR,"Incorrect args for pair style {} coefficients", myname); // check for presence of first meam file @@ -239,7 +237,7 @@ void PairMEAM::coeff(int narg, char **arg) } if (paridx < 0) error->all(FLERR,"No MEAM parameter file in pair coefficients"); if ((narg - paridx - 1) != atom->ntypes) - error->all(FLERR,"Incorrect args for pair coefficients"); + error->all(FLERR,"Incorrect args for pair style {} coefficients", myname); // MEAM element names between 2 filenames // nlibelements = # of MEAM elements @@ -282,7 +280,7 @@ void PairMEAM::coeff(int narg, char **arg) if (libelements[j] == arg[i]) break; if (j < nlibelements) map[m] = j; else if (strcmp(arg[i],"NULL") == 0) map[m] = -1; - else error->all(FLERR,"Incorrect args for pair coefficients"); + else error->all(FLERR,"Incorrect args for pair style {} coefficients", myname); } // clear setflag since coeff() called once with I,J = * * @@ -307,7 +305,7 @@ void PairMEAM::coeff(int narg, char **arg) } } - if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients"); + if (count == 0) error->all(FLERR,"Incorrect args for pair style {} coefficients", myname); } /* ---------------------------------------------------------------------- @@ -317,7 +315,7 @@ void PairMEAM::coeff(int narg, char **arg) void PairMEAM::init_style() { if (force->newton_pair == 0) - error->all(FLERR,"Pair style MEAM requires newton pair on"); + error->all(FLERR,"Pair style {} requires newton pair on", myname); // need a full and a half neighbor list @@ -360,7 +358,9 @@ void PairMEAM::read_files(const std::string &globalfile, void PairMEAM::read_global_meam_file(const std::string &globalfile) { + // allocate parameter arrays + std::vector lat(nlibelements); std::vector ielement(nlibelements); std::vector ibar(nlibelements); @@ -381,6 +381,15 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) std::vector 
rozero(nlibelements); std::vector found(nlibelements, false); + // allocate 6 extra arrays for msmeam + + std::vector b1m(nlibelements); + std::vector b2m(nlibelements); + std::vector b3m(nlibelements); + std::vector t1m(nlibelements); + std::vector t2m(nlibelements); + std::vector t3m(nlibelements); + // open global meamf file on proc 0 if (comm->me == 0) { @@ -416,8 +425,7 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) std::string lattice_type = values.next_string(); if (!MEAM::str_to_lat(lattice_type, true, lat[index])) - error->one(FLERR,"Unrecognized lattice type in MEAM " - "library file: {}", lattice_type); + error->one(FLERR,"Unrecognized lattice type in MEAM library file: {}", lattice_type); // store parameters @@ -429,6 +437,11 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) b1[index] = values.next_double(); b2[index] = values.next_double(); b3[index] = values.next_double(); + if (msmeamflag) { + b1m[index] = values.next_double(); + b2m[index] = values.next_double(); + b3m[index] = values.next_double(); + } alat[index] = values.next_double(); esub[index] = values.next_double(); asub[index] = values.next_double(); @@ -436,15 +449,20 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) t1[index] = values.next_double(); t2[index] = values.next_double(); t3[index] = values.next_double(); + if (msmeamflag) { + t1m[index] = values.next_double(); + t2m[index] = values.next_double(); + t3m[index] = values.next_double(); + } rozero[index] = values.next_double(); ibar[index] = values.next_int(); if (!isone(t0[index])) - error->one(FLERR,"Unsupported parameter in MEAM library file: t0!=1"); + error->one(FLERR,"Unsupported parameter in MEAM library file: t0 != 1"); // z given is ignored: if this is mismatched, we definitely won't do what the user said -> fatal error if (z[index] != MEAM::get_Zij(lat[index])) - error->one(FLERR,"Mismatched parameter in MEAM library file: z!=lat"); + 
error->one(FLERR,"Mismatched parameter in MEAM library file: z != lat"); nset++; } catch (TokenizerException &e) { @@ -484,13 +502,29 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) MPI_Bcast(t2.data(), nlibelements, MPI_DOUBLE, 0, world); MPI_Bcast(t3.data(), nlibelements, MPI_DOUBLE, 0, world); MPI_Bcast(rozero.data(), nlibelements, MPI_DOUBLE, 0, world); + // distribute msmeam parameter sets + MPI_Bcast(b1m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(b2m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(b3m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(t1m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(t2m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(t3m.data(), nlibelements, MPI_DOUBLE, 0, world); // pass element parameters to MEAM package - meam_inst->meam_setup_global(nlibelements, lat.data(), ielement.data(), atwt.data(), - alpha.data(), b0.data(), b1.data(), b2.data(), b3.data(), - alat.data(), esub.data(), asub.data(), t0.data(), t1.data(), - t2.data(), t3.data(), rozero.data(), ibar.data()); + if (msmeamflag) { + meam_inst->meam_setup_global(nlibelements, lat.data(), ielement.data(), atwt.data(), + alpha.data(), b0.data(), b1.data(), b2.data(), b3.data(), + alat.data(), esub.data(), asub.data(), t0.data(), t1.data(), + t2.data(), t3.data(), rozero.data(), ibar.data(), b1m.data(), + b2m.data(), b3m.data(), t1m.data(), t2m.data(), t3m.data()); + } else { + meam_inst->meam_setup_global(nlibelements, lat.data(), ielement.data(), atwt.data(), + alpha.data(), b0.data(), b1.data(), b2.data(), b3.data(), + alat.data(), esub.data(), asub.data(), t0.data(), t1.data(), + t2.data(), t3.data(), rozero.data(), ibar.data(), nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr); + } // set element masses @@ -613,6 +647,23 @@ int PairMEAM::pack_forward_comm(int n, int *list, double *buf, buf[m++] = meam_inst->tsq_ave[j][0]; buf[m++] = meam_inst->tsq_ave[j][1]; buf[m++] = 
meam_inst->tsq_ave[j][2]; + if (msmeamflag) { + buf[m++] = meam_inst->arho2mb[j]; + buf[m++] = meam_inst->arho1m[j][0]; + buf[m++] = meam_inst->arho1m[j][1]; + buf[m++] = meam_inst->arho1m[j][2]; + buf[m++] = meam_inst->arho2m[j][0]; + buf[m++] = meam_inst->arho2m[j][1]; + buf[m++] = meam_inst->arho2m[j][2]; + buf[m++] = meam_inst->arho2m[j][3]; + buf[m++] = meam_inst->arho2m[j][4]; + buf[m++] = meam_inst->arho2m[j][5]; + for (k = 0; k < 10; k++) buf[m++] = meam_inst->arho3m[j][k]; + buf[m++] = meam_inst->arho3mb[j][0]; + buf[m++] = meam_inst->arho3mb[j][1]; + buf[m++] = meam_inst->arho3mb[j][2]; + } + } return m; @@ -656,6 +707,22 @@ void PairMEAM::unpack_forward_comm(int n, int first, double *buf) meam_inst->tsq_ave[i][0] = buf[m++]; meam_inst->tsq_ave[i][1] = buf[m++]; meam_inst->tsq_ave[i][2] = buf[m++]; + if (msmeamflag) { + meam_inst->arho2mb[i] = buf[m++]; + meam_inst->arho1m[i][0] = buf[m++]; + meam_inst->arho1m[i][1] = buf[m++]; + meam_inst->arho1m[i][2] = buf[m++]; + meam_inst->arho2m[i][0] = buf[m++]; + meam_inst->arho2m[i][1] = buf[m++]; + meam_inst->arho2m[i][2] = buf[m++]; + meam_inst->arho2m[i][3] = buf[m++]; + meam_inst->arho2m[i][4] = buf[m++]; + meam_inst->arho2m[i][5] = buf[m++]; + for (k = 0; k < 10; k++) meam_inst->arho3m[i][k] = buf[m++]; + meam_inst->arho3mb[i][0] = buf[m++]; + meam_inst->arho3mb[i][1] = buf[m++]; + meam_inst->arho3mb[i][2] = buf[m++]; + } } } @@ -689,6 +756,22 @@ int PairMEAM::pack_reverse_comm(int n, int first, double *buf) buf[m++] = meam_inst->tsq_ave[i][0]; buf[m++] = meam_inst->tsq_ave[i][1]; buf[m++] = meam_inst->tsq_ave[i][2]; + if (msmeamflag) { + buf[m++] = meam_inst->arho2mb[i]; + buf[m++] = meam_inst->arho1m[i][0]; + buf[m++] = meam_inst->arho1m[i][1]; + buf[m++] = meam_inst->arho1m[i][2]; + buf[m++] = meam_inst->arho2m[i][0]; + buf[m++] = meam_inst->arho2m[i][1]; + buf[m++] = meam_inst->arho2m[i][2]; + buf[m++] = meam_inst->arho2m[i][3]; + buf[m++] = meam_inst->arho2m[i][4]; + buf[m++] = meam_inst->arho2m[i][5]; 
+ for (k = 0; k < 10; k++) buf[m++] = meam_inst->arho3m[i][k]; + buf[m++] = meam_inst->arho3mb[i][0]; + buf[m++] = meam_inst->arho3mb[i][1]; + buf[m++] = meam_inst->arho3mb[i][2]; + } } return m; @@ -724,7 +807,25 @@ void PairMEAM::unpack_reverse_comm(int n, int *list, double *buf) meam_inst->tsq_ave[j][0] += buf[m++]; meam_inst->tsq_ave[j][1] += buf[m++]; meam_inst->tsq_ave[j][2] += buf[m++]; + if (msmeamflag) { + meam_inst->arho2mb[j] += buf[m++]; + meam_inst->arho1m[j][0] += buf[m++]; + meam_inst->arho1m[j][1] += buf[m++]; + meam_inst->arho1m[j][2] += buf[m++]; + meam_inst->arho2m[j][0] += buf[m++]; + meam_inst->arho2m[j][1] += buf[m++]; + meam_inst->arho2m[j][2] += buf[m++]; + meam_inst->arho2m[j][3] += buf[m++]; + meam_inst->arho2m[j][4] += buf[m++]; + meam_inst->arho2m[j][5] += buf[m++]; + for (k = 0; k < 10; k++) meam_inst->arho3m[j][k] += buf[m++]; + meam_inst->arho3mb[j][0] += buf[m++]; + meam_inst->arho3mb[j][1] += buf[m++]; + meam_inst->arho3mb[j][2] += buf[m++]; + } } + + } /* ---------------------------------------------------------------------- diff --git a/src/MEAM/pair_meam.h b/src/MEAM/pair_meam.h index 16ba38fcb2..a89714bfa9 100644 --- a/src/MEAM/pair_meam.h +++ b/src/MEAM/pair_meam.h @@ -47,6 +47,8 @@ class PairMEAM : public Pair { class MEAM *meam_inst; double cutmax; // max cutoff for all elements int nlibelements; // # of library elements + int msmeamflag; // 0 (default) for normal MEAM, 1 for MS-MEAM + std::string myname; // name of the pair style std::vector libelements; // names of library elements std::vector mass; // mass of library element diff --git a/src/MEAM/pair_meam_ms.cpp b/src/MEAM/pair_meam_ms.cpp new file mode 100644 index 0000000000..982a54f546 --- /dev/null +++ b/src/MEAM/pair_meam_ms.cpp @@ -0,0 +1,25 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS development team: 
developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "pair_meam_ms.h" +#include "meam.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairMEAMMS::PairMEAMMS(LAMMPS *lmp) : PairMEAM(lmp) +{ + meam_inst->msmeamflag = msmeamflag = 1; + myname = "meam/ms"; +} diff --git a/src/MEAM/pair_meam_ms.h b/src/MEAM/pair_meam_ms.h new file mode 100644 index 0000000000..25878203ed --- /dev/null +++ b/src/MEAM/pair_meam_ms.h @@ -0,0 +1,33 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(meam/ms,PairMEAMMS); +// clang-format on +#else + +#ifndef LMP_PAIR_MEAM_MS_H +#define LMP_PAIR_MEAM_MS_H + +#include "pair_meam.h" + +namespace LAMMPS_NS { + +class PairMEAMMS : public PairMEAM { + public: + PairMEAMMS(class LAMMPS *); +}; +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/ML-IAP/pair_mliap.cpp b/src/ML-IAP/pair_mliap.cpp index 6b55fb3373..929a32020b 100644 --- a/src/ML-IAP/pair_mliap.cpp +++ b/src/ML-IAP/pair_mliap.cpp @@ -83,7 +83,6 @@ void PairMLIAP::compute(int eflag, int vflag) { // consistency checks - if (data->ndescriptors != model->ndescriptors) error->all(FLERR, "Inconsistent model and descriptor descriptor count: {} vs {}", model->ndescriptors, data->ndescriptors); @@ -134,10 +133,10 @@ void PairMLIAP::allocate() void PairMLIAP::settings(int narg, char ** arg) { - if (narg < 2) utils::missing_cmd_args(FLERR, "pair_style mliap", error); // This is needed because the unit test calls settings twice if (!is_child) { + if (narg < 2) utils::missing_cmd_args(FLERR, "pair_style mliap", error); delete model; model = nullptr; delete descriptor; diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index 65a2e6d8ce..ce04be2cc8 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -21,6 +21,7 @@ #include "atom.h" #include "atom_vec.h" +#include "citeme.h" #include "comm.h" #include "domain.h" #include "error.h" @@ -36,12 +37,25 @@ #include "pair_reaxff.h" #include "reaxff_defs.h" +#include #include #include +#include using namespace LAMMPS_NS; using namespace FixConst; +static const char cite_reaxff_species_delete[] = + "fix reaxff/species, 'delete' keyword: https://doi.org/10.1016/j.carbon.2022.11.002\n\n" + "@Article{Gissinger23,\n" + " author = {J. R. Gissinger, S. R. Zavada, J. G. Smith, J. Kemppainen, I. Gallegos, G. M. Odegard, E. J. 
Siochi, K. E. Wise},\n" + " title = {Predicting char yield of high-temperature resins},\n" + " journal = {Carbon},\n" + " year = 2023,\n" + " volume = 202,\n" + " pages = {336-347}\n" + "}\n\n"; + /* ---------------------------------------------------------------------- */ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : @@ -145,6 +159,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : ele = filepos = filedel = nullptr; eleflag = posflag = padflag = 0; delflag = specieslistflag = masslimitflag = 0; + delete_Nlimit = delete_Nsteps = 0; singlepos_opened = multipos_opened = del_opened = 0; multipos = 0; @@ -221,7 +236,12 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : } else error->all(FLERR, "Unknown fix reaxff/species delete option: {}", arg[iarg]); - + // rate limit when deleting molecules + } else if (strcmp(arg[iarg], "delete_rate_limit") == 0) { + if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species delete_rate_limit", error); + delete_Nlimit = utils::numeric(FLERR, arg[iarg+1], false, lmp); + delete_Nsteps = utils::numeric(FLERR, arg[iarg+2], false, lmp); + iarg += 3; // position of molecules } else if (strcmp(arg[iarg], "position") == 0) { if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species position", error); @@ -260,6 +280,15 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : if (delflag && specieslistflag && masslimitflag) error->all(FLERR, "Incompatible combination fix reaxff/species command options"); + if (delete_Nlimit > 0) { + if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete); + memory->create(delete_Tcount,delete_Nsteps,"reaxff/species:delete_Tcount"); + + for (int i = 0; i < delete_Nsteps; i++) + delete_Tcount[i] = -1; + delete_Tcount[0] = 0; + } + vector_nmole = 0; vector_nspec = 0; } @@ -279,6 +308,7 @@ FixReaxFFSpecies::~FixReaxFFSpecies() memory->destroy(Mol2Spec); memory->destroy(MolType); 
memory->destroy(MolName); + memory->destroy(delete_Tcount); delete[] filepos; delete[] filedel; @@ -375,7 +405,13 @@ void FixReaxFFSpecies::Output_ReaxFF_Bonds(bigint ntimestep, FILE * /*fp*/) // point to fix_ave_atom f_SPECBOND->end_of_step(); - if (ntimestep != nvalid) return; + if (ntimestep != nvalid) { + // push back delete_Tcount on every step + if (delete_Nlimit > 0) + for (int i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; + return; + } nlocal = atom->nlocal; @@ -826,6 +862,15 @@ void FixReaxFFSpecies::WritePos(int Nmole, int Nspec) void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) { + int ndeletions; + int headroom = -1; + if (delete_Nlimit > 0) { + if (delete_Tcount[delete_Nsteps-1] == -1) return; + ndeletions = delete_Tcount[0] - delete_Tcount[delete_Nsteps-1]; + headroom = MAX(0, delete_Nlimit - ndeletions); + if (headroom == 0) return; + } + int i, j, m, n, itype, cid; int ndel, ndelone, count, count_tmp; int *Nameall; @@ -856,7 +901,23 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) int *marklist; memory->create(marklist, nlocal, "reaxff/species:marklist"); - for (m = 1; m <= Nmole; m++) { + std::random_device rnd; + std::minstd_rand park_rng(rnd()); + int *molrange; + memory->create(molrange,Nmole,"reaxff/species:molrange"); + for (m = 0; m < Nmole; m++) + molrange[m] = m + 1; + if (delete_Nlimit > 0) { + // shuffle index when using rate_limit, in case order is biased + if (comm->me == 0) + std::shuffle(&molrange[0],&molrange[Nmole], park_rng); + MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world); + } + + int this_delete_Tcount = 0; + for (int mm = 0; mm < Nmole; mm++) { + if (this_delete_Tcount == headroom) break; + m = molrange[mm]; localmass = totalmass = count = nmarklist = 0; for (n = 0; n < ntypes; n++) Name[n] = 0; @@ -896,6 +957,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) // find corresponding moltype if (totalmass > massmin && totalmass < massmax) { + this_delete_Tcount++; 
for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[Mol2Spec[m - 1]] += 1.0 / (double) count; @@ -905,6 +967,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) if (count > 0) { for (i = 0; i < ndelspec; i++) { if (del_species[i] == species_str) { + this_delete_Tcount++; for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[i] += 1.0 / (double) count; @@ -976,6 +1039,14 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) } } + + // push back delete_Tcount on every step + if (delete_Nlimit > 0) { + for (i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; + delete_Tcount[0] += this_delete_Tcount; + } + if (ndel && (atom->map_style != Atom::MAP_NONE)) { atom->nghost = 0; atom->map_init(); @@ -988,6 +1059,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) memory->destroy(marklist); memory->destroy(mark); memory->destroy(deletecount); + memory->destroy(molrange); } /* ---------------------------------------------------------------------- */ diff --git a/src/REAXFF/fix_reaxff_species.h b/src/REAXFF/fix_reaxff_species.h index 65eeae4c60..329e17145b 100644 --- a/src/REAXFF/fix_reaxff_species.h +++ b/src/REAXFF/fix_reaxff_species.h @@ -60,6 +60,7 @@ class FixReaxFFSpecies : public Fix { FILE *fp, *pos, *fdel; int eleflag, posflag, multipos, padflag, setupflag; int delflag, specieslistflag, masslimitflag; + int delete_Nlimit, delete_Nsteps, *delete_Tcount; double massmin, massmax; int singlepos_opened, multipos_opened, del_opened; char *ele, **eletype, *filepos, *filedel; diff --git a/src/REPLICA/fix_pimd.cpp b/src/REPLICA/fix_pimd.cpp index 5daff2d643..154f3deecd 100644 --- a/src/REPLICA/fix_pimd.cpp +++ b/src/REPLICA/fix_pimd.cpp @@ -84,14 +84,15 @@ FixPIMD::FixPIMD(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) else if (strcmp(arg[i + 1], "cmd") == 0) method = CMD; else - error->universe_all(FLERR, "Unknown method parameter for fix pimd"); + error->universe_all(FLERR, 
fmt::format("Unknown method parameter {} for fix pimd", + arg[i + 1])); } else if (strcmp(arg[i], "fmass") == 0) { fmass = utils::numeric(FLERR, arg[i + 1], false, lmp); - if (fmass < 0.0 || fmass > 1.0) - error->universe_all(FLERR, "Invalid fmass value for fix pimd"); + if ((fmass < 0.0) || (fmass > np)) + error->universe_all(FLERR, fmt::format("Invalid fmass value {} for fix pimd", fmass)); } else if (strcmp(arg[i], "sp") == 0) { sp = utils::numeric(FLERR, arg[i + 1], false, lmp); - if (fmass < 0.0) error->universe_all(FLERR, "Invalid sp value for fix pimd"); + if (sp < 0.0) error->universe_all(FLERR, "Invalid sp value for fix pimd"); } else if (strcmp(arg[i], "temp") == 0) { nhc_temp = utils::numeric(FLERR, arg[i + 1], false, lmp); if (nhc_temp < 0.0) error->universe_all(FLERR, "Invalid temp value for fix pimd"); @@ -120,7 +121,7 @@ FixPIMD::FixPIMD(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) global_freq = 1; vector_flag = 1; - size_vector = 2; + size_vector = 3; extvector = 1; comm_forward = 3; @@ -135,6 +136,7 @@ FixPIMD::FixPIMD(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) } /* ---------------------------------------------------------------------- */ + FixPIMD::~FixPIMD() { delete[] mass; @@ -166,6 +168,7 @@ FixPIMD::~FixPIMD() } /* ---------------------------------------------------------------------- */ + int FixPIMD::setmask() { int mask = 0; @@ -215,7 +218,7 @@ void FixPIMD::init() double beta = 1.0 / (Boltzmann * nhc_temp); double _fbond = 1.0 * np / (beta * beta * hbar * hbar); - omega_np = sqrt((double)np) / (hbar * beta) * sqrt(force->mvv2e); + omega_np = sqrt((double) np) / (hbar * beta) * sqrt(force->mvv2e); fbond = -_fbond * force->mvv2e; if (universe->me == 0) @@ -306,7 +309,7 @@ void FixPIMD::nhc_init() nhc_eta_dotdot[i][ichain] = 0.0; nhc_eta_mass[i][ichain] = mass0; if ((method == CMD || method == NMPIMD) && universe->iworld == 0) - ; + ; // do nothing else nhc_eta_mass[i][ichain] *= fmass; } @@ -538,6 +541,8 @@ void 
FixPIMD::spring_force() double *xlast = buf_beads[x_last]; double *xnext = buf_beads[x_next]; + virial = 0.0; + for (int i = 0; i < nlocal; i++) { double delx1 = xlast[0] - x[i][0]; double dely1 = xlast[1] - x[i][1]; @@ -557,11 +562,13 @@ void FixPIMD::spring_force() double dy = dely1 + dely2; double dz = delz1 + delz2; + virial += -0.5 * (x[i][0] * f[i][0] + x[i][1] * f[i][1] + x[i][2] * f[i][2]); + f[i][0] -= (dx) *ff; f[i][1] -= (dy) *ff; f[i][2] -= (dz) *ff; - spring_energy += (dx * dx + dy * dy + dz * dz); + spring_energy += -0.5 * ff * (delx2 * delx2 + dely2 * dely2 + delz2 * delz2); } } @@ -875,5 +882,6 @@ double FixPIMD::compute_vector(int n) { if (n == 0) { return spring_energy; } if (n == 1) { return t_sys; } + if (n == 2) { return virial; } return 0.0; } diff --git a/src/REPLICA/fix_pimd.h b/src/REPLICA/fix_pimd.h index 384bc2ce25..b96c088efe 100644 --- a/src/REPLICA/fix_pimd.h +++ b/src/REPLICA/fix_pimd.h @@ -57,7 +57,7 @@ class FixPIMD : public Fix { /* ring-polymer model */ - double omega_np, fbond, spring_energy, sp; + double omega_np, fbond, spring_energy, sp, virial; int x_last, x_next; void spring_force(); diff --git a/src/atom.cpp b/src/atom.cpp index 480a779e68..32285758c0 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2345,6 +2345,18 @@ void Atom::setup_sort_bins() return; } +#ifdef LMP_GPU + if (userbinsize == 0.0) { + auto ifix = dynamic_cast(modify->get_fix_by_id("package_gpu")); + if (ifix) { + const double subx = domain->subhi[0] - domain->sublo[0]; + const double suby = domain->subhi[1] - domain->sublo[1]; + const double subz = domain->subhi[2] - domain->sublo[2]; + binsize = ifix->binsize(subx, suby, subz, atom->nlocal, 0.5 * neighbor->cutneighmax); + } + } +#endif + double bininv = 1.0/binsize; // nbin xyz = local bins diff --git a/src/neighbor.cpp b/src/neighbor.cpp index f2b094ec37..05371c8259 100644 --- a/src/neighbor.cpp +++ b/src/neighbor.cpp @@ -535,6 +535,7 @@ void Neighbor::init() int flag=0; for (int isub=0; isub < 
ph->nstyles; ++isub) { if (force->pair_match("amoeba",0,isub) + || force->pair_match("hippo",0,isub) || force->pair_match("coul/wolf",0,isub) || force->pair_match("coul/dsf",0,isub) || force->pair_match("coul/exclude",0) @@ -545,6 +546,7 @@ void Neighbor::init() special_flag[1] = special_flag[2] = special_flag[3] = 2; } else { if (force->pair_match("amoeba",0) + || force->pair_match("hippo",0) || force->pair_match("coul/wolf",0) || force->pair_match("coul/dsf",0) || force->pair_match("coul/exclude",0) diff --git a/unittest/force-styles/tests/atomic-pair-meam_ms.yaml b/unittest/force-styles/tests/atomic-pair-meam_ms.yaml new file mode 100644 index 0000000000..e479514017 --- /dev/null +++ b/unittest/force-styles/tests/atomic-pair-meam_ms.yaml @@ -0,0 +1,94 @@ +--- +lammps_version: 22 Dec 2022 +tags: slow +date_generated: Thu Jan 26 15:27:03 2023 +epsilon: 2.5e-12 +skip_tests: +prerequisites: ! | + pair meam/ms +pre_commands: ! | + variable newton_pair delete + if "$(is_active(package,gpu)) > 0.0" then "variable newton_pair index off" else "variable newton_pair index on" +post_commands: ! "" +input_file: in.metal +pair_style: meam/ms +pair_coeff: ! | + * * library.msmeam H Ga4 HGa.msmeam H Ga4 +extract: ! | + scale 2 +natoms: 32 +init_vdwl: 785.6030480758675 +init_coul: 0 +init_stress: ! |2- + 3.3502530994900699e+03 3.6405858278699407e+03 3.6349804214165547e+03 -3.1609283411508039e+02 -7.9448207656135153e+01 -1.9854140603340727e+02 +init_forces: ! 
|2 + 1 1.2872255079741514e+01 -7.5031848810810864e-01 4.5969595156096510e+01 + 2 -3.9028679722038632e+01 -1.5647800180326567e+02 -1.6643992152928173e+00 + 3 -6.1521549955194672e+01 2.6970968316419874e+02 -9.6866430262650326e+01 + 4 3.1462579880342336e+01 4.0240291291218455e+01 1.1654869213327775e+01 + 5 1.4859248182951113e+01 -3.4132880749392825e+01 6.7430378007130244e+01 + 6 6.4609571260694096e+00 -3.8973222482916441e+01 -2.8510000379627442e+01 + 7 7.8114612113500250e+00 -1.0421431668544374e+01 -4.2887607385766536e+01 + 8 -4.8934215863351795e+01 -6.3567347969802590e-01 1.1845972792272754e+02 + 9 9.4089549606898402e+01 -7.4342942103394511e+00 2.5331198575951383e+01 + 10 1.5130369934140692e+01 -5.9245630928969938e+01 -6.7469126603400198e+01 + 11 -2.5176547213746847e+01 1.1577205529172168e+02 -2.2897457133540517e+01 + 12 6.2237686199502349e+01 2.0501996047945163e+01 -2.8805091517252826e+01 + 13 -5.9438589221526925e+01 3.0453092653824072e+01 -1.9919245831196157e+01 + 14 6.9128305482543766e+01 -7.7400771634148342e+01 3.3376079908119145e+01 + 15 -4.9671207786831857e+01 -4.9520814527298228e+01 8.4325181097614305e+01 + 16 -1.1782591146017666e+01 -3.2478963020209051e+01 1.5503663677714293e+01 + 17 9.0881787245915220e+00 6.2377477671714963e+01 -4.0411006180232363e+01 + 18 -4.2285082775720454e+01 2.4883979527636967e+01 -4.4858149086530510e+00 + 19 -8.0259798420493979e+01 9.6356660229207137e+01 6.0543230952477984e+01 + 20 8.0924547938759346e+01 7.1034504027236025e+01 -7.1958482512489610e+01 + 21 1.0833434220705425e+02 -1.5973910256481020e+02 -2.5432700070393153e+01 + 22 -2.3754601906353900e+00 5.2216955012971823e+01 4.7112051341131576e+00 + 23 -2.7227169255996543e+01 8.1968603165764222e+01 4.6535834898716878e+01 + 24 -2.9230758067555616e+01 6.5909555829367733e+01 -2.8250697734131258e+01 + 25 -5.1310041582953993e+01 -3.0895272949222822e+01 -5.4271286813003794e+00 + 26 3.9605941911194620e+01 -5.5919050176828883e+01 -1.0209061328106253e+01 + 27 8.2934427989660890e+01 
6.1956200199325636e+01 5.0072108788590960e+01 + 28 -7.8572755094413296e+01 -3.9613391730681300e+01 -2.6183413623428891e+00 + 29 6.9475725072041925e+01 -6.0535433603583563e+01 -1.4566536349135829e+01 + 30 -2.4347184151182930e+01 -1.9359391333689970e+02 -2.6718379302915952e+01 + 31 7.7351971629808688e+01 -7.0102650745312999e+01 -5.4615048867524763e+01 + 32 -1.5060591772899014e+02 8.4489763988097266e+01 2.9799482293372058e+01 +run_vdwl: 682.3107192428497 +run_coul: 0 +run_stress: ! |2- + 3.2247564044913129e+03 3.3749506031067485e+03 3.3223794967215117e+03 -2.8460979167554797e+02 -7.2614457076660575e+00 -3.1510685747732862e+02 +run_forces: ! |2 + 1 -1.2037185973996296e+01 -2.5090364403764944e+01 1.4014184973113366e+01 + 2 -3.7365848425239264e+01 -1.5871199357658887e+02 3.7846333470446991e+00 + 3 -3.2057228694304293e+01 2.5316344962361612e+02 -6.0679585186816752e+01 + 4 2.9086197614116237e+01 4.8267528016068823e+01 4.3387429619749920e+00 + 5 -1.1672554618399744e+01 -2.6840760926124332e+01 4.9694308545223279e+01 + 6 1.1892092913978592e+01 -4.9360840569608243e+01 -2.3083171938147949e+01 + 7 2.1084251901459215e+01 -4.8251731643401072e+00 -3.8474871193885967e+01 + 8 -5.7775944085787714e+01 1.3522956442661442e+01 1.1661345819661486e+02 + 9 7.2926105059437930e+01 4.8686056096860133e+00 2.3817134806042311e+01 + 10 1.7307367990304396e+01 -3.0865570121704572e+01 -1.2314307646704794e+01 + 11 -1.1341297645054201e+01 9.1441145595173211e+01 -2.1806407500802493e+01 + 12 4.0645024127126625e+01 1.2207243511090397e+01 -2.6757649464936929e+01 + 13 -5.2283270287937697e+01 3.4023912643812679e+01 -1.9030352703627774e+01 + 14 8.4403128243303399e+01 -9.3773678297574406e+01 1.6481720093363641e+01 + 15 -4.2790833192154764e+01 -4.3242943642279130e+01 7.1075696811865868e+01 + 16 -1.5041912007490836e+01 -3.3544044565611586e+01 2.4823109532967212e+01 + 17 -9.6413207346836316e-01 4.5826021602656141e+01 -3.9155163702194102e+01 + 18 -2.0337015515785971e+01 7.2815285567550134e+00 
-8.2049879725129813e+00 + 19 -6.4105384732081120e+01 1.1564665740933788e+02 2.4163791756721466e+01 + 20 8.5723654185276146e+01 8.3354105531647818e+01 -6.6380939444134356e+01 + 21 7.2614253221132458e+01 -1.0858997173537107e+02 -9.7505297776024449e+00 + 22 -7.0420361713052930e+00 5.3431098224890221e+01 3.3089063930822551e+00 + 23 -2.6591358240682062e+01 5.7408565880721866e+01 2.7437106471305679e+01 + 24 -4.1792038450554799e+01 5.1730557789864775e+01 -4.0814677464080816e+01 + 25 -4.1432062506590214e+01 -2.5839213423062226e+01 4.2240164846210408e+00 + 26 4.7210066329871566e+01 -5.2462761136081880e+01 -7.3222050314410501e+00 + 27 7.1880187551772764e+01 6.4264938765955392e+01 4.3600944370341068e+01 + 28 -8.4540787660053340e+01 -3.5402262816619938e+01 -1.8100280797937039e+01 + 29 6.9538301274653790e+01 -6.3441028093040622e+01 -1.4636386232064458e+01 + 30 -1.0347208112535196e+01 -1.7647584813608077e+02 7.2581082578181517e+00 + 31 5.5139777976761025e+01 -4.2081916983382541e+01 -4.6602437208067727e+01 + 32 -1.0993230999577290e+02 3.4110056387297462e+01 1.8478090262857769e+01 +...