From d3712787651da28e62bf83f4cf8c88af91c111ad Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 11 Feb 2021 07:54:28 -0500 Subject: [PATCH 001/116] correct dump style cfg label generation --- src/dump_cfg.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/dump_cfg.cpp b/src/dump_cfg.cpp index ed8df72096..b4e6af90cf 100644 --- a/src/dump_cfg.cpp +++ b/src/dump_cfg.cpp @@ -75,7 +75,8 @@ DumpCFG::DumpCFG(LAMMPS *lmp, int narg, char **arg) : if (argi.get_dim() == 1) { std::string newarg(std::to_string(earg[iarg][0])); - newarg += '_' + argi.get_name() + '_' + std::to_string(argi.get_index1()); + newarg += std::string("_") + argi.get_name(); + newarg += std::string("_") + std::to_string(argi.get_index1()); auxname[i] = new char[newarg.size()+1]; strcpy(auxname[i],newarg.c_str()); } else { From 7da64cba891b4e3272655f9a6bc367402f01a6e7 Mon Sep 17 00:00:00 2001 From: Plimpton Date: Wed, 10 Feb 2021 16:21:25 -0700 Subject: [PATCH 002/116] fix issues with multiple uses of create_bonds command --- doc/src/create_bonds.rst | 10 ++++++++++ src/create_bonds.cpp | 2 +- src/neigh_request.cpp | 2 ++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/src/create_bonds.rst b/doc/src/create_bonds.rst index b69fd909f0..056b60c7aa 100644 --- a/doc/src/create_bonds.rst +++ b/doc/src/create_bonds.rst @@ -125,6 +125,16 @@ cannot appear in the neighbor list, to avoid creation of duplicate bonds. The neighbor list for all atom type pairs must also extend to a distance that encompasses the *rmax* for new bonds to create. +.. note:: + + If you want to create bonds between pairs of 1-3 or 1-4 atoms in + the current bond topology, then you need to use :doc:`special_bonds + lj 0 1 1 ` to insure those pairs appear in the + neighbor list. They will not appear with the default special_bonds + settings which are zero for 1-2, 1-3, and 1-4 atoms. 1-3 or 1-4 + atoms are those which are 2 hops or 3 hops apart in the bond + topology. + An additional requirement for this style is that your system must be ready to perform a simulation. 
This means, for example, that all :doc:`pair_style ` coefficients be set via the diff --git a/src/create_bonds.cpp b/src/create_bonds.cpp index 7ee17bcfcc..e5274d2cf8 100644 --- a/src/create_bonds.cpp +++ b/src/create_bonds.cpp @@ -233,7 +233,7 @@ void CreateBonds::many() // build neighbor list this command needs based on earlier request NeighList *list = neighbor->lists[irequest]; - neighbor->build_one(list); + neighbor->build_one(list,1); // loop over all neighs of each atom // compute distance between two atoms consistently on both procs diff --git a/src/neigh_request.cpp b/src/neigh_request.cpp index 0d4818fbe1..2339783d14 100644 --- a/src/neigh_request.cpp +++ b/src/neigh_request.cpp @@ -225,6 +225,8 @@ void NeighRequest::copy_request(NeighRequest *other, int skipflag) int i,j; int ntypes = atom->ntypes; + skip = other->skip; + if (other->iskip) { iskip = new int[ntypes+1]; for (i = 1; i <= ntypes; i++) From 258452d1d4ba18604ed4e413996eb525a44a0d09 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 10 Feb 2021 18:40:25 -0500 Subject: [PATCH 003/116] whitespace --- src/neigh_request.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neigh_request.cpp b/src/neigh_request.cpp index 2339783d14..8c8168952e 100644 --- a/src/neigh_request.cpp +++ b/src/neigh_request.cpp @@ -226,7 +226,7 @@ void NeighRequest::copy_request(NeighRequest *other, int skipflag) int ntypes = atom->ntypes; skip = other->skip; - + if (other->iskip) { iskip = new int[ntypes+1]; for (i = 1; i <= ntypes; i++) From d83827508fd4c66737318f44c1e21728651d22af Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 10 Feb 2021 20:14:19 -0500 Subject: [PATCH 004/116] use neighbor->nrequest to be safer, since neighbor->nlist may be larger --- src/USER-INTEL/npair_skip_intel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/USER-INTEL/npair_skip_intel.cpp b/src/USER-INTEL/npair_skip_intel.cpp index 4f6648ddc1..53900f116f 100644 --- a/src/USER-INTEL/npair_skip_intel.cpp +++ b/src/USER-INTEL/npair_skip_intel.cpp @@ -55,8 +55,8 @@ void NPairSkipIntel::copy_neighbor_info() { NPair::copy_neighbor_info(); if (_full_props) delete []_full_props; - _full_props = new int[neighbor->nlist]; - for (int i = 0; i < neighbor->nlist; i++) + _full_props = new int[neighbor->nrequest]; + for (int i = 0; i < neighbor->nrequest; i++) _full_props[i] = neighbor->requests[i]->full; } From bd547a3c4285549d822aaf30c23636a442898ad7 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 6 Feb 2021 18:28:18 -0500 Subject: [PATCH 005/116] Step version strings for next patch release --- doc/lammps.1 | 2 +- src/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/lammps.1 b/doc/lammps.1 index 299f8538b0..12cff4eeec 100644 --- a/doc/lammps.1 +++ b/doc/lammps.1 @@ -1,4 +1,4 @@ -.TH LAMMPS "24 December 2020" "2020-12-24" +.TH LAMMPS "9 February 2021" "2021-02-09" .SH NAME .B LAMMPS \- Molecular Dynamics Simulator. 
diff --git a/src/version.h b/src/version.h index f812b62821..c04929c145 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "24 Dec 2020" +#define LAMMPS_VERSION "9 Feb 2021" From a742935817e891e99a672befae9ba999e0a21528 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 9 Feb 2021 21:34:08 -0500 Subject: [PATCH 006/116] change version strings to 10 Feb 2021 --- doc/lammps.1 | 2 +- src/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/lammps.1 b/doc/lammps.1 index 12cff4eeec..9351ba5636 100644 --- a/doc/lammps.1 +++ b/doc/lammps.1 @@ -1,4 +1,4 @@ -.TH LAMMPS "9 February 2021" "2021-02-09" +.TH LAMMPS "10 February 2021" "2021-02-10" .SH NAME .B LAMMPS \- Molecular Dynamics Simulator. diff --git a/src/version.h b/src/version.h index c04929c145..84541d4456 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "9 Feb 2021" +#define LAMMPS_VERSION "10 Feb 2021" From 45ba0bd3133c265f3020aab7d89654496114ef0f Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:24:29 -0600 Subject: [PATCH 007/116] refactor kim commands by @akohlmey --- src/KIM/kim_init.h | 7 ------- src/KIM/kim_interactions.h | 7 ------- src/KIM/kim_param.h | 7 ------- src/KIM/kim_property.cpp | 4 ++-- src/KIM/kim_property.h | 11 ++--------- src/KIM/kim_query.h | 7 ------- 6 files changed, 4 insertions(+), 39 deletions(-) diff --git a/src/KIM/kim_init.h b/src/KIM/kim_init.h index 8fa3247b22..6937ab7677 100644 --- a/src/KIM/kim_init.h +++ b/src/KIM/kim_init.h @@ -56,12 +56,6 @@ Designed for use with the kim-api-2.1.0 (and newer) package ------------------------------------------------------------------------- */ -#ifdef COMMAND_CLASS - -CommandStyle(kim_init,KimInit) - -#else - #ifndef LMP_KIM_INIT_H #define LMP_KIM_INIT_H @@ -89,7 +83,6 @@ class KimInit : protected Pointers { } -#endif #endif /* ERROR/WARNING messages: diff --git a/src/KIM/kim_interactions.h b/src/KIM/kim_interactions.h index 071e5b284f..8790f2df14 100644 --- a/src/KIM/kim_interactions.h +++ b/src/KIM/kim_interactions.h @@ -56,12 +56,6 @@ Designed for use with the kim-api-2.1.0 (and newer) package ------------------------------------------------------------------------- */ -#ifdef COMMAND_CLASS - -CommandStyle(kim_interactions,KimInteractions) - -#else - #ifndef LMP_KIM_INTERACTIONS_H #define LMP_KIM_INTERACTIONS_H @@ -81,7 +75,6 @@ class KimInteractions : protected Pointers { } -#endif #endif /* ERROR/WARNING messages: diff --git a/src/KIM/kim_param.h b/src/KIM/kim_param.h index 3e20207cca..bfc27a71bf 100644 --- a/src/KIM/kim_param.h +++ b/src/KIM/kim_param.h @@ -55,12 +55,6 @@ Designed for use with the kim-api-2.1.0 (and newer) package ------------------------------------------------------------------------- */ -#ifdef COMMAND_CLASS - -CommandStyle(kim_param, KimParam) - -#else - #ifndef LMP_KIM_PARAM_H #define LMP_KIM_PARAM_H @@ -82,7 +76,6 @@ public: } // namespace LAMMPS_NS #endif // LMP_KIM_PARAM_H -#endif // COMMAND_CLASS /* ERROR/WARNING messages: diff --git a/src/KIM/kim_property.cpp b/src/KIM/kim_property.cpp index 17d8778c7a..3fb46d442f 100644 --- a/src/KIM/kim_property.cpp +++ b/src/KIM/kim_property.cpp @@ -70,7 +70,7 @@ using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ -kimProperty::kimProperty(LAMMPS *lmp) : Pointers(lmp) +KimProperty::KimProperty(LAMMPS *lmp) : Pointers(lmp) { // one-time initialization of Python interpreter python->init(); @@ -82,7 +82,7 @@ 
kimProperty::kimProperty(LAMMPS *lmp) : Pointers(lmp) } } -void kimProperty::command(int narg, char **arg) +void KimProperty::command(int narg, char **arg) { #if LMP_PYTHON #if PY_MAJOR_VERSION >= 3 diff --git a/src/KIM/kim_property.h b/src/KIM/kim_property.h index ff5faa6781..11729433b5 100644 --- a/src/KIM/kim_property.h +++ b/src/KIM/kim_property.h @@ -53,12 +53,6 @@ Designed for use with the kim-api-2.1.0 (and newer) package ------------------------------------------------------------------------- */ -#ifdef COMMAND_CLASS - -CommandStyle(kim_property, kimProperty) - -#else - #ifndef LMP_KIM_PROPERTY_H #define LMP_KIM_PROPERTY_H @@ -67,10 +61,10 @@ CommandStyle(kim_property, kimProperty) namespace LAMMPS_NS { -class kimProperty : protected Pointers +class KimProperty : protected Pointers { public: - kimProperty(class LAMMPS *lmp); + KimProperty(class LAMMPS *lmp); void command(int, char **); }; @@ -78,7 +72,6 @@ public: } // namespace LAMMPS_NS #endif // LMP_KIM_PROPERTY_H -#endif // COMMAND_CLASS /* ERROR/WARNING messages: diff --git a/src/KIM/kim_query.h b/src/KIM/kim_query.h index f2523f5a98..ce59e2f67f 100644 --- a/src/KIM/kim_query.h +++ b/src/KIM/kim_query.h @@ -55,12 +55,6 @@ Designed for use with the kim-api-2.1.0 (and newer) package ------------------------------------------------------------------------- */ -#ifdef COMMAND_CLASS - -CommandStyle(kim_query,KimQuery) - -#else - #ifndef LMP_KIM_QUERY_H #define LMP_KIM_QUERY_H @@ -76,7 +70,6 @@ class KimQuery : protected Pointers { } -#endif #endif /* ERROR/WARNING messages: From 856c9064fb9633629dfd3882c2a549175b9585e8 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 06:16:19 -0600 Subject: [PATCH 008/116] prototype implementation for KIM wrapper command by @akohlmey --- src/KIM/kim_command.cpp | 96 +++++++++++++++++++++++++++++++++++++++++ src/KIM/kim_command.h | 83 +++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 src/KIM/kim_command.cpp create mode 100644 src/KIM/kim_command.h diff --git a/src/KIM/kim_command.cpp b/src/KIM/kim_command.cpp new file mode 100644 index 0000000000..699aa4371b --- /dev/null +++ b/src/KIM/kim_command.cpp @@ -0,0 +1,96 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Axel Kohlmeyer (Temple U) +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program; if not, see . + + Linking LAMMPS statically or dynamically with other modules is making a + combined work based on LAMMPS. Thus, the terms and conditions of the GNU + General Public License cover the whole combination. + + In addition, as a special exception, the copyright holders of LAMMPS give + you permission to combine LAMMPS with free software programs or libraries + that are released under the GNU LGPL and with code included in the standard + release of the "kim-api" under the CDDL (or modified versions of such code, + with unchanged license). You may copy and distribute such a system following + the terms of the GNU GPL for LAMMPS and the licenses of the other code + concerned, provided that you include the source code of that other code + when and as the GNU GPL requires distribution of source code. + + Note that people who make modified versions of LAMMPS are not obligated to + grant this special exception for their modified versions; it is their choice + whether to do so. The GNU General Public License gives permission to release + a modified version without this exception; this exception also makes it + possible to release a modified version which carries forward this exception. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Designed for use with the kim-api-2.1.0 (and newer) package +------------------------------------------------------------------------- */ + +#include "kim_command.h" + +#include "error.h" + +// include KIM sub-command headers here +#include "kim_init.h" +#include "kim_interactions.h" +#include "kim_param.h" +#include "kim_property.h" +#include "kim_query.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +void KimCommand::command(int narg, char **arg) +{ + if (narg < 1) error->all(FLERR,"Illegal kim command"); + + const std::string subcmd(arg[0]); + narg--; + arg++; + + if (subcmd == "init") { + KimInit *cmd = new KimInit(lmp); + cmd->command(narg,arg); + } else if (subcmd == "interactions") { + KimInteractions *cmd = new KimInteractions(lmp); + cmd->command(narg,arg); + } else if (subcmd == "param") { + KimParam *cmd = new KimParam(lmp); + cmd->command(narg,arg); + } else if (subcmd == "property") { + KimProperty *cmd = new KimProperty(lmp); + cmd->command(narg,arg); + } else if (subcmd == "query") { + KimQuery *cmd = new KimQuery(lmp); + cmd->command(narg,arg); + } else error->all(FLERR,fmt::format("Unknown kim subcommand {}",subcmd)); +} + diff --git a/src/KIM/kim_command.h b/src/KIM/kim_command.h new file mode 100644 index 0000000000..f327e4f2f3 --- /dev/null +++ b/src/KIM/kim_command.h @@ -0,0 +1,83 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Axel Kohlmeyer (Temple U) +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program; if not, see . + + Linking LAMMPS statically or dynamically with other modules is making a + combined work based on LAMMPS. Thus, the terms and conditions of the GNU + General Public License cover the whole combination. + + In addition, as a special exception, the copyright holders of LAMMPS give + you permission to combine LAMMPS with free software programs or libraries + that are released under the GNU LGPL and with code included in the standard + release of the "kim-api" under the CDDL (or modified versions of such code, + with unchanged license). You may copy and distribute such a system following + the terms of the GNU GPL for LAMMPS and the licenses of the other code + concerned, provided that you include the source code of that other code + when and as the GNU GPL requires distribution of source code. + + Note that people who make modified versions of LAMMPS are not obligated to + grant this special exception for their modified versions; it is their choice + whether to do so. The GNU General Public License gives permission to release + a modified version without this exception; this exception also makes it + possible to release a modified version which carries forward this exception. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Designed for use with the kim-api-2.1.0 (and newer) package +------------------------------------------------------------------------- */ + +#ifdef COMMAND_CLASS + +CommandStyle(kim,KimCommand) + +#else + +#ifndef LMP_KIM_COMMAND_H +#define LMP_KIM_COMMAND_H + +#include "pointers.h" + +namespace LAMMPS_NS { + +class KimCommand : protected Pointers { + public: + KimCommand(class LAMMPS *lmp) : Pointers(lmp) {}; + void command(int, char **); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + + +*/ From 0c5b3bc611a0a68e4336f41d5c5c23edc2dc3325 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:28:41 -0600 Subject: [PATCH 009/116] clean up and remove unnecessary comments --- src/KIM/kim_param.cpp | 2 -- src/KIM/kim_param.h | 7 ++----- src/KIM/kim_property.h | 5 ++--- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/KIM/kim_param.cpp b/src/KIM/kim_param.cpp index 1628bb56d9..cef1dea642 100644 --- a/src/KIM/kim_param.cpp +++ b/src/KIM/kim_param.cpp @@ -134,8 +134,6 @@ void get_kim_unit_names( KimParam::KimParam(LAMMPS *lmp) : Pointers(lmp) {} -KimParam::~KimParam() {} - void KimParam::command(int narg, char **arg) { // kim_param is a command for diff --git a/src/KIM/kim_param.h b/src/KIM/kim_param.h index bfc27a71bf..7988e494be 100644 --- a/src/KIM/kim_param.h +++ b/src/KIM/kim_param.h @@ -67,15 +67,12 @@ class KimParam : protected Pointers { public: KimParam(class LAMMPS *lmp); - - ~KimParam(); - void command(int, char **); }; -} // namespace LAMMPS_NS +} -#endif // LMP_KIM_PARAM_H +#endif /* ERROR/WARNING messages: diff --git a/src/KIM/kim_property.h b/src/KIM/kim_property.h index 11729433b5..a804ad573c 100644 --- a/src/KIM/kim_property.h +++ b/src/KIM/kim_property.h @@ -65,13 +65,12 @@ class KimProperty : protected Pointers { public: KimProperty(class LAMMPS *lmp); - void command(int, char **); }; -} // namespace LAMMPS_NS +} -#endif // LMP_KIM_PROPERTY_H +#endif /* ERROR/WARNING messages: From dac21e5c76e92a5eb4b9e47bd8065c9cd883acbb Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:31:48 -0600 Subject: [PATCH 010/116] using unique_ptr to prevent memory leak --- src/KIM/kim_command.cpp | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/KIM/kim_command.cpp b/src/KIM/kim_command.cpp index 699aa4371b..bce1e0d929 100644 --- a/src/KIM/kim_command.cpp +++ b/src/KIM/kim_command.cpp @@ -12,7 +12,8 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Axel Kohlmeyer (Temple U) + Contributing authors: Axel Kohlmeyer (Temple U), + Yaser Afshar (UMN) ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- @@ -64,6 +65,8 @@ #include "kim_property.h" #include "kim_query.h" +#include + using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ @@ -77,20 +80,19 @@ void KimCommand::command(int narg, char **arg) arg++; if (subcmd == "init") { - KimInit *cmd = new KimInit(lmp); - cmd->command(narg,arg); + std::unique_ptr cmd(new KimInit(lmp)); + cmd->command(narg, arg); } else if (subcmd == "interactions") { - KimInteractions *cmd = new KimInteractions(lmp); - cmd->command(narg,arg); 
+ std::unique_ptr cmd(new KimInteractions(lmp)); + cmd->command(narg, arg); } else if (subcmd == "param") { - KimParam *cmd = new KimParam(lmp); - cmd->command(narg,arg); + std::unique_ptr cmd(new KimParam(lmp)); + cmd->command(narg, arg); } else if (subcmd == "property") { - KimProperty *cmd = new KimProperty(lmp); - cmd->command(narg,arg); + std::unique_ptr cmd(new KimProperty(lmp)); + cmd->command(narg, arg); } else if (subcmd == "query") { - KimQuery *cmd = new KimQuery(lmp); - cmd->command(narg,arg); - } else error->all(FLERR,fmt::format("Unknown kim subcommand {}",subcmd)); + std::unique_ptr cmd(new KimQuery(lmp)); + cmd->command(narg, arg); + } else error->all(FLERR, fmt::format("Unknown kim subcommand {}", subcmd)); } - From c3393cfc4bdea96a08fd2090a95652674a25535e Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:38:44 -0600 Subject: [PATCH 011/116] update the error messages to 'kim init' and clean up --- src/KIM/kim_init.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/KIM/kim_init.cpp b/src/KIM/kim_init.cpp index 2d639ede4c..c4e9955ac4 100644 --- a/src/KIM/kim_init.cpp +++ b/src/KIM/kim_init.cpp @@ -81,17 +81,17 @@ using namespace LAMMPS_NS; void KimInit::command(int narg, char **arg) { - if ((narg < 2) || (narg > 3)) error->all(FLERR,"Illegal kim_init command"); + if ((narg < 2) || (narg > 3)) error->all(FLERR,"Illegal 'kim init' command"); if (domain->box_exist) - error->all(FLERR,"Must use 'kim_init' command before " + error->all(FLERR,"Must use 'kim init' command before " "simulation box is defined"); char *model_name = utils::strdup(arg[0]); char *user_units = utils::strdup(arg[1]); if (narg == 3) { if (strcmp(arg[2],"unit_conversion_mode")==0) unit_conversion_mode = true; else { - error->all(FLERR,fmt::format("Illegal kim_init command.\nThe argument " + error->all(FLERR,fmt::format("Illegal 'kim init' command.\nThe argument " "followed by unit_style {} is an optional " "argument and when is used must " "be unit_conversion_mode", user_units)); @@ -283,7 +283,8 @@ void KimInit::determine_model_type_and_units(char * model_name, /* ---------------------------------------------------------------------- */ -void KimInit::do_init(char *model_name, char *user_units, char *model_units, KIM_Model *&pkim) +void KimInit::do_init(char *model_name, char *user_units, char *model_units, + KIM_Model *&pkim) { // create storage proxy fix. delete existing fix, if needed. 
@@ -298,7 +299,8 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units, KIM fix_store->setptr("model_units", (void *) model_units); // Begin output to log file - input->write_echo("#=== BEGIN kim-init ==========================================\n"); + input->write_echo("#=== BEGIN kim init ===================================" + "=======\n"); KIM_SimulatorModel * simulatorModel; if (model_type == SM) { @@ -407,7 +409,8 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units, KIM } // End output to log file - input->write_echo("#=== END kim-init ============================================\n\n"); + input->write_echo("#=== END kim init =====================================" + "=======\n\n"); } /* ---------------------------------------------------------------------- */ From c36a52a8f95d703fd58cb85730c85d50ccd2a2a8 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:45:25 -0600 Subject: [PATCH 012/116] clean up and add extra space after comma --- src/KIM/kim_init.cpp | 106 +++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/src/KIM/kim_init.cpp b/src/KIM/kim_init.cpp index c4e9955ac4..43ccfda155 100644 --- a/src/KIM/kim_init.cpp +++ b/src/KIM/kim_init.cpp @@ -81,30 +81,28 @@ using namespace LAMMPS_NS; void KimInit::command(int narg, char **arg) { - if ((narg < 2) || (narg > 3)) error->all(FLERR,"Illegal 'kim init' command"); + if ((narg < 2) || (narg > 3)) error->all(FLERR, "Illegal 'kim init' command"); if (domain->box_exist) - error->all(FLERR,"Must use 'kim init' command before " - "simulation box is defined"); + error->all(FLERR, "Must use 'kim init' command before " + "simulation box is defined"); char *model_name = utils::strdup(arg[0]); char *user_units = utils::strdup(arg[1]); if (narg == 3) { - if (strcmp(arg[2],"unit_conversion_mode")==0) unit_conversion_mode = true; + if (strcmp(arg[2], "unit_conversion_mode")==0) unit_conversion_mode = true; else { - error->all(FLERR,fmt::format("Illegal 'kim init' command.\nThe argument " - "followed by unit_style {} is an optional " - "argument and when is used must " - "be unit_conversion_mode", user_units)); + error->all(FLERR, fmt::format("Illegal 'kim init' command.\nThe argument " + "followed by unit_style {} is an optional " + "argument and when is used must " + "be unit_conversion_mode", user_units)); } } else unit_conversion_mode = false; char *model_units; KIM_Model *pkim = nullptr; - if (universe->me == 0) - std::remove("kim.log"); - if (universe->nprocs > 1) - MPI_Barrier(universe->uworld); + if (universe->me == 0) std::remove("kim.log"); + if (universe->nprocs > 1) MPI_Barrier(universe->uworld); determine_model_type_and_units(model_name, user_units, &model_units, pkim); @@ -125,43 +123,43 @@ void get_kim_unit_names( KIM_TimeUnit & timeUnit, Error * error) { - if (strcmp(system,"real") == 0) { + if (strcmp(system, "real") == 0) { lengthUnit = KIM_LENGTH_UNIT_A; energyUnit = KIM_ENERGY_UNIT_kcal_mol; chargeUnit = KIM_CHARGE_UNIT_e; temperatureUnit = KIM_TEMPERATURE_UNIT_K; timeUnit = KIM_TIME_UNIT_fs; - } else if (strcmp(system,"metal") == 0) { + } else if (strcmp(system, "metal") == 0) { lengthUnit = KIM_LENGTH_UNIT_A; energyUnit = KIM_ENERGY_UNIT_eV; chargeUnit = KIM_CHARGE_UNIT_e; temperatureUnit = KIM_TEMPERATURE_UNIT_K; timeUnit = KIM_TIME_UNIT_ps; - } else if (strcmp(system,"si") == 0) { + } else if (strcmp(system, "si") == 0) { lengthUnit = KIM_LENGTH_UNIT_m; energyUnit = KIM_ENERGY_UNIT_J; chargeUnit = 
KIM_CHARGE_UNIT_C; temperatureUnit = KIM_TEMPERATURE_UNIT_K; timeUnit = KIM_TIME_UNIT_s; - } else if (strcmp(system,"cgs") == 0) { + } else if (strcmp(system, "cgs") == 0) { lengthUnit = KIM_LENGTH_UNIT_cm; energyUnit = KIM_ENERGY_UNIT_erg; chargeUnit = KIM_CHARGE_UNIT_statC; temperatureUnit = KIM_TEMPERATURE_UNIT_K; timeUnit = KIM_TIME_UNIT_s; - } else if (strcmp(system,"electron") == 0) { + } else if (strcmp(system, "electron") == 0) { lengthUnit = KIM_LENGTH_UNIT_Bohr; energyUnit = KIM_ENERGY_UNIT_Hartree; chargeUnit = KIM_CHARGE_UNIT_e; temperatureUnit = KIM_TEMPERATURE_UNIT_K; timeUnit = KIM_TIME_UNIT_fs; - } else if (strcmp(system,"lj") == 0 || - strcmp(system,"micro") ==0 || - strcmp(system,"nano")==0) { - error->all(FLERR,fmt::format("LAMMPS unit_style {} not supported " - "by KIM models", system)); + } else if (strcmp(system, "lj") == 0 || + strcmp(system, "micro") ==0 || + strcmp(system, "nano")==0) { + error->all(FLERR, fmt::format("LAMMPS unit_style {} not supported " + "by KIM models", system)); } else { - error->all(FLERR,"Unknown unit_style"); + error->all(FLERR, "Unknown unit_style"); } } } // namespace @@ -182,13 +180,13 @@ void KimInit::determine_model_type_and_units(char * model_name, int kim_error = KIM_Collections_Create(&collections); if (kim_error) - error->all(FLERR,"Unable to access KIM Collections to find Model"); + error->all(FLERR, "Unable to access KIM Collections to find Model"); auto logID = fmt::format("{}_Collections", comm->me); KIM_Collections_SetLogID(collections, logID.c_str()); kim_error = KIM_Collections_GetItemType(collections, model_name, &itemType); - if (kim_error) error->all(FLERR,"KIM Model name not found"); + if (kim_error) error->all(FLERR, "KIM Model name not found"); KIM_Collections_Destroy(&collections); if (KIM_CollectionItemType_Equal(itemType, @@ -205,7 +203,7 @@ void KimInit::determine_model_type_and_units(char * model_name, &units_accepted, &pkim); - if (kim_error) error->all(FLERR,"Unable to load KIM Simulator Model"); + if (kim_error) error->all(FLERR, "Unable to load KIM Simulator Model"); model_type = MO; @@ -239,17 +237,17 @@ void KimInit::determine_model_type_and_units(char * model_name, } KIM_Model_Destroy(&pkim); } - error->all(FLERR,"KIM Model does not support any lammps unit system"); + error->all(FLERR, "KIM Model does not support any lammps unit system"); } else { KIM_Model_Destroy(&pkim); - error->all(FLERR,"KIM Model does not support the requested unit system"); + error->all(FLERR, "KIM Model does not support the requested unit system"); } } else if (KIM_CollectionItemType_Equal( itemType, KIM_COLLECTION_ITEM_TYPE_simulatorModel)) { KIM_SimulatorModel * simulatorModel; kim_error = KIM_SimulatorModel_Create(model_name, &simulatorModel); if (kim_error) - error->all(FLERR,"Unable to load KIM Simulator Model"); + error->all(FLERR, "Unable to load KIM Simulator Model"); model_type = SM; logID = fmt::format("{}_SimulatorModel", comm->me); @@ -265,7 +263,7 @@ void KimInit::determine_model_type_and_units(char * model_name, KIM_SimulatorModel_GetSimulatorFieldMetadata( simulatorModel, i, &sim_lines, &sim_field); - if (0 == strcmp(sim_field,"units")) { + if (0 == strcmp(sim_field, "units")) { KIM_SimulatorModel_GetSimulatorFieldLine( simulatorModel, i, 0, &sim_value); *model_units = utils::strdup(sim_value); @@ -275,8 +273,8 @@ void KimInit::determine_model_type_and_units(char * model_name, KIM_SimulatorModel_Destroy(&simulatorModel); if ((! 
unit_conversion_mode) && (strcmp(*model_units, user_units)!=0)) { - error->all(FLERR,fmt::format("Incompatible units for KIM Simulator Model" - ", required units = {}", *model_units)); + error->all(FLERR, fmt::format("Incompatible units for KIM Simulator Model" + ", required units = {}", *model_units)); } } } @@ -307,7 +305,7 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units, int kim_error = KIM_SimulatorModel_Create(model_name, &simulatorModel); if (kim_error) - error->all(FLERR,"Unable to load KIM Simulator Model"); + error->all(FLERR, "Unable to load KIM Simulator Model"); auto logID = fmt::format("{}_SimulatorModel", comm->me); KIM_SimulatorModel_SetLogID(simulatorModel, logID.c_str()); @@ -316,8 +314,8 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units, KIM_SimulatorModel_GetSimulatorNameAndVersion( simulatorModel, &sim_name, &sim_version); - if (0 != strcmp(sim_name,"LAMMPS")) - error->all(FLERR,"Incompatible KIM Simulator Model"); + if (0 != strcmp(sim_name, "LAMMPS")) + error->all(FLERR, "Incompatible KIM Simulator Model"); if (comm->me == 0) { std::string mesg("# Using KIM Simulator Model : "); @@ -330,7 +328,7 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units, mesg += "\n"; mesg += "#\n"; - utils::logmesg(lmp,mesg); + utils::logmesg(lmp, mesg); } fix_store->setptr("simulator_model", (void *) simulatorModel); @@ -358,11 +356,11 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units, for (int i=0; i < sim_fields; ++i) { KIM_SimulatorModel_GetSimulatorFieldMetadata( - simulatorModel,i,&sim_lines,&sim_field); - if (0 == strcmp(sim_field,"model-init")) { + simulatorModel, i, &sim_lines, &sim_field); + if (0 == strcmp(sim_field, "model-init")) { for (int j=0; j < sim_lines; ++j) { KIM_SimulatorModel_GetSimulatorFieldLine( - simulatorModel,i,j,&sim_value); + simulatorModel, i, j, &sim_value); input->one(sim_value); } break; @@ -390,17 +388,17 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units, &extent, &str_name, &str_desc); max_len = MAX(max_len, (int)strlen(str_name)); } - max_len = MAX(18,max_len+1); + max_len = MAX(18, max_len + 1); mesg += fmt::format(" No. | {:<{}} | data type | extent\n", "Parameter name", max_len); - mesg += fmt::format("{:-<{}}\n","-",max_len+35); + mesg += fmt::format("{:-<{}}\n", "-", max_len + 35); for (int i = 0; i < numberOfParameters; ++i) { KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType, &extent, &str_name, &str_desc); auto data_type = std::string("\""); data_type += KIM_DataType_ToString(kim_DataType) + std::string("\""); - mesg += fmt::format(" {:<8} | {:<{}} | {:<10} | {}\n",i+1,str_name, - max_len,data_type,extent); + mesg += fmt::format(" {:<8} | {:<{}} | {:<10} | {}\n", i + 1, str_name, + max_len, data_type, extent); } } else mesg += "No mutable parameters.\n"; @@ -420,7 +418,7 @@ void KimInit::do_variables(const std::string &from, const std::string &to) // refuse conversion from or to reduced units if ((from == "lj") || (to == "lj")) - error->all(FLERR,"Cannot set up conversion variables for 'lj' units"); + error->all(FLERR, "Cannot set up conversion variables for 'lj' units"); // get index to internal style variables. create, if needed. // set conversion factors for newly created variables. 
@@ -445,7 +443,7 @@ void KimInit::do_variables(const std::string &from, const std::string &to) nullptr}; input->write_echo(fmt::format("# Conversion factors from {} to {}:\n", - from,to)); + from, to)); auto variable = input->variable; for (int i = 0; units[i] != nullptr; ++i) { @@ -455,16 +453,14 @@ void KimInit::do_variables(const std::string &from, const std::string &to) variable->set(var_str + " internal 1.0"); v_unit = variable->find(var_str.c_str()); } - ier = lammps_unit_conversion(units[i], - from, - to, + ier = lammps_unit_conversion(units[i], from, to, conversion_factor); if (ier != 0) - error->all(FLERR,fmt::format("Unable to obtain conversion factor: " - "unit = {}; from = {}; to = {}", - units[i], from, to)); + error->all(FLERR, fmt::format("Unable to obtain conversion factor: " + "unit = {}; from = {}; to = {}", + units[i], from, to)); - variable->internal_set(v_unit,conversion_factor); + variable->internal_set(v_unit, conversion_factor); input->write_echo(fmt::format("variable {:<15s} internal {:<15.12e}\n", var_str, conversion_factor)); } @@ -486,13 +482,13 @@ void KimInit::write_log_cite(char *model_name) if (model_type == MO) { err = KIM_Collections_CacheListOfItemMetadataFiles( collections, KIM_COLLECTION_ITEM_TYPE_portableModel, - model_name,&extent); + model_name, &extent); } else if (model_type == SM) { err = KIM_Collections_CacheListOfItemMetadataFiles( collections, KIM_COLLECTION_ITEM_TYPE_simulatorModel, model_name, &extent); } else { - error->all(FLERR,"Unknown model type"); + error->all(FLERR, "Unknown model type"); } if (err) { @@ -509,7 +505,7 @@ void KimInit::write_log_cite(char *model_name) &availableAsString, &fileString); if (err) continue; - if (0 == strncmp("kimcite",fileName,7)) { + if (0 == strncmp("kimcite", fileName, 7)) { if ((lmp->citeme) && (availableAsString)) lmp->citeme->add(fileString); } } From e5efe21d90e44f819de531e78b361ea4a89fe99e Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:46:50 -0600 Subject: [PATCH 013/116] update the error messages to 'kim interactions' and clean up the code --- src/KIM/kim_interactions.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/KIM/kim_interactions.cpp b/src/KIM/kim_interactions.cpp index afb1391606..626fd2b4ba 100644 --- a/src/KIM/kim_interactions.cpp +++ b/src/KIM/kim_interactions.cpp @@ -83,10 +83,10 @@ using namespace LAMMPS_NS; void KimInteractions::command(int narg, char **arg) { - if (narg < 1) error->all(FLERR,"Illegal kim_interactions command"); + if (narg < 1) error->all(FLERR,"Illegal 'kim interactions' command"); if (!domain->box_exist) - error->all(FLERR,"Must use 'kim_interactions' command after " + error->all(FLERR,"Must use 'kim interactions' command after " "simulation box is defined"); do_setup(narg,arg); @@ -100,10 +100,10 @@ void KimInteractions::do_setup(int narg, char **arg) if ((narg == 1) && (0 == strcmp("fixed_types",arg[0]))) { fixed_types = true; } else if (narg != atom->ntypes) { - error->all(FLERR,fmt::format("Illegal kim_interactions command.\nThe " + error->all(FLERR,fmt::format("Illegal 'kim interactions' command.\nThe " "LAMMPS simulation has {} atom type(s), but " "{} chemical species passed to the " - "kim_interactions command", + "'kim interactions' command", atom->ntypes, narg)); } else { fixed_types = false; @@ -112,7 +112,7 @@ void KimInteractions::do_setup(int narg, char **arg) char *model_name = nullptr; KIM_SimulatorModel *simulatorModel(nullptr); - // check if we had a kim_init command by finding 
fix STORE/KIM + // check if we had a kim init command by finding fix STORE/KIM // retrieve model name and pointer to simulator model class instance. // validate model name if not given as null pointer. @@ -121,10 +121,11 @@ void KimInteractions::do_setup(int narg, char **arg) FixStoreKIM *fix_store = (FixStoreKIM *) modify->fix[ifix]; model_name = (char *)fix_store->getptr("model_name"); simulatorModel = (KIM_SimulatorModel *)fix_store->getptr("simulator_model"); - } else error->all(FLERR,"Must use 'kim_init' before 'kim_interactions'"); + } else error->all(FLERR,"Must use 'kim init' before 'kim interactions'"); // Begin output to log file - input->write_echo("#=== BEGIN kim_interactions ==================================\n"); + input->write_echo("#=== BEGIN kim interactions ===========================" + "=======\n"); if (simulatorModel) { if (!fixed_types) { @@ -211,7 +212,7 @@ void KimInteractions::do_setup(int narg, char **arg) // * This is an INTERNAL command. // * It is intended for use only by KIM Simulator Models. // * It is not possible to use this command outside of the context - // of the kim_interactions command and KIM Simulator Models. + // of the kim interactions command and KIM Simulator Models. // * The command performs a transformation from symbolic // string-based atom types to lammps numeric atom types for // the pair_coeff and charge settings. @@ -250,7 +251,8 @@ void KimInteractions::do_setup(int narg, char **arg) } // End output to log file - input->write_echo("#=== END kim_interactions ====================================\n\n"); + input->write_echo("#=== END kim interactions =============================" + "=======\n\n"); } /* ---------------------------------------------------------------------- */ From 98e734845eb21e7ad5e801206b247c0b7911856e Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:49:35 -0600 Subject: [PATCH 014/116] clean up and add extra space after comma --- src/KIM/kim_interactions.cpp | 66 ++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/KIM/kim_interactions.cpp b/src/KIM/kim_interactions.cpp index 626fd2b4ba..59fc4d3f25 100644 --- a/src/KIM/kim_interactions.cpp +++ b/src/KIM/kim_interactions.cpp @@ -83,13 +83,13 @@ using namespace LAMMPS_NS; void KimInteractions::command(int narg, char **arg) { - if (narg < 1) error->all(FLERR,"Illegal 'kim interactions' command"); + if (narg < 1) error->all(FLERR, "Illegal 'kim interactions' command"); if (!domain->box_exist) - error->all(FLERR,"Must use 'kim interactions' command after " - "simulation box is defined"); + error->all(FLERR, "Must use 'kim interactions' command after " + "simulation box is defined"); - do_setup(narg,arg); + do_setup(narg, arg); } /* ---------------------------------------------------------------------- */ @@ -97,14 +97,14 @@ void KimInteractions::command(int narg, char **arg) void KimInteractions::do_setup(int narg, char **arg) { bool fixed_types; - if ((narg == 1) && (0 == strcmp("fixed_types",arg[0]))) { + if ((narg == 1) && (0 == strcmp("fixed_types", arg[0]))) { fixed_types = true; } else if (narg != atom->ntypes) { - error->all(FLERR,fmt::format("Illegal 'kim interactions' command.\nThe " - "LAMMPS simulation has {} atom type(s), but " - "{} chemical species passed to the " - "'kim interactions' command", - atom->ntypes, narg)); + error->all(FLERR, fmt::format("Illegal 'kim interactions' command.\nThe " + "LAMMPS simulation has {} atom type(s), but " + "{} chemical species passed to the " + "'kim 
interactions' command", + atom->ntypes, narg)); } else { fixed_types = false; } @@ -121,7 +121,7 @@ void KimInteractions::do_setup(int narg, char **arg) FixStoreKIM *fix_store = (FixStoreKIM *) modify->fix[ifix]; model_name = (char *)fix_store->getptr("model_name"); simulatorModel = (KIM_SimulatorModel *)fix_store->getptr("simulator_model"); - } else error->all(FLERR,"Must use 'kim init' before 'kim interactions'"); + } else error->all(FLERR, "Must use 'kim init' before 'kim interactions'"); // Begin output to log file input->write_echo("#=== BEGIN kim interactions ===========================" @@ -150,7 +150,7 @@ void KimInteractions::do_setup(int narg, char **arg) bool species_is_supported; char const *sim_species; KIM_SimulatorModel_GetNumberOfSupportedSpecies( - simulatorModel,&sim_num_species); + simulatorModel, &sim_num_species); for (auto atom_type_sym : utils::split_words(atom_type_sym_list)) { species_is_supported = false; @@ -161,8 +161,8 @@ void KimInteractions::do_setup(int narg, char **arg) if (atom_type_sym == sim_species) species_is_supported = true; } if (!species_is_supported) { - error->all(FLERR,fmt::format("Species '{}' is not supported by this " - "KIM Simulator Model", atom_type_sym)); + error->all(FLERR, fmt::format("Species '{}' is not supported by this " + "KIM Simulator Model", atom_type_sym)); } } } else { @@ -178,10 +178,10 @@ void KimInteractions::do_setup(int narg, char **arg) KIM_SimulatorModel_GetSimulatorFieldMetadata( simulatorModel, i, &sim_lines, &sim_field); - if (strcmp(sim_field,"units") == 0) { + if (strcmp(sim_field, "units") == 0) { KIM_SimulatorModel_GetSimulatorFieldLine( simulatorModel, i, 0, &sim_value); - if (strcmp(sim_value,update->unit_style) != 0) + if (strcmp(sim_value, update->unit_style) != 0) error->all(FLERR,"Incompatible units for KIM Simulator Model"); } } @@ -190,7 +190,7 @@ void KimInteractions::do_setup(int narg, char **arg) for (int i = 0; i < sim_fields; ++i) { KIM_SimulatorModel_GetSimulatorFieldMetadata( simulatorModel, i, &sim_lines, &sim_field); - if (strcmp(sim_field,"model-defn") == 0) { + if (strcmp(sim_field, "model-defn") == 0) { if (domain->periodicity[0]&& domain->periodicity[1]&& domain->periodicity[2]) @@ -207,7 +207,7 @@ void KimInteractions::do_setup(int narg, char **arg) for (int j = 0; j < sim_lines; ++j) { KIM_SimulatorModel_GetSimulatorFieldLine( simulatorModel, i, j, &sim_value); - if (utils::strmatch(sim_value,"^KIM_SET_TYPE_PARAMETERS")) { + if (utils::strmatch(sim_value, "^KIM_SET_TYPE_PARAMETERS")) { // Notes regarding the KIM_SET_TYPE_PARAMETERS command // * This is an INTERNAL command. // * It is intended for use only by KIM Simulator Models. @@ -228,7 +228,7 @@ void KimInteractions::do_setup(int narg, char **arg) } if (no_model_definition) - error->all(FLERR,"KIM Simulator Model has no Model definition"); + error->all(FLERR, "KIM Simulator Model has no Model definition"); KIM_SimulatorModel_OpenAndInitializeTemplateMap(simulatorModel); @@ -237,7 +237,7 @@ void KimInteractions::do_setup(int narg, char **arg) // not a simulator model. issue pair_style and pair_coeff commands. if (fixed_types) - error->all(FLERR,"fixed_types cannot be used with a KIM Portable Model"); + error->all(FLERR, "fixed_types cannot be used with a KIM Portable Model"); // NOTE: all references to arg must appear before calls to input->one() // as that will reset the argument vector. 
@@ -263,18 +263,18 @@ void KimInteractions::KIM_SET_TYPE_PARAMETERS(const std::string &input_line) con const std::string key = words[1]; if (key != "pair" && key != "charge") - error->one(FLERR,fmt::format("Unrecognized KEY {} for " - "KIM_SET_TYPE_PARAMETERS command", key)); + error->one(FLERR, fmt::format("Unrecognized KEY {} for " + "KIM_SET_TYPE_PARAMETERS command", key)); std::string filename = words[2]; - std::vector species(words.begin()+3,words.end()); + std::vector species(words.begin() + 3, words.end()); if ((int)species.size() != atom->ntypes) - error->one(FLERR,"Incorrect args for KIM_SET_TYPE_PARAMETERS command"); + error->one(FLERR, "Incorrect args for KIM_SET_TYPE_PARAMETERS command"); FILE *fp = nullptr; if (comm->me == 0) { - fp = fopen(filename.c_str(),"r"); - if (fp == nullptr) error->one(FLERR,"Parameter file not found"); + fp = fopen(filename.c_str(), "r"); + if (fp == nullptr) error->one(FLERR, "Parameter file not found"); } char line[MAXLINE], *ptr; @@ -282,16 +282,16 @@ void KimInteractions::KIM_SET_TYPE_PARAMETERS(const std::string &input_line) con while (1) { if (comm->me == 0) { - ptr = fgets(line,MAXLINE,fp); + ptr = fgets(line, MAXLINE,fp); if (ptr == nullptr) { eof = 1; fclose(fp); } else n = strlen(line) + 1; } - MPI_Bcast(&eof,1,MPI_INT,0,world); + MPI_Bcast(&eof, 1, MPI_INT, 0, world); if (eof) break; - MPI_Bcast(&n,1,MPI_INT,0,world); - MPI_Bcast(line,n,MPI_CHAR,0,world); + MPI_Bcast(&n, 1, MPI_INT, 0, world); + MPI_Bcast(line, n, MPI_CHAR, 0, world); auto trimmed = utils::trim_comment(line); if (trimmed.find_first_not_of(" \t\n\r") == std::string::npos) continue; @@ -302,13 +302,13 @@ void KimInteractions::KIM_SET_TYPE_PARAMETERS(const std::string &input_line) con for (int ib = ia; ib < atom->ntypes; ++ib) if (((species[ia] == words[0]) && (species[ib] == words[1])) || ((species[ib] == words[0]) && (species[ia] == words[1]))) - input->one(fmt::format("pair_coeff {} {} {}",ia+1,ib+1, - fmt::join(words.begin()+2,words.end()," "))); + input->one(fmt::format("pair_coeff {} {} {}", ia + 1, ib + 1, + fmt::join(words.begin() + 2, words.end(), " "))); } } else { for (int ia = 0; ia < atom->ntypes; ++ia) if (species[ia] == words[0]) - input->one(fmt::format("set type {} charge {}",ia+1,words[1])); + input->one(fmt::format("set type {} charge {}", ia + 1, words[1])); } } } From 265650d97cd04667d680284fef8d33e06fd1e00c Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:50:37 -0600 Subject: [PATCH 015/116] update the error messages to 'kim param' and clean up the code --- src/KIM/kim_param.cpp | 56 +++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/KIM/kim_param.cpp b/src/KIM/kim_param.cpp index cef1dea642..9b9909b623 100644 --- a/src/KIM/kim_param.cpp +++ b/src/KIM/kim_param.cpp @@ -136,30 +136,29 @@ KimParam::KimParam(LAMMPS *lmp) : Pointers(lmp) {} void KimParam::command(int narg, char **arg) { - // kim_param is a command for + // kim param is a command for // getting/setting the value of a %KIM PM parameter // - // kim_param get param_name index_range variables formatarg - // kim_param set param_name index_range values + // kim param get param_name index_range variables formatarg + // kim param set param_name index_range values + // + // kim param get paramname 1 varname + // kim param get paramname index_range varname_1, ..., varname_N + // kim param get paramname index_range varname_base split + // kim param get paramname index_range varname_base list + // kim param set paramname 
index_range values - // kim_param get paramname 1 varname - // kim_param get paramname index_range varname_1, ..., varname_N - // kim_param get paramname index_range varname_base split - // kim_param get paramname index_range varname_base list - // kim_param set paramname index_range values - - if (narg < 4) - error->all(FLERR, "Illegal kim_param command"); + if (narg < 4) error->all(FLERR, "Illegal 'kim param' command"); std::string kim_param_get_set = arg[0]; if ((kim_param_get_set != "get") && (kim_param_get_set != "set")) { - std::string msg("Incorrect arguments in kim_param command.\n"); - msg += "'kim_param get/set' is mandatory"; + std::string msg("Incorrect arguments in 'kim param' command.\n"); + msg += "'kim param get/set' is mandatory"; error->all(FLERR, msg); } - // Check if we called a kim_init command + // Check if we called a kim init command // by finding fix STORE/KIM // retrieve model name and model units. @@ -178,15 +177,16 @@ void KimParam::command(int narg, char **arg) isPortableModel = simulatorModel ? false : true; if (!isPortableModel) - error->all(FLERR, "kim_param can only be used with a KIM Portable Model"); + error->all(FLERR, + "'kim param' can only be used with a KIM Portable Model"); model_name = (char *)fix_store->getptr("model_name"); model_units = (char *)fix_store->getptr("model_units"); } else - error->all(FLERR, "Must use 'kim_init' before 'kim_param'"); + error->all(FLERR, "Must use 'kim init' before 'kim param'"); - input->write_echo(fmt::format("#=== BEGIN kim-param {} ===================" - "==================\n",kim_param_get_set)); + input->write_echo(fmt::format("#=== BEGIN kim param {} ===================" + "==================\n", kim_param_get_set)); KIM_Model *pkim = nullptr; @@ -214,10 +214,10 @@ void KimParam::command(int narg, char **arg) "no match for kim style in lammps"); } else { if (kim_param_get_set == "set") { - std::string msg("Wrong 'kim_param set' command.\n"); + std::string msg("Wrong 'kim param set' command.\n"); msg += "To set the new parameter values, pair style must "; - msg += "be assigned.\nMust use 'kim_interactions' or"; - msg += "'pair_style kim' before 'kim_param set'"; + msg += "be assigned.\nMust use 'kim interactions' or"; + msg += "'pair_style kim' before 'kim param set'"; error->all(FLERR, msg); } else { KIM_LengthUnit lengthUnit; @@ -287,7 +287,7 @@ void KimParam::command(int narg, char **arg) } if (param_index >= numberOfParameters) { - auto msg = fmt::format("Wrong argument in kim_param get command.\n" + auto msg = fmt::format("Wrong argument in 'kim param get' command.\n" "This Model does not have the requested '{}' " "parameter", paramname); error->all(FLERR, msg); @@ -336,7 +336,7 @@ void KimParam::command(int narg, char **arg) nubound = nlbound; } } else { - std::string msg("Wrong number of arguments in 'kim_param get' "); + std::string msg("Wrong number of arguments in 'kim param get' "); msg += "command.\nIndex range after parameter name is mandatory"; error->all(FLERR, msg); } @@ -348,7 +348,7 @@ void KimParam::command(int narg, char **arg) // Get the variable/variable_base name varname = arg[i++]; } else { - std::string msg("Wrong number of arguments in 'kim_param get' "); + std::string msg("Wrong number of arguments in 'kim param get' "); msg += "command.\nThe LAMMPS variable name is mandatory"; error->all(FLERR, msg); } @@ -377,14 +377,14 @@ void KimParam::command(int narg, char **arg) } } else { auto msg = - fmt::format("Wrong number of arguments in 'kim_param get' " + fmt::format("Wrong number of 
arguments in 'kim param get' " "command.\nThe LAMMPS '{}' variable names or " "'{} split' is mandatory", nvars, varname); error->all(FLERR, msg); } } else { auto msg = - fmt::format("Wrong number of arguments in 'kim_param get' " + fmt::format("Wrong number of arguments in 'kim param get' " "command.\nThe LAMMPS '{}' variable names or " "'{} split/list' is mandatory", nvars, varname); error->all(FLERR, msg); @@ -498,6 +498,6 @@ void KimParam::command(int narg, char **arg) if (!isPairStyleAssigned) KIM_Model_Destroy(&pkim); - input->write_echo(fmt::format("#=== END kim-param {} =====================" - "==================\n",kim_param_get_set)); + input->write_echo(fmt::format("#=== END kim param {} =====================" + "==================\n", kim_param_get_set)); } From 2d9dcf4e8d24875f9589e842c139c54898eec480 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:52:10 -0600 Subject: [PATCH 016/116] clean up and add extra space after comma --- src/KIM/kim_param.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/KIM/kim_param.cpp b/src/KIM/kim_param.cpp index 9b9909b623..1ebbed62f6 100644 --- a/src/KIM/kim_param.cpp +++ b/src/KIM/kim_param.cpp @@ -120,11 +120,11 @@ void get_kim_unit_names( chargeUnit = KIM_CHARGE_UNIT_e; temperatureUnit = KIM_TEMPERATURE_UNIT_K; timeUnit = KIM_TIME_UNIT_fs; - } else if (strcmp(system,"lj") == 0 || - strcmp(system,"micro") ==0 || - strcmp(system,"nano")==0) { - error->all(FLERR,fmt::format("LAMMPS unit_style {} not supported " - "by KIM models", system)); + } else if (strcmp(system, "lj") == 0 || + strcmp(system, "micro") ==0 || + strcmp(system, "nano")==0) { + error->all(FLERR, fmt::format("LAMMPS unit_style {} not supported " + "by KIM models", system)); } else error->all(FLERR, "Unknown unit_style"); } @@ -227,8 +227,7 @@ void KimParam::command(int narg, char **arg) KIM_TimeUnit timeUnit; get_kim_unit_names(model_units, lengthUnit, energyUnit, - chargeUnit, temperatureUnit, timeUnit, - error); + chargeUnit, temperatureUnit, timeUnit, error); int units_accepted; From a859643bac717e5ee51c61be0b6c36a36f8cbb7d Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:53:00 -0600 Subject: [PATCH 017/116] update the error messages to 'kim property' and clean up the code --- src/KIM/kim_property.cpp | 120 ++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 58 deletions(-) diff --git a/src/KIM/kim_property.cpp b/src/KIM/kim_property.cpp index 3fb46d442f..d620356caf 100644 --- a/src/KIM/kim_property.cpp +++ b/src/KIM/kim_property.cpp @@ -75,32 +75,28 @@ KimProperty::KimProperty(LAMMPS *lmp) : Pointers(lmp) // one-time initialization of Python interpreter python->init(); - if (!python->has_minimum_version(3, 6)) { - error->all(FLERR, "Invalid Python version.\n" - "The kim-property Python package requires Python " - "3 >= 3.6 support."); - } + if (!python->has_minimum_version(3, 6)) + error->all(FLERR, "Invalid Python version.\nThe kim-property Python " + "package requires Python 3 >= 3.6 support"); } void KimProperty::command(int narg, char **arg) { #if LMP_PYTHON #if PY_MAJOR_VERSION >= 3 - if (narg < 2) - error->all(FLERR, "Invalid kim_property command."); - + if (narg < 2) error->all(FLERR, "Invalid 'kim property' command"); if (!(strcmp(arg[0], "create") == 0) && !(strcmp(arg[0], "destroy") == 0) && !(strcmp(arg[0], "modify") == 0) && !(strcmp(arg[0], "remove") == 0) && !(strcmp(arg[0], "dump") == 0)) { - std::string msg("Incorrect arguments in kim_property command.\n"); 
- msg += "'kim_property create/destroy/modify/remove/dump' "; - msg += "is mandatory."; + std::string msg("Incorrect arguments in 'kim property' command.\n"); + msg += "'kim property create/destroy/modify/remove/dump' is mandatory"; error->all(FLERR, msg); } - input->write_echo("#=== kim-property ===========================================\n"); + input->write_echo("#=== kim property =====================================" + "======\n"); // Get the kim_str ptr to the data associated with a kim_property_str // variable @@ -115,18 +111,18 @@ void KimProperty::command(int narg, char **arg) PyObject *obj = PyUnicode_FromString("kim_property"); if (!obj) { PyGILState_Release(gstate); - error->all(FLERR, "Creating a 'PyObject'!"); + error->all(FLERR, "Failed to create a 'PyObject'"); } kim_property = PyImport_Import(obj); if (!kim_property) { PyGILState_Release(gstate); - error->all(FLERR, "Unable to import Python kim_property module!" - "\nkim-property Python package can be installed " - "with pip:\n'pip install kim-property'\n" - "See the installation instructions at\n" - "https://github.com/openkim/kim-property#installing-kim-property\n" - "for detailed information."); + std::string msg("Unable to import Python kim_property module!"); + msg += "\nkim-property Python package can be installed with pip:\n"; + msg += "'pip install kim-property'\nSee the installation instructions "; + msg += "at\nhttps://github.com/openkim/kim-property#installing-kim-"; + msg += "property\nfor detailed information"; + error->all(FLERR, msg); } // Decrementing of the reference count @@ -137,7 +133,7 @@ void KimProperty::command(int narg, char **arg) if (strcmp(arg[0], "create") == 0) { if (narg != 3) { PyGILState_Release(gstate); - error->all(FLERR, "Invalid 'kim_property create' command."); + error->all(FLERR, "Invalid 'kim property create' command"); } int const ID = utils::inumeric(FLERR, arg[1], true, lmp); @@ -151,8 +147,9 @@ void KimProperty::command(int narg, char **arg) PyObject_GetAttrString(kim_property, "kim_property_create"); if (!pFunc) { PyGILState_Release(gstate); - error->all(FLERR, "Unable to get an attribute named " - "'kim_property_create' from a kim_property object!"); + std::string msg("Unable to get an attribute named "); + msg += "'kim_property_create' from a kim_property object"; + error->all(FLERR, msg); } // Decrementing of the reference count @@ -162,7 +159,7 @@ void KimProperty::command(int narg, char **arg) PyObject *pArgs = PyTuple_New(nSize); if (!pArgs) { PyGILState_Release(gstate); - error->all(FLERR, "Could not create Python function arguments."); + error->all(FLERR, "Could not create Python function arguments"); } // Python object to set the tuple @@ -185,15 +182,16 @@ void KimProperty::command(int narg, char **arg) if (!pValue) { PyErr_Print(); PyGILState_Release(gstate); - error->one(FLERR, "Python 'kim_property_create' function " - "evaluation failed!"); + std::string msg("Python 'kim_property_create' function "); + msg += "evaluation failed"; + error->one(FLERR, msg); } // Python function returned a string value const char *pystr = PyUnicode_AsUTF8(pValue); if (kim_str) input->variable->set_string("kim_property_str", pystr); - else input->variable->set(std::string("kim_property_str string '") - + pystr + std::string("'")); + else + input->variable->set(fmt::format("kim_property_str string '{}'", pystr)); Py_XDECREF(pArgs); Py_XDECREF(pFunc); @@ -201,7 +199,7 @@ void KimProperty::command(int narg, char **arg) } else if (strcmp(arg[0], "destroy") == 0) { if (narg != 2) { 
PyGILState_Release(gstate); - error->all(FLERR, "Invalid 'kim_property destroy' command."); + error->all(FLERR, "Invalid 'kim property destroy' command"); } if (!kim_str) { @@ -212,13 +210,15 @@ void KimProperty::command(int narg, char **arg) int const ID = utils::inumeric(FLERR, arg[1], true, lmp); // Python function - // This is the equivalent of the Python expression kim_property.kim_property_destroy + // This is the equivalent of the Python expression + // kim_property.kim_property_destroy PyObject *pFunc = PyObject_GetAttrString(kim_property, "kim_property_destroy"); if (!pFunc) { PyGILState_Release(gstate); - error->all(FLERR, "Unable to get an attribute named " - "'kim_property_destroy' from a kim_property object!"); + std::string msg("Unable to get an attribute named "); + msg += "'kim_property_destroy' from a kim_property object"; + error->all(FLERR, msg); } // Decrementing of the reference count @@ -228,7 +228,7 @@ void KimProperty::command(int narg, char **arg) PyObject *pArgs = PyTuple_New(2); if (!pArgs) { PyGILState_Release(gstate); - error->all(FLERR, "Could not create Python function arguments."); + error->all(FLERR, "Could not create Python function arguments"); } // Python object to set the tuple @@ -244,8 +244,9 @@ void KimProperty::command(int narg, char **arg) if (!pValue) { PyErr_Print(); PyGILState_Release(gstate); - error->one(FLERR, "Python 'kim_property_destroy' function " - "evaluation failed!"); + std::string msg("Python 'kim_property_destroy' function "); + msg += "evaluation failed"; + error->one(FLERR, msg); } // Python function returned a string value @@ -258,13 +259,12 @@ void KimProperty::command(int narg, char **arg) } else if (strcmp(arg[0], "modify") == 0) { if (narg < 6) { PyGILState_Release(gstate); - error->all(FLERR, "Invalid 'kim_property modify' command."); + error->all(FLERR, "Invalid 'kim property modify' command"); } if (!kim_str) { PyGILState_Release(gstate); - error->all(FLERR, "There is no property instance to modify " - "the content."); + error->all(FLERR, "There is no property instance to modify the content"); } int const ID = utils::inumeric(FLERR, arg[1], true, lmp); @@ -276,8 +276,9 @@ void KimProperty::command(int narg, char **arg) PyObject_GetAttrString(kim_property, "kim_property_modify"); if (!pFunc) { PyGILState_Release(gstate); - error->all(FLERR, "Unable to get an attribute named " - "'kim_property_modify' from a kim_property object!"); + std::string msg("Unable to get an attribute named "); + msg += "'kim_property_modify' from a kim_property object"; + error->all(FLERR, msg); } // Decrementing of the reference count @@ -287,7 +288,7 @@ void KimProperty::command(int narg, char **arg) PyObject *pArgs = PyTuple_New(static_cast(narg)); if (!pArgs) { PyGILState_Release(gstate); - error->all(FLERR, "Could not create Python function arguments."); + error->all(FLERR, "Could not create Python function arguments"); } // Python object to set the tuple @@ -308,8 +309,9 @@ void KimProperty::command(int narg, char **arg) if (!pValue) { PyErr_Print(); PyGILState_Release(gstate); - error->one(FLERR, "Python 'kim_property_modify' function " - "evaluation failed!"); + std::string msg("Python 'kim_property_modify' function "); + msg += "evaluation failed"; + error->one(FLERR, msg); } // Python function returned a string value @@ -322,13 +324,12 @@ void KimProperty::command(int narg, char **arg) } else if (strcmp(arg[0], "remove") == 0) { if (narg < 4) { PyGILState_Release(gstate); - error->all(FLERR, "Invalid 'kim_property remove' command."); + 
error->all(FLERR, "Invalid 'kim property remove' command"); } if (!kim_str) { PyGILState_Release(gstate); - error->all(FLERR, "There is no property instance to remove " - "the content."); + error->all(FLERR, "There is no property instance to remove the content"); } int const ID = utils::inumeric(FLERR, arg[1], true, lmp); @@ -340,8 +341,9 @@ void KimProperty::command(int narg, char **arg) PyObject_GetAttrString(kim_property, "kim_property_remove"); if (!pFunc) { PyGILState_Release(gstate); - error->all(FLERR, "Unable to get an attribute named " - "'kim_property_remove' from a kim_property object!"); + std::string msg("Unable to get an attribute named "); + msg += "'kim_property_remove' from a kim_property object"; + error->all(FLERR, msg); } // Decrementing of the reference count @@ -351,7 +353,7 @@ void KimProperty::command(int narg, char **arg) PyObject *pArgs = PyTuple_New(static_cast(narg)); if (!pArgs) { PyGILState_Release(gstate); - error->all(FLERR, "Could not create Python function arguments."); + error->all(FLERR, "Could not create Python function arguments"); } // Python object to set the tuple @@ -372,8 +374,9 @@ void KimProperty::command(int narg, char **arg) if (!pValue) { PyErr_Print(); PyGILState_Release(gstate); - error->one(FLERR, "Python 'kim_property_remove' function " - "evaluation failed!"); + std::string msg("Python 'kim_property_remove' function "); + msg += "evaluation failed"; + error->one(FLERR, msg); } // Python function returned a string value @@ -386,13 +389,12 @@ void KimProperty::command(int narg, char **arg) } else if (strcmp(arg[0], "dump") == 0) { if (narg != 2) { PyGILState_Release(gstate); - error->all(FLERR, "Invalid 'kim_property dump' command."); + error->all(FLERR, "Invalid 'kim property dump' command"); } if (!kim_str) { PyGILState_Release(gstate); - error->all(FLERR, "There is no property instance to dump " - "the content."); + error->all(FLERR, "There is no property instance to dump the content."); } // Python function @@ -402,8 +404,9 @@ void KimProperty::command(int narg, char **arg) PyObject_GetAttrString(kim_property, "kim_property_dump"); if (!pFunc) { PyGILState_Release(gstate); - error->all(FLERR, "Unable to get an attribute named " - "'kim_property_dump' from a kim_property object!"); + std::string msg("Unable to get an attribute named "); + msg += "'kim_property_dump' from a kim_property object"; + error->all(FLERR, msg); } // Decrementing of the reference count @@ -413,7 +416,7 @@ void KimProperty::command(int narg, char **arg) PyObject *pArgs = PyTuple_New(2); if (!pArgs) { PyGILState_Release(gstate); - error->all(FLERR, "Could not create Python function arguments."); + error->all(FLERR, "Could not create Python function arguments"); } // Python object to set the tuple @@ -430,8 +433,9 @@ void KimProperty::command(int narg, char **arg) if (!pValue) { PyErr_Print(); PyGILState_Release(gstate); - error->one(FLERR, "Python 'kim_property_dump' function " - "evaluation failed!"); + std::string msg("Python 'kim_property_dump' function "); + msg += "evaluation failed"; + error->one(FLERR, msg); } } else pValue = nullptr; From 7d7c433fd7c815fed0d3a519ffa2cbca6002cfd4 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:54:33 -0600 Subject: [PATCH 018/116] update the error messages to 'kim query' and clean up the code --- src/KIM/kim_query.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/KIM/kim_query.cpp b/src/KIM/kim_query.cpp index faf1d26909..ef4b7572d3 100644 --- 
a/src/KIM/kim_query.cpp +++ b/src/KIM/kim_query.cpp @@ -96,9 +96,9 @@ static size_t write_callback(void *, size_t, size_t, void *); void KimQuery::command(int narg, char **arg) { - if (narg < 2) error->all(FLERR,"Illegal kim_query command"); + if (narg < 2) error->all(FLERR,"Illegal 'kim query' command"); - // check if we had a kim_init command by finding fix STORE/KIM + // check if we had a kim init command by finding fix STORE/KIM // retrieve model name. char *model_name; @@ -106,17 +106,17 @@ void KimQuery::command(int narg, char **arg) if (ifix >= 0) { FixStoreKIM *fix_store = (FixStoreKIM *) modify->fix[ifix]; model_name = (char *)fix_store->getptr("model_name"); - } else error->all(FLERR,"Must use 'kim_init' before 'kim_query'"); + } else error->all(FLERR,"Must use 'kim init' before 'kim query'"); char *varname = arg[0]; bool split = false; if (strcmp("split",arg[1]) == 0) { - if (narg == 2) error->all(FLERR,"Illegal kim_query command.\nThe keyword " + if (narg == 2) error->all(FLERR,"Illegal 'kim query' command.\nThe keyword " "'split' must be followed by the name of " "the query function"); if (strcmp("list",arg[2]) == 0) - error->all(FLERR,"Illegal kim_query command.\nThe 'list' keyword " + error->all(FLERR,"Illegal 'kim query' command.\nThe 'list' keyword " "can not be used after 'split'"); split = true; arg++; @@ -126,7 +126,7 @@ void KimQuery::command(int narg, char **arg) // The “list” is the default setting // the result is returned as a space-separated list of values in variable if (strcmp("list",arg[1]) == 0) { - if (narg == 2) error->all(FLERR,"Illegal kim_query command.\nThe 'list' " + if (narg == 2) error->all(FLERR,"Illegal 'kim query' command.\nThe 'list' " "keyword must be followed by ('split' " "and) the name of the query function"); arg++; @@ -136,11 +136,11 @@ void KimQuery::command(int narg, char **arg) char *function = arg[1]; for (int i = 2; i < narg; ++i) { if (strncmp("model=",arg[i],6) == 0) - error->all(FLERR,"Illegal 'model' key in kim_query command"); + error->all(FLERR,"Illegal 'model' key in 'kim query' command"); if (!strchr(arg[i], '=') || !strchr(arg[i], '[') || !strchr(arg[i], ']')) error->all(FLERR,fmt::format("Illegal query format.\nInput argument of " - "`{}` to kim_query is wrong. The query " + "`{}` to 'kim query' is wrong. 
The query " "format is the keyword=[value], where value " "is always an array of one or more " "comma-separated items", arg[i])); @@ -161,7 +161,8 @@ void KimQuery::command(int narg, char **arg) error->all(FLERR,fmt::format("OpenKIM query returned no results")); } - input->write_echo("#=== BEGIN kim-query =========================================\n"); + input->write_echo("#=== BEGIN kim query ==================================" + "=======\n"); ValueTokenizer values(value, ","); if (split) { int counter = 1; @@ -182,11 +183,12 @@ void KimQuery::command(int narg, char **arg) input->variable->set(setcmd); input->write_echo(fmt::format("variable {}\n", setcmd)); } - input->write_echo("#=== END kim-query ===========================================\n\n"); + input->write_echo("#=== END kim query ====================================" + "=======\n\n"); delete[] value; #else - error->all(FLERR,"Cannot use 'kim_query' command when KIM package " + error->all(FLERR,"Cannot use 'kim query' command when KIM package " "is compiled without support for libcurl"); #endif } @@ -292,7 +294,7 @@ char *do_query(char *qfunction, char * model_name, int narg, char **arg, } } - std::string user_agent = fmt::format("kim_query--LAMMPS/{} ({})", + std::string user_agent = fmt::format("kim query--LAMMPS/{} ({})", LAMMPS_VERSION, Info::get_os_info()); curl_easy_setopt(handle, CURLOPT_USERAGENT, user_agent.c_str()); From 6769ded03c87aebabe773d5777b31908e2f88bf5 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 05:55:36 -0600 Subject: [PATCH 019/116] update the unittests with the latest interface changes --- unittest/commands/test_kim_commands.cpp | 334 ++++++++++++------------ 1 file changed, 173 insertions(+), 161 deletions(-) diff --git a/unittest/commands/test_kim_commands.cpp b/unittest/commands/test_kim_commands.cpp index 275a9eae3a..5ea458de59 100644 --- a/unittest/commands/test_kim_commands.cpp +++ b/unittest/commands/test_kim_commands.cpp @@ -79,31 +79,51 @@ protected: } }; +TEST_F(KimCommandsTest, kim) +{ + if (!LAMMPS::is_installed_pkg("KIM")) GTEST_SKIP(); + + TEST_FAILURE(".*ERROR: Illegal kim command.*", + lmp->input->one("kim");); + TEST_FAILURE(".*ERROR: Unknown kim subcommand.*", + lmp->input->one("kim unknown");); + TEST_FAILURE(".*ERROR: Unknown command: kim_init.*", + lmp->input->one("kim_init");); + TEST_FAILURE(".*ERROR: Unknown command: kim_interactions.*", + lmp->input->one("kim_interactions");); + TEST_FAILURE(".*ERROR: Unknown command: kim_param.*", + lmp->input->one("kim_param");); + TEST_FAILURE(".*ERROR: Unknown command: kim_property.*", + lmp->input->one("kim_property");); + TEST_FAILURE(".*ERROR: Unknown command: kim_query.*", + lmp->input->one("kim_query");); +} + TEST_F(KimCommandsTest, kim_init) { if (!LAMMPS::is_installed_pkg("KIM")) GTEST_SKIP(); - TEST_FAILURE(".*ERROR: Illegal kim_init command.*", - lmp->input->one("kim_init");); - TEST_FAILURE(".*ERROR: Illegal kim_init command.*", - lmp->input->one("kim_init LennardJones_Ar real si");); + TEST_FAILURE(".*ERROR: Illegal 'kim init' command.*", + lmp->input->one("kim init");); + TEST_FAILURE(".*ERROR: Illegal 'kim init' command.*", + lmp->input->one("kim init LennardJones_Ar real si");); TEST_FAILURE(".*ERROR: LAMMPS unit_style lj not supported by KIM models.*", - lmp->input->one("kim_init LennardJones_Ar lj");); + lmp->input->one("kim init LennardJones_Ar lj");); TEST_FAILURE(".*ERROR: LAMMPS unit_style micro not supported by KIM models.*", - lmp->input->one("kim_init LennardJones_Ar micro");); + lmp->input->one("kim 
init LennardJones_Ar micro");); TEST_FAILURE(".*ERROR: LAMMPS unit_style nano not supported by KIM models.*", - lmp->input->one("kim_init LennardJones_Ar nano");); + lmp->input->one("kim init LennardJones_Ar nano");); TEST_FAILURE(".*ERROR: Unknown unit_style.*", - lmp->input->one("kim_init LennardJones_Ar new_style");); + lmp->input->one("kim init LennardJones_Ar new_style");); TEST_FAILURE(".*ERROR: KIM Model name not found.*", - lmp->input->one("kim_init Unknown_Model real");); + lmp->input->one("kim init Unknown_Model real");); TEST_FAILURE(".*ERROR: Incompatible units for KIM Simulator Model, required units = metal.*", - lmp->input->one("kim_init Sim_LAMMPS_LJcut_AkersonElliott_Alchemy_PbAu real");); + lmp->input->one("kim init Sim_LAMMPS_LJcut_AkersonElliott_Alchemy_PbAu real");); // TEST_FAILURE(".*ERROR: KIM Model does not support the requested unit system.*", - // lmp->input->one("kim_init ex_model_Ar_P_Morse real");); + // lmp->input->one("kim init ex_model_Ar_P_Morse real");); if (!verbose) ::testing::internal::CaptureStdout(); - lmp->input->one("kim_init LennardJones_Ar real"); + lmp->input->one("kim init LennardJones_Ar real"); if (!verbose) ::testing::internal::GetCapturedStdout(); int ifix = lmp->modify->find_fix("KIM_MODEL_STORE"); @@ -114,27 +134,27 @@ TEST_F(KimCommandsTest, kim_interactions) { if (!LAMMPS::is_installed_pkg("KIM")) GTEST_SKIP(); - TEST_FAILURE(".*ERROR: Illegal kim_interactions command.*", - lmp->input->one("kim_interactions");); + TEST_FAILURE(".*ERROR: Illegal 'kim interactions' command.*", + lmp->input->one("kim interactions");); if (!verbose) ::testing::internal::CaptureStdout(); - lmp->input->one("kim_init LennardJones_Ar real"); + lmp->input->one("kim init LennardJones_Ar real"); if (!verbose) ::testing::internal::GetCapturedStdout(); - TEST_FAILURE(".*ERROR: Must use 'kim_interactions' command " + TEST_FAILURE(".*ERROR: Must use 'kim interactions' command " "after simulation box is defined.*", - lmp->input->one("kim_interactions Ar");); + lmp->input->one("kim interactions Ar");); if (!verbose) ::testing::internal::CaptureStdout(); - lmp->input->one("kim_init LennardJones_Ar real"); + lmp->input->one("kim init LennardJones_Ar real"); lmp->input->one("lattice fcc 4.4300"); lmp->input->one("region box block 0 10 0 10 0 10"); lmp->input->one("create_box 1 box"); lmp->input->one("create_atoms 1 box"); if (!verbose) ::testing::internal::GetCapturedStdout(); - TEST_FAILURE(".*ERROR: Illegal kim_interactions command.*", - lmp->input->one("kim_interactions Ar Ar");); + TEST_FAILURE(".*ERROR: Illegal 'kim interactions' command.*", + lmp->input->one("kim interactions Ar Ar");); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); @@ -144,8 +164,8 @@ TEST_F(KimCommandsTest, kim_interactions) lmp->input->one("create_atoms 4 box"); if (!verbose) ::testing::internal::GetCapturedStdout(); - TEST_FAILURE(".*ERROR: Illegal kim_interactions command.*", - lmp->input->one("kim_interactions Ar Ar");); + TEST_FAILURE(".*ERROR: Illegal 'kim interactions' command.*", + lmp->input->one("kim interactions Ar Ar");); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); @@ -155,12 +175,12 @@ TEST_F(KimCommandsTest, kim_interactions) lmp->input->one("create_atoms 1 box"); if (!verbose) ::testing::internal::GetCapturedStdout(); - TEST_FAILURE(".*ERROR: Must use 'kim_init' before 'kim_interactions'.*", - lmp->input->one("kim_interactions Ar");); + TEST_FAILURE(".*ERROR: Must use 'kim init' before 'kim interactions'.*", + 
lmp->input->one("kim interactions Ar");); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init LennardJones_Ar real"); + lmp->input->one("kim init LennardJones_Ar real"); lmp->input->one("lattice fcc 4.4300"); lmp->input->one("region box block 0 10 0 10 0 10"); lmp->input->one("create_box 1 box"); @@ -168,7 +188,7 @@ TEST_F(KimCommandsTest, kim_interactions) if (!verbose) ::testing::internal::GetCapturedStdout(); TEST_FAILURE(".*ERROR: fixed_types cannot be used with a KIM Portable Model.*", - lmp->input->one("kim_interactions fixed_types");); + lmp->input->one("kim interactions fixed_types");); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); @@ -181,7 +201,7 @@ TEST_F(KimCommandsTest, kim_interactions) if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init Sim_LAMMPS_LJcut_AkersonElliott_Alchemy_PbAu metal"); + lmp->input->one("kim init Sim_LAMMPS_LJcut_AkersonElliott_Alchemy_PbAu metal"); lmp->input->one("lattice fcc 4.920"); lmp->input->one("region box block 0 10 0 10 0 10"); lmp->input->one("create_box 1 box"); @@ -189,31 +209,31 @@ TEST_F(KimCommandsTest, kim_interactions) if (!verbose) ::testing::internal::GetCapturedStdout(); TEST_FAILURE(".*ERROR: Species 'Ar' is not supported by this KIM Simulator Model.*", - lmp->input->one("kim_interactions Ar");); + lmp->input->one("kim interactions Ar");); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init Sim_LAMMPS_LJcut_AkersonElliott_Alchemy_PbAu metal"); + lmp->input->one("kim init Sim_LAMMPS_LJcut_AkersonElliott_Alchemy_PbAu metal"); lmp->input->one("lattice fcc 4.08"); lmp->input->one("region box block 0 10 0 10 0 10"); lmp->input->one("create_box 1 box"); lmp->input->one("create_atoms 1 box"); - lmp->input->one("kim_interactions Au"); + lmp->input->one("kim interactions Au"); if (!verbose) ::testing::internal::GetCapturedStdout(); // ASSERT_EQ(lmp->output->var_kim_periodic, 1); // TEST_FAILURE(".*ERROR: Incompatible units for KIM Simulator Model.*", - // lmp->input->one("kim_interactions Au");); + // lmp->input->one("kim interactions Au");); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init LennardJones_Ar real"); + lmp->input->one("kim init LennardJones_Ar real"); lmp->input->one("lattice fcc 4.4300"); lmp->input->one("region box block 0 10 0 10 0 10"); lmp->input->one("create_box 1 box"); lmp->input->one("create_atoms 1 box"); - lmp->input->one("kim_interactions Ar"); + lmp->input->one("kim interactions Ar"); lmp->input->one("mass 1 39.95"); if (!verbose) ::testing::internal::GetCapturedStdout(); @@ -222,15 +242,15 @@ TEST_F(KimCommandsTest, kim_interactions) if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init LennardJones_Ar real"); + lmp->input->one("kim init LennardJones_Ar real"); lmp->input->one("lattice fcc 4.4300"); lmp->input->one("region box block 0 10 0 10 0 10"); lmp->input->one("create_box 1 box"); lmp->input->one("create_atoms 1 box"); - lmp->input->one("kim_interactions Ar"); + lmp->input->one("kim interactions Ar"); lmp->input->one("mass 1 39.95"); lmp->input->one("run 1"); - lmp->input->one("kim_interactions Ar"); + lmp->input->one("kim interactions Ar"); lmp->input->one("run 1"); if (!verbose) ::testing::internal::GetCapturedStdout(); } @@ -239,94 +259,95 @@ TEST_F(KimCommandsTest, kim_param) { if 
(!LAMMPS::is_installed_pkg("KIM")) GTEST_SKIP(); - TEST_FAILURE(".*ERROR: Illegal kim_param command.*", lmp->input->one("kim_param");); - TEST_FAILURE(".*ERROR: Incorrect arguments in kim_param command.\n" - "'kim_param get/set' is mandatory.*", - lmp->input->one("kim_param unknown shift 1 shift");); - TEST_FAILURE(".*ERROR: Must use 'kim_init' before 'kim_param'.*", - lmp->input->one("kim_param get shift 1 shift");); + TEST_FAILURE(".*ERROR: Illegal 'kim param' command.*", + lmp->input->one("kim param");); + TEST_FAILURE(".*ERROR: Incorrect arguments in 'kim param' command.\n" + "'kim param get/set' is mandatory.*", + lmp->input->one("kim param unknown shift 1 shift");); + TEST_FAILURE(".*ERROR: Must use 'kim init' before 'kim param'.*", + lmp->input->one("kim param get shift 1 shift");); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init Sim_LAMMPS_LJcut_AkersonElliott_Alchemy_PbAu metal"); + lmp->input->one("kim init Sim_LAMMPS_LJcut_AkersonElliott_Alchemy_PbAu metal"); if (!verbose) ::testing::internal::GetCapturedStdout(); - TEST_FAILURE(".*ERROR: kim_param can only be used with a KIM Portable Model.*", - lmp->input->one("kim_param get shift 1 shift");); + TEST_FAILURE(".*ERROR: 'kim param' can only be used with a KIM Portable Model.*", + lmp->input->one("kim param get shift 1 shift");); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init LennardJones612_UniversalShifted__MO_959249795837_003 real"); + lmp->input->one("kim init LennardJones612_UniversalShifted__MO_959249795837_003 real"); if (!verbose) ::testing::internal::GetCapturedStdout(); TEST_FAILURE(".*ERROR: Illegal index '0' for " "'shift' parameter with the extent of '1'.*", - lmp->input->one("kim_param get shift 0 shift");); + lmp->input->one("kim param get shift 0 shift");); TEST_FAILURE(".*ERROR: Illegal index '2' for " "'shift' parameter with the extent of '1'.*", - lmp->input->one("kim_param get shift 2 shift");); + lmp->input->one("kim param get shift 2 shift");); TEST_FAILURE(".*ERROR: Illegal index_range.\nExpected integer " "parameter\\(s\\) instead of '1.' in index_range.*", - lmp->input->one("kim_param get shift 1. shift");); + lmp->input->one("kim param get shift 1. 
shift");); TEST_FAILURE(".*ERROR: Illegal index_range '1-2' for 'shift' " "parameter with the extent of '1'.*", - lmp->input->one("kim_param get shift 1:2 shift");); + lmp->input->one("kim param get shift 1:2 shift");); TEST_FAILURE(".*ERROR: Illegal index_range.\nExpected integer " "parameter\\(s\\) instead of '1-2' in index_range.*", - lmp->input->one("kim_param get shift 1-2 shift");); - TEST_FAILURE(".*ERROR: Wrong number of arguments in 'kim_param " + lmp->input->one("kim param get shift 1-2 shift");); + TEST_FAILURE(".*ERROR: Wrong number of arguments in 'kim param " "get' command.\nThe LAMMPS '3' variable names or " "'s1 split' is mandatory.*", - lmp->input->one("kim_param get sigmas 1:3 s1 s2");); - TEST_FAILURE(".*ERROR: Wrong argument in kim_param get command.\nThis " + lmp->input->one("kim param get sigmas 1:3 s1 s2");); + TEST_FAILURE(".*ERROR: Wrong argument in 'kim param get' command.\nThis " "Model does not have the requested 'unknown' parameter.*", - lmp->input->one("kim_param get unknown 1 unknown");); - TEST_FAILURE(".*ERROR: Wrong 'kim_param set' command.\n" + lmp->input->one("kim param get unknown 1 unknown");); + TEST_FAILURE(".*ERROR: Wrong 'kim param set' command.\n" "To set the new parameter values, pair style must " - "be assigned.\nMust use 'kim_interactions' or" - "'pair_style kim' before 'kim_param set'.*", - lmp->input->one("kim_param set shift 1 2");); + "be assigned.\nMust use 'kim interactions' or" + "'pair_style kim' before 'kim param set'.*", + lmp->input->one("kim param set shift 1 2");); if (!verbose) ::testing::internal::CaptureStdout(); - lmp->input->one("kim_param get shift 1 shift"); + lmp->input->one("kim param get shift 1 shift"); if (!verbose) ::testing::internal::GetCapturedStdout(); ASSERT_FALSE(lmp->input->variable->find("shift") == -1); - ASSERT_TRUE(std::string(lmp->input->variable->retrieve("shift")) == std::string("1")); + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("shift")) == "1"); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init LennardJones612_UniversalShifted__MO_959249795837_003 real"); + lmp->input->one("kim init LennardJones612_UniversalShifted__MO_959249795837_003 real"); lmp->input->one("lattice fcc 4.4300"); lmp->input->one("region box block 0 10 0 10 0 10"); lmp->input->one("create_box 1 box"); lmp->input->one("create_atoms 1 box"); - lmp->input->one("kim_interactions Ar"); + lmp->input->one("kim interactions Ar"); lmp->input->one("mass 1 39.95"); if (!verbose) ::testing::internal::GetCapturedStdout(); TEST_FAILURE(".*ERROR: Illegal index '2' for " "'shift' parameter with the extent of '1'.*", - lmp->input->one("kim_param set shift 2 2");); + lmp->input->one("kim param set shift 2 2");); TEST_FAILURE(".*ERROR: Illegal index_range.\nExpected integer " "parameter\\(s\\) instead of '1.' in index_range.*", - lmp->input->one("kim_param set shift 1. shift");); + lmp->input->one("kim param set shift 1. 
shift");); TEST_FAILURE(".*ERROR: Illegal index_range '1-2' for " "'shift' parameter with the extent of '1'.*", - lmp->input->one("kim_param set shift 1:2 2");); + lmp->input->one("kim param set shift 1:2 2");); TEST_FAILURE(".*ERROR: Wrong number of variable values for pair coefficients.*", - lmp->input->one("kim_param set sigmas 1:3 0.5523570 0.4989030");); + lmp->input->one("kim param set sigmas 1:3 0.5523570 0.4989030");); TEST_FAILURE(".*ERROR: Wrong argument for pair coefficients.\nThis " "Model does not have the requested '0.4989030' parameter.*", - lmp->input->one("kim_param set sigmas 1:1 0.5523570 0.4989030");); + lmp->input->one("kim param set sigmas 1:1 0.5523570 0.4989030");); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("variable new_shift equal 2"); - lmp->input->one("kim_param set shift 1 ${new_shift}"); - lmp->input->one("kim_param get shift 1 shift"); + lmp->input->one("kim param set shift 1 ${new_shift}"); + lmp->input->one("kim param get shift 1 shift"); if (!verbose) ::testing::internal::GetCapturedStdout(); - ASSERT_TRUE(std::string(lmp->input->variable->retrieve("shift")) == std::string("2")); + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("shift")) == "2"); } TEST_F(KimCommandsTest, kim_property) @@ -338,168 +359,159 @@ TEST_F(KimCommandsTest, kim_property) TEST_FAILURE(".*ERROR: Invalid Python version.\n" "The kim-property Python package requires Python " "3 >= 3.6 support.*", - lmp->input->one("kim_property");); + lmp->input->one("kim property");); } else { - TEST_FAILURE(".*ERROR: Invalid kim_property command.*", - lmp->input->one("kim_property");); - TEST_FAILURE(".*ERROR: Invalid kim_property command.*", - lmp->input->one("kim_property create");); - TEST_FAILURE(".*ERROR: Incorrect arguments in kim_property command.\n" - "'kim_property create/destroy/modify/remove/dump' " + TEST_FAILURE(".*ERROR: Invalid 'kim property' command.*", + lmp->input->one("kim property");); + TEST_FAILURE(".*ERROR: Invalid 'kim property' command.*", + lmp->input->one("kim property create");); + TEST_FAILURE(".*ERROR: Incorrect arguments in 'kim property' command." 
+ "\n'kim property create/destroy/modify/remove/dump' " "is mandatory.*", - lmp->input->one("kim_property unknown 1 atomic-mass");); + lmp->input->one("kim property unknown 1 atomic-mass");); } #if defined(KIM_EXTRA_UNITTESTS) - TEST_FAILURE(".*ERROR: Invalid 'kim_property create' command.*", - lmp->input->one("kim_property create 1");); - TEST_FAILURE(".*ERROR: Invalid 'kim_property destroy' command.*", - lmp->input->one("kim_property destroy 1 cohesive-potential-energy-cubic-crystal");); - TEST_FAILURE(".*ERROR: Invalid 'kim_property modify' command.*", - lmp->input->one("kim_property modify 1 key short-name");); - TEST_FAILURE(".*ERROR: There is no property instance to modify the content.*", - lmp->input->one("kim_property modify 1 key short-name source-value 1 fcc");); - TEST_FAILURE(".*ERROR: Invalid 'kim_property remove' command.*", - lmp->input->one("kim_property remove 1 key");); - TEST_FAILURE(".*ERROR: There is no property instance to remove the content.*", - lmp->input->one("kim_property remove 1 key short-name");); - TEST_FAILURE(".*ERROR: There is no property instance to dump the content.*", - lmp->input->one("kim_property dump results.edn");); - if (!verbose) ::testing::internal::CaptureStdout(); - lmp->input->one("clear"); - lmp->input->one("kim_init LennardJones612_UniversalShifted__MO_959249795837_003 real"); - lmp->input->one("kim_property create 1 cohesive-potential-energy-cubic-crystal"); - lmp->input->one("kim_property modify 1 key short-name source-value 1 fcc"); - lmp->input->one("kim_property destroy 1"); - if (!verbose) ::testing::internal::GetCapturedStdout(); + TEST_FAILURE(".*ERROR: Invalid 'kim property create' command.*", + lmp->input->one("kim property create 1");); + TEST_FAILURE(".*ERROR: Invalid 'kim property destroy' command.*", + lmp->input->one("kim property destroy 1 cohesive-potential-energy-cubic-crystal");); + TEST_FAILURE(".*ERROR: Invalid 'kim property modify' command.*", + lmp->input->one("kim property modify 1 key short-name");); + TEST_FAILURE(".*ERROR: There is no property instance to modify the content.*", + lmp->input->one("kim property modify 1 key short-name source-value 1 fcc");); + TEST_FAILURE(".*ERROR: Invalid 'kim property remove' command.*", + lmp->input->one("kim property remove 1 key");); + TEST_FAILURE(".*ERROR: There is no property instance to remove the content.*", + lmp->input->one("kim property remove 1 key short-name");); + TEST_FAILURE(".*ERROR: There is no property instance to dump the content.*", + lmp->input->one("kim property dump results.edn");); + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("clear"); + lmp->input->one("kim init LennardJones612_UniversalShifted__MO_959249795837_003 real"); + lmp->input->one("kim property create 1 cohesive-potential-energy-cubic-crystal"); + lmp->input->one("kim property modify 1 key short-name source-value 1 fcc"); + lmp->input->one("kim property destroy 1"); + if (!verbose) ::testing::internal::GetCapturedStdout(); #endif } TEST_F(KimCommandsTest, kim_query) { if (!LAMMPS::is_installed_pkg("KIM")) GTEST_SKIP(); - - TEST_FAILURE(".*ERROR: Illegal kim_query command.*", - lmp->input->one("kim_query");); - TEST_FAILURE(".*ERROR: Must use 'kim_init' before 'kim_query'.*", - lmp->input->one("kim_query a0 get_lattice_constant_cubic");); + + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.*", + lmp->input->one("kim query");); + TEST_FAILURE(".*ERROR: Must use 'kim init' before 'kim query'.*", + lmp->input->one("kim query a0 get_lattice_constant_cubic");); if 
(!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init LennardJones612_UniversalShifted__MO_959249795837_003 real"); + lmp->input->one("kim init LennardJones612_UniversalShifted__MO_959249795837_003 real"); if (!verbose) ::testing::internal::GetCapturedStdout(); - TEST_FAILURE(".*ERROR: Illegal kim_query command.\nThe keyword 'split' " - "must be followed by the name of the query function.*", - lmp->input->one("kim_query a0 split");); - - TEST_FAILURE(".*ERROR: Illegal kim_query command.\nThe 'list' keyword " - "can not be used after 'split'.*", - lmp->input->one("kim_query a0 split list");); - - TEST_FAILURE(".*ERROR: Illegal kim_query command.\nThe 'list' keyword " + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe keyword 'split' " + "must be followed by the name of the query function.*", + lmp->input->one("kim query a0 split");); + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe 'list' keyword " + "can not be used after 'split'.*", + lmp->input->one("kim query a0 split list");); + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe 'list' keyword " "must be followed by \\('split' and\\) the name of the query " - "function.*", lmp->input->one("kim_query a0 list");); - - TEST_FAILURE(".*ERROR: Illegal 'model' key in kim_query command.*", - lmp->input->one("kim_query a0 get_lattice_constant_cubic " + "function.*", lmp->input->one("kim query a0 list");); + TEST_FAILURE(".*ERROR: Illegal 'model' key in 'kim query' command.*", + lmp->input->one("kim query a0 get_lattice_constant_cubic " "model=[MO_959249795837_003]");); - TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `crystal` " - "to kim_query is wrong. The query format is the " + "to 'kim query' is wrong. The query format is the " "keyword=\\[value\\], where value is always an array of one " - "or more comma-separated items.*", - lmp->input->one("kim_query a0 get_lattice_constant_cubic " + "or more comma-separated items.*", + lmp->input->one("kim query a0 get_lattice_constant_cubic " "crystal");); - TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `" - "crystal=fcc` to kim_query is wrong. The query format is the " - "keyword=\\[value\\], where value is always an array of one " - "or more comma-separated items.*", - lmp->input->one("kim_query a0 get_lattice_constant_cubic " + "crystal=fcc` to 'kim query' is wrong. The query format is " + "the keyword=\\[value\\], where value is always an array of " + "one or more comma-separated items.*", + lmp->input->one("kim query a0 get_lattice_constant_cubic " "crystal=fcc");); - TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `" - "crystal=\\[fcc` to kim_query is wrong. The query format is " + "crystal=\\[fcc` to 'kim query' is wrong. The query format is " "the keyword=\\[value\\], where value is always an array of " - "one or more comma-separated items.*", - lmp->input->one("kim_query a0 get_lattice_constant_cubic " + "one or more comma-separated items.*", + lmp->input->one("kim query a0 get_lattice_constant_cubic " "crystal=[fcc");); - TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `" - "crystal=fcc\\]` to kim_query is wrong. The query format is " + "crystal=fcc\\]` to 'kim query' is wrong. 
The query format is " "the keyword=\\[value\\], where value is always an array of " - "one or more comma-separated items.*", - lmp->input->one("kim_query a0 get_lattice_constant_cubic " + "one or more comma-separated items.*", + lmp->input->one("kim query a0 get_lattice_constant_cubic " "crystal=fcc]");); - - std::string squery("kim_query a0 get_lattice_constant_cubic "); + + std::string squery("kim query a0 get_lattice_constant_cubic "); squery += "crystal=[\"fcc\"] species=\"Al\",\"Ni\" units=[\"angstrom\"]"; - TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `species=" - "\"Al\",\"Ni\"` to kim_query is wrong. The query format is " + "\"Al\",\"Ni\"` to 'kim query' is wrong. The query format is " "the keyword=\\[value\\], where value is always an array of " - "one or more comma-separated items.*", + "one or more comma-separated items.*", lmp->input->one(squery);); - squery = "kim_query a0 get_lattice_constant_cubic "; + squery = "kim query a0 get_lattice_constant_cubic "; squery += "crystal=[\"fcc\"] species=\"Al\",\"Ni\", units=[\"angstrom\"]"; - TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `species=" - "\"Al\",\"Ni\",` to kim_query is wrong. The query format is " + "\"Al\",\"Ni\",` to 'kim query' is wrong. The query format is " "the keyword=\\[value\\], where value is always an array of " - "one or more comma-separated items.*", + "one or more comma-separated items.*", lmp->input->one(squery);); - squery = "kim_query a0 get_lattice_constant_cubic crystal=[fcc] " + squery = "kim query a0 get_lattice_constant_cubic crystal=[fcc] " "species=[Al]"; TEST_FAILURE(".*ERROR: OpenKIM query failed:.*", lmp->input->one(squery);); - squery = "kim_query a0 get_lattice_constant_cubic crystal=[fcc] " + squery = "kim query a0 get_lattice_constant_cubic crystal=[fcc] " "units=[\"angstrom\"]"; TEST_FAILURE(".*ERROR: OpenKIM query failed:.*", lmp->input->one(squery);); #if defined(KIM_EXTRA_UNITTESTS) if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init EAM_Dynamo_Mendelev_2007_Zr__MO_848899341753_000 metal"); - - squery = "kim_query latconst split get_lattice_constant_hexagonal "; + lmp->input->one("kim init EAM_Dynamo_Mendelev_2007_Zr__MO_848899341753_000 metal"); + + squery = "kim query latconst split get_lattice_constant_hexagonal "; squery += "crystal=[\"hcp\"] species=[\"Zr\"] units=[\"angstrom\"]"; lmp->input->one(squery); if (!verbose) ::testing::internal::GetCapturedStdout(); - ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_1")) == + ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_1")) == std::string("3.234055244384789"))); - ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_2")) == + ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_2")) == std::string("5.167650199630013"))); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init EAM_Dynamo_Mendelev_2007_Zr__MO_848899341753_000 metal"); - - squery = "kim_query latconst list get_lattice_constant_hexagonal "; + lmp->input->one("kim init EAM_Dynamo_Mendelev_2007_Zr__MO_848899341753_000 metal"); + + squery = "kim query latconst list get_lattice_constant_hexagonal "; squery += "crystal=[hcp] species=[Zr] units=[angstrom]"; lmp->input->one(squery); if (!verbose) ::testing::internal::GetCapturedStdout(); - ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst")) == + ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst")) == 
std::string("3.234055244384789 5.167650199630013"))); - squery = "kim_query latconst list get_lattice_constant_hexagonal "; + squery = "kim query latconst list get_lattice_constant_hexagonal "; squery += "crystal=[bcc] species=[Zr] units=[angstrom]"; TEST_FAILURE(".*ERROR: OpenKIM query failed:.*", lmp->input->one(squery);); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim_init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal"); - - squery = "kim_query alpha get_linear_thermal_expansion_coefficient_cubic "; + lmp->input->one("kim init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal"); + + squery = "kim query alpha get_linear_thermal_expansion_coefficient_cubic "; squery += "crystal=[fcc] species=[Al] units=[1/K] temperature=[293.15] "; squery += "temperature_units=[K]"; lmp->input->one(squery); if (!verbose) ::testing::internal::GetCapturedStdout(); - ASSERT_TRUE((std::string(lmp->input->variable->retrieve("alpha")) == + ASSERT_TRUE((std::string(lmp->input->variable->retrieve("alpha")) == std::string("1.654960564704273e-05"))); #endif } From 10a48f18d0db17682513c4315d54e73df16c1bf1 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 08:03:21 -0600 Subject: [PATCH 020/116] update kim command examples --- examples/kim/in.kim-ex.melt | 42 +++++++++++----------- examples/kim/in.kim-pm-property | 58 +++++++++++++++---------------- examples/kim/in.kim-pm-query.melt | 44 +++++++++++------------ examples/kim/in.kim-pm.melt | 42 +++++++++++----------- examples/kim/in.kim-sm.melt | 44 +++++++++++------------ examples/kim/in.lammps.melt | 44 +++++++++++------------ 6 files changed, 137 insertions(+), 137 deletions(-) diff --git a/examples/kim/in.kim-ex.melt b/examples/kim/in.kim-ex.melt index 5cc3dbc61b..200e2c3dcd 100644 --- a/examples/kim/in.kim-ex.melt +++ b/examples/kim/in.kim-ex.melt @@ -1,35 +1,35 @@ # 3d Lennard-Jones melt # # This example requires that the example models provided with -# the kim-api package are installed. see the ./lib/kim/README or -# ./lib/kim/Install.py files for details on how to install these +# the kim-api package are installed. see the `./lib/kim/README` or +# `./lib/kim/Install.py` files for details on how to install these # example models. 
# -variable x index 1 -variable y index 1 -variable z index 1 +variable x index 1 +variable y index 1 +variable z index 1 -variable xx equal 20*$x -variable yy equal 20*$y -variable zz equal 20*$z +variable xx equal 20*$x +variable yy equal 20*$y +variable zz equal 20*$z -kim_init LennardJones_Ar real +kim init LennardJones_Ar real -lattice fcc 4.4300 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -create_box 1 box -create_atoms 1 box +lattice fcc 4.4300 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box -kim_interactions Ar +kim interactions Ar -mass 1 39.95 -velocity all create 200.0 232345 loop geom +mass 1 39.95 +velocity all create 200.0 232345 loop geom -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 -run 100 +run 100 diff --git a/examples/kim/in.kim-pm-property b/examples/kim/in.kim-pm-property index fea1527820..d69879c728 100644 --- a/examples/kim/in.kim-pm-property +++ b/examples/kim/in.kim-pm-property @@ -1,34 +1,34 @@ -# kim-property example +# kim property example # # For detailed information of this example please refer to: -# https://openkim.org/doc/evaluation/tutorial-lammps/ +# `https://openkim.org/doc/evaluation/tutorial-lammps/` # # Description: # -# This example is designed to calculate the cohesive energy corresponding to -# the equilibrium FCC lattice constant for -# `LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004` model for -# argon. The material properties computed in LAMMPS are represented as a -# standard KIM property instance format. (See -# https://openkim.org/doc/schema/properties-framework/ and -# https://lammps.sandia.gov/doc/kim_commands.html for further details). -# Then the created property instance is written to a file named results.edn -# using the `kim_property dump` commands. +# This example is designed to calculate the cohesive energy corresponding to +# the equilibrium FCC lattice constant for +# `LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004` model for +# argon. The material properties computed in LAMMPS are represented as a +# standard KIM property instance format. (See +# `https://openkim.org/doc/schema/properties-framework/` and +# `https://lammps.sandia.gov/doc/kim_commands.html` for further details). +# Then the created property instance is written to a file named `results.edn` +# using the `kim property dump` command. # # Requirement: -# -# This example requires LAMMPS built with the Python 3.6 or later package -# installed. See the `https://lammps.sandia.gov/doc/python.html` doc page for +# +# This example requires LAMMPS built with the Python 3.6 or later package +# installed. See the `https://lammps.sandia.gov/doc/python.html` doc page for # more info on building LAMMPS with the version of Python on your system. -# After successfully building LAMMPS with Python, you need to install the -# kim-property Python package, See the -# `https://lammps.sandia.gov/doc/Build_extras.html#kim` doc page for +# After successfully building LAMMPS with Python, you need to install the +# kim-property Python package, See the +# `https://lammps.sandia.gov/doc/Build_extras.html#kim` doc page for # further details. # # This example requires that the KIM Portable Model (PM) # `LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004` -# is installed. 
This can be done with the command -# `kim-api-collections-management install user LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004` +# is installed. This can be done with the command +# kim-api-collections-management install user LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004 # If this command does not work, you may need to setup your PATH to find the utility. # If you installed the kim-api using the LAMMPS CMake build, you can do the following # (where the current working directory is assumed to be the LAMMPS build directory) @@ -38,14 +38,14 @@ # source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate # (where you should relplace X.Y.Z with the appropriate kim-api version number). # -# Or, see https://openkim.org/doc/obtaining-models for alternative options. +# Or, see `https://openkim.org/doc/obtaining-models` for alternative options. # # Initialize interatomic potential (KIM model) and units atom_style atomic # Set the OpenKIM model that will be used -kim_init LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004 metal +kim init LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004 metal # the equilibrium lattice constant for the fcc structure variable lattice_constant equal 5.248509056866169 @@ -55,14 +55,14 @@ boundary p p p # Create an FCC lattice with the lattice spacing # using a single conventional (orthogonal) unit cell -lattice fcc ${lattice_constant} -region box block 0 1 0 1 0 1 units lattice -create_box 1 box +lattice fcc ${lattice_constant} +region box block 0 1 0 1 0 1 units lattice +create_box 1 box create_atoms 1 box -mass 1 39.948 +mass 1 39.948 # Specify the KIM interactions -kim_interactions Ar +kim interactions Ar # Compute energy run 0 @@ -72,10 +72,10 @@ variable natoms equal "count(all)" variable ecohesive equal "-pe/v_natoms" # Create a property instance -kim_property create 1 cohesive-potential-energy-cubic-crystal +kim property create 1 cohesive-potential-energy-cubic-crystal # Set all the key-value pairs for this property instance -kim_property modify 1 key short-name source-value 1 fcc & +kim property modify 1 key short-name source-value 1 fcc & key species source-value 1 Ar & key a source-value ${lattice_constant} & source-unit angstrom & @@ -88,4 +88,4 @@ kim_property modify 1 key short-name source-value 1 fcc source-unit eV # Dump the results in a file -kim_property dump "results.edn" +kim property dump "results.edn" diff --git a/examples/kim/in.kim-pm-query.melt b/examples/kim/in.kim-pm-query.melt index fa04d90436..9e1e04000d 100644 --- a/examples/kim/in.kim-pm-query.melt +++ b/examples/kim/in.kim-pm-query.melt @@ -1,7 +1,7 @@ # 3d Lennard-Jones melt # # This example requires that the KIM Portable Model (PM) -# SW_StillingerWeber_1985_Si__MO_405512056662_005 +# `SW_StillingerWeber_1985_Si__MO_405512056662_005` # is installed. This can be done with the command # kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 # If this command does not work, you may need to setup your PATH to find the utility. @@ -13,34 +13,34 @@ # source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate # (where you should relplace X.Y.Z with the appropriate kim-api version number). # -# Or, see https://openkim.org/doc/obtaining-models for alternative options. +# Or, see `https://openkim.org/doc/obtaining-models` for alternative options. 
# -variable x index 1 -variable y index 1 -variable z index 1 +variable x index 1 +variable y index 1 +variable z index 1 -variable xx equal 20*$x -variable yy equal 20*$y -variable zz equal 20*$z +variable xx equal 20*$x +variable yy equal 20*$y +variable zz equal 20*$z -kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 real -kim_query a0 get_lattice_constant_cubic crystal=["fcc"] species=["Si"] units=["angstrom"] +kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 real +kim query a0 get_lattice_constant_cubic crystal=["fcc"] species=["Si"] units=["angstrom"] -lattice fcc ${a0} -region box block 0 ${xx} 0 ${yy} 0 ${zz} -create_box 1 box -create_atoms 1 box +lattice fcc ${a0} +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box -kim_interactions Si +kim interactions Si -mass 1 39.95 -velocity all create 200.0 232345 loop geom +mass 1 39.95 +velocity all create 200.0 232345 loop geom -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 -run 100 +run 100 diff --git a/examples/kim/in.kim-pm.melt b/examples/kim/in.kim-pm.melt index 9959a66793..46150d8c54 100644 --- a/examples/kim/in.kim-pm.melt +++ b/examples/kim/in.kim-pm.melt @@ -1,7 +1,7 @@ # 3d Lennard-Jones melt # # This example requires that the KIM Portable Model (PM) -# SW_StillingerWeber_1985_Si__MO_405512056662_005 +# `SW_StillingerWeber_1985_Si__MO_405512056662_005` # is installed. This can be done with the command # kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 # If this command does not work, you may need to setup your PATH to find the utility. @@ -13,33 +13,33 @@ # source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate # (where you should relplace X.Y.Z with the appropriate kim-api version number). # -# Or, see https://openkim.org/doc/obtaining-models for alternative options. +# Or, see `https://openkim.org/doc/obtaining-models` for alternative options. # -variable x index 1 -variable y index 1 -variable z index 1 +variable x index 1 +variable y index 1 +variable z index 1 -variable xx equal 20*$x -variable yy equal 20*$y -variable zz equal 20*$z +variable xx equal 20*$x +variable yy equal 20*$y +variable zz equal 20*$z -kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 real +kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 real -lattice fcc 4.4300 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -create_box 1 box -create_atoms 1 box +lattice fcc 4.4300 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box -kim_interactions Si +kim interactions Si -mass 1 39.95 -velocity all create 200.0 232345 loop geom +mass 1 39.95 +velocity all create 200.0 232345 loop geom -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 -run 100 +run 100 diff --git a/examples/kim/in.kim-sm.melt b/examples/kim/in.kim-sm.melt index 0ee8e9a857..1c49ead229 100644 --- a/examples/kim/in.kim-sm.melt +++ b/examples/kim/in.kim-sm.melt @@ -1,8 +1,8 @@ # 3d Lennard-Jones melt # # This example requires that the KIM Simulator Model (PM) -# Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 -# is installed. 
This can be done with the command +# `Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000` +# is installed. This can be done with the command # kim-api-collections-management install user Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 # If this command does not work, you may need to setup your PATH to find the utility. # If you installed the kim-api using the LAMMPS CMake build, you can do the following @@ -13,33 +13,33 @@ # source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate # (where you should relplace X.Y.Z with the appropriate kim-api version number). # -# See https://openkim.org/doc/obtaining-models for alternative options. +# See `https://openkim.org/doc/obtaining-models` for alternative options. # -variable x index 1 -variable y index 1 -variable z index 1 +variable x index 1 +variable y index 1 +variable z index 1 -variable xx equal 20*$x -variable yy equal 20*$y -variable zz equal 20*$z +variable xx equal 20*$x +variable yy equal 20*$y +variable zz equal 20*$z -kim_init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 real +kim init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 real -lattice fcc 4.4300 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -create_box 1 box -create_atoms 1 box +lattice fcc 4.4300 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box -kim_interactions O +kim interactions O -mass 1 39.95 -velocity all create 200.0 232345 loop geom +mass 1 39.95 +velocity all create 200.0 232345 loop geom -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 -run 100 +run 100 diff --git a/examples/kim/in.lammps.melt b/examples/kim/in.lammps.melt index 5792f3a5db..fbedb61985 100644 --- a/examples/kim/in.lammps.melt +++ b/examples/kim/in.lammps.melt @@ -1,33 +1,33 @@ # 3d Lennard-Jones melt -variable x index 1 -variable y index 1 -variable z index 1 +variable x index 1 +variable y index 1 +variable z index 1 -variable xx equal 20*$x -variable yy equal 20*$y -variable zz equal 20*$z +variable xx equal 20*$x +variable yy equal 20*$y +variable zz equal 20*$z -units real +units real -lattice fcc 4.4300 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -create_box 1 box -create_atoms 1 box +lattice fcc 4.4300 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box -pair_style lj/cut 8.1500 -pair_coeff 1 1 0.0104 3.4000 +pair_style lj/cut 8.1500 +pair_coeff 1 1 0.0104 3.4000 -#pair_style kim LennardJones_Ar -#pair_coeff * * Ar +#pair_style kim LennardJones_Ar +#pair_coeff * * Ar -mass 1 39.95 -velocity all create 200.0 232345 loop geom +mass 1 39.95 +velocity all create 200.0 232345 loop geom -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 -run 100 +run 100 From 12a9b6165a0da00f67279fd8271ff9fcb904a325 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 06:01:38 -0600 Subject: [PATCH 021/116] remove old log files --- .../kim/log.7Aug19.in.kim-ex.melt.clang.1 | 107 --------------- .../kim/log.7Aug19.in.kim-ex.melt.clang.4 | 113 ---------------- .../log.7Aug19.in.kim-pm-query.melt.clang.1 | 124 ------------------ 
.../log.7Aug19.in.kim-pm-query.melt.clang.4 | 124 ------------------ .../kim/log.7Aug19.in.kim-pm.melt.clang.1 | 118 ----------------- .../kim/log.7Aug19.in.kim-pm.melt.clang.4 | 118 ----------------- .../kim/log.7Aug19.in.kim-sm.melt.clang.1 | 71 ---------- .../kim/log.7Aug19.in.kim-sm.melt.clang.4 | 60 --------- .../kim/log.7Aug19.in.lammps.melt.clang.1 | 92 ------------- .../kim/log.7Aug19.in.lammps.melt.clang.4 | 92 ------------- 10 files changed, 1019 deletions(-) delete mode 100644 examples/kim/log.7Aug19.in.kim-ex.melt.clang.1 delete mode 100644 examples/kim/log.7Aug19.in.kim-ex.melt.clang.4 delete mode 100644 examples/kim/log.7Aug19.in.kim-pm-query.melt.clang.1 delete mode 100644 examples/kim/log.7Aug19.in.kim-pm-query.melt.clang.4 delete mode 100644 examples/kim/log.7Aug19.in.kim-pm.melt.clang.1 delete mode 100644 examples/kim/log.7Aug19.in.kim-pm.melt.clang.4 delete mode 100644 examples/kim/log.7Aug19.in.kim-sm.melt.clang.1 delete mode 100644 examples/kim/log.7Aug19.in.kim-sm.melt.clang.4 delete mode 100644 examples/kim/log.7Aug19.in.lammps.melt.clang.1 delete mode 100644 examples/kim/log.7Aug19.in.lammps.melt.clang.4 diff --git a/examples/kim/log.7Aug19.in.kim-ex.melt.clang.1 b/examples/kim/log.7Aug19.in.kim-ex.melt.clang.1 deleted file mode 100644 index 17fa1bc534..0000000000 --- a/examples/kim/log.7Aug19.in.kim-ex.melt.clang.1 +++ /dev/null @@ -1,107 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt -# -# This example requires that the example models provided with -# the kim-api package are installed. see the ./lib/kim/README or -# ./lib/kim/Install.py files for details on how to install these -# example models. -# - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -kim_init LennardJones_Ar real -#=== BEGIN kim-init ========================================== -units real -#=== END kim-init ============================================ - - -lattice fcc 4.4300 -Lattice spacing in x,y,z = 4.43 4.43 4.43 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (88.6 88.6 88.6) - 1 by 1 by 1 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.004321 secs - -kim_interactions Ar -#=== BEGIN kim_interactions ================================== -pair_style kim LennardJones_Ar -WARNING: KIM Model does not provide `partialParticleEnergy'; energy per atom will be zero (../pair_kim.cpp:974) -WARNING: KIM Model does not provide `partialParticleVirial'; virial per atom will be zero (../pair_kim.cpp:979) -pair_coeff * * Ar -#=== END kim_interactions ==================================== - - -mass 1 39.95 -velocity all create 200.0 232345 loop geom - -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes - -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 - -run 100 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 8.45 - ghost atom cutoff = 8.45 - binsize = 4.225, bins = 21 21 21 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair kim, perpetual - attributes: full, newton off, cut 8.45 - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Setting up Verlet run ... 
- Unit style : real - Current step : 0 - Time step : 1 -Per MPI rank memory allocation (min/avg/max) = 28.12 | 28.12 | 28.12 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 200 145069.63 0 164146.22 128015.94 - 100 95.179703 154939.42 0 164017.94 131602.75 -Loop time of 3.48256 on 1 procs for 100 steps with 32000 atoms - -Performance: 2.481 ns/day, 9.674 hours/ns, 28.715 timesteps/s -98.3% CPU use with 1 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 3.0502 | 3.0502 | 3.0502 | 0.0 | 87.59 -Neigh | 0.3646 | 0.3646 | 0.3646 | 0.0 | 10.47 -Comm | 0.01783 | 0.01783 | 0.01783 | 0.0 | 0.51 -Output | 6.8e-05 | 6.8e-05 | 6.8e-05 | 0.0 | 0.00 -Modify | 0.034349 | 0.034349 | 0.034349 | 0.0 | 0.99 -Other | | 0.01547 | | | 0.44 - -Nlocal: 32000 ave 32000 max 32000 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 19911 ave 19911 max 19911 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 0 ave 0 max 0 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -FullNghs: 4.25375e+06 ave 4.25375e+06 max 4.25375e+06 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 4253750 -Ave neighs/atom = 132.93 -Neighbor list builds = 3 -Dangerous builds = 0 -Total wall time: 0:00:03 diff --git a/examples/kim/log.7Aug19.in.kim-ex.melt.clang.4 b/examples/kim/log.7Aug19.in.kim-ex.melt.clang.4 deleted file mode 100644 index 8e076815fc..0000000000 --- a/examples/kim/log.7Aug19.in.kim-ex.melt.clang.4 +++ /dev/null @@ -1,113 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt -# -# This example requires that the example models provided with -# the kim-api package are installed. see the ./lib/kim/README or -# ./lib/kim/Install.py files for details on how to install these -# example models. 
-# - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -kim_init LennardJones_Ar real -#=== BEGIN kim-init ========================================== -units real -#=== END kim-init ============================================ - - -lattice fcc 4.4300 -Lattice spacing in x,y,z = 4.43 4.43 4.43 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (88.6 88.6 88.6) - 1 by 2 by 2 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.000989 secs - -kim_interactions Ar -#=== BEGIN kim_interactions ================================== -pair_style kim LennardJones_Ar -WARNING: KIM Model does not provide `partialParticleEnergy'; energy per atom will be zero (../pair_kim.cpp:974) -WARNING: KIM Model does not provide `partialParticleVirial'; virial per atom will be zero (../pair_kim.cpp:979) -pair_coeff * * Ar -WARNING: KIM Model does not provide `partialParticleEnergy'; energy per atom will be zero (../pair_kim.cpp:974) -WARNING: KIM Model does not provide `partialParticleVirial'; virial per atom will be zero (../pair_kim.cpp:979) -#=== END kim_interactions ==================================== - - -mass 1 39.95 -velocity all create 200.0 232345 loop geom -WARNING: KIM Model does not provide `partialParticleEnergy'; energy per atom will be zero (../pair_kim.cpp:974) -WARNING: KIM Model does not provide `partialParticleVirial'; virial per atom will be zero (../pair_kim.cpp:979) -WARNING: KIM Model does not provide `partialParticleEnergy'; energy per atom will be zero (../pair_kim.cpp:974) -WARNING: KIM Model does not provide `partialParticleVirial'; virial per atom will be zero (../pair_kim.cpp:979) - -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes - -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 - -run 100 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 8.45 - ghost atom cutoff = 8.45 - binsize = 4.225, bins = 21 21 21 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair kim, perpetual - attributes: full, newton off, cut 8.45 - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Setting up Verlet run ... 
- Unit style : real - Current step : 0 - Time step : 1 -Per MPI rank memory allocation (min/avg/max) = 9.791 | 9.791 | 9.791 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 200 145069.63 0 164146.22 128015.94 - 100 95.179703 154939.42 0 164017.94 131602.75 -Loop time of 0.924494 on 4 procs for 100 steps with 32000 atoms - -Performance: 9.346 ns/day, 2.568 hours/ns, 108.167 timesteps/s -99.6% CPU use with 4 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.76434 | 0.76847 | 0.77207 | 0.3 | 83.12 -Neigh | 0.09089 | 0.094446 | 0.099911 | 1.1 | 10.22 -Comm | 0.038599 | 0.044759 | 0.051381 | 2.1 | 4.84 -Output | 3.5e-05 | 4e-05 | 4.9e-05 | 0.0 | 0.00 -Modify | 0.009396 | 0.009685 | 0.009941 | 0.2 | 1.05 -Other | | 0.00709 | | | 0.77 - -Nlocal: 8000 ave 8018 max 7967 min -Histogram: 1 0 0 0 0 0 1 0 0 2 -Nghost: 9131 ave 9164 max 9113 min -Histogram: 2 0 0 1 0 0 0 0 0 1 -Neighs: 0 ave 0 max 0 min -Histogram: 4 0 0 0 0 0 0 0 0 0 -FullNghs: 1.06344e+06 ave 1.06594e+06 max 1.05881e+06 min -Histogram: 1 0 0 0 0 0 1 0 0 2 - -Total # of neighbors = 4253750 -Ave neighs/atom = 132.93 -Neighbor list builds = 3 -Dangerous builds = 0 -Total wall time: 0:00:00 diff --git a/examples/kim/log.7Aug19.in.kim-pm-query.melt.clang.1 b/examples/kim/log.7Aug19.in.kim-pm-query.melt.clang.1 deleted file mode 100644 index 1ca44c98ef..0000000000 --- a/examples/kim/log.7Aug19.in.kim-pm-query.melt.clang.1 +++ /dev/null @@ -1,124 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt -# -# This example requires that the KIM Portable Model (PM) -# SW_StillingerWeber_1985_Si__MO_405512056662_005 -# is installed. This can be done with the command -# kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 -# If this command does not work, you may need to setup your PATH to find the utility. -# If you installed the kim-api using the LAMMPS CMake build, you can do the following -# (where the current working directory is assumed to be the LAMMPS build directory) -# source ./kim_build-prefix/bin/kim-api-activate -# If you installed the kim-api using the LAMMPS Make build, you can do the following -# (where the current working directory is assumed to be the LAMMPS src directory) -# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate -# (where you should relplace X.Y.Z with the appropriate kim-api version number). -# -# Or, see https://openkim.org/doc/obtaining-models for alternative options. 
-# - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 real -#=== BEGIN kim-init ========================================== -units real -#=== END kim-init ============================================ - -kim_query a0 get_lattice_constant_cubic crystal=["fcc"] species=["Si"] units=["angstrom"] -#=== BEGIN kim-query ========================================= -variable a0 string 4.146581932902336 -#=== END kim-query =========================================== - - -lattice fcc ${a0} -lattice fcc 4.146581932902336 -Lattice spacing in x,y,z = 4.14658 4.14658 4.14658 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (82.9316 82.9316 82.9316) - 1 by 1 by 1 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.005415 secs - -kim_interactions Si -#=== BEGIN kim_interactions ================================== -pair_style kim SW_StillingerWeber_1985_Si__MO_405512056662_005 -pair_coeff * * Si -#=== END kim_interactions ==================================== - - -mass 1 39.95 -velocity all create 200.0 232345 loop geom - -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes - -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 - -run 100 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 4.07118 - ghost atom cutoff = 4.07118 - binsize = 2.03559, bins = 41 41 41 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair kim, perpetual - attributes: full, newton off, cut 4.07118 - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Setting up Verlet run ... 
- Unit style : real - Current step : 0 - Time step : 1 -Per MPI rank memory allocation (min/avg/max) = 10.36 | 10.36 | 10.36 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 200 -126084.25 0 -107007.66 1528.8768 - 100 94.450495 -116016.03 0 -107007.07 2282.2685 -Loop time of 74.6055 on 1 procs for 100 steps with 32000 atoms - -Performance: 0.116 ns/day, 207.238 hours/ns, 1.340 timesteps/s -98.6% CPU use with 1 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 74.446 | 74.446 | 74.446 | 0.0 | 99.79 -Neigh | 0.096611 | 0.096611 | 0.096611 | 0.0 | 0.13 -Comm | 0.014594 | 0.014594 | 0.014594 | 0.0 | 0.02 -Output | 7.9e-05 | 7.9e-05 | 7.9e-05 | 0.0 | 0.00 -Modify | 0.03454 | 0.03454 | 0.03454 | 0.0 | 0.05 -Other | | 0.01396 | | | 0.02 - -Nlocal: 32000 ave 32000 max 32000 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 9667 ave 9667 max 9667 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 0 ave 0 max 0 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -FullNghs: 450192 ave 450192 max 450192 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 450192 -Ave neighs/atom = 14.0685 -Neighbor list builds = 3 -Dangerous builds = 0 - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:01:16 diff --git a/examples/kim/log.7Aug19.in.kim-pm-query.melt.clang.4 b/examples/kim/log.7Aug19.in.kim-pm-query.melt.clang.4 deleted file mode 100644 index 8c4148ce15..0000000000 --- a/examples/kim/log.7Aug19.in.kim-pm-query.melt.clang.4 +++ /dev/null @@ -1,124 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt -# -# This example requires that the KIM Portable Model (PM) -# SW_StillingerWeber_1985_Si__MO_405512056662_005 -# is installed. This can be done with the command -# kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 -# If this command does not work, you may need to setup your PATH to find the utility. -# If you installed the kim-api using the LAMMPS CMake build, you can do the following -# (where the current working directory is assumed to be the LAMMPS build directory) -# source ./kim_build-prefix/bin/kim-api-activate -# If you installed the kim-api using the LAMMPS Make build, you can do the following -# (where the current working directory is assumed to be the LAMMPS src directory) -# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate -# (where you should relplace X.Y.Z with the appropriate kim-api version number). -# -# Or, see https://openkim.org/doc/obtaining-models for alternative options. 
-# - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 real -#=== BEGIN kim-init ========================================== -units real -#=== END kim-init ============================================ - -kim_query a0 get_lattice_constant_cubic crystal=["fcc"] species=["Si"] units=["angstrom"] -#=== BEGIN kim-query ========================================= -variable a0 string 4.146581932902336 -#=== END kim-query =========================================== - - -lattice fcc ${a0} -lattice fcc 4.146581932902336 -Lattice spacing in x,y,z = 4.14658 4.14658 4.14658 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (82.9316 82.9316 82.9316) - 1 by 2 by 2 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.000946 secs - -kim_interactions Si -#=== BEGIN kim_interactions ================================== -pair_style kim SW_StillingerWeber_1985_Si__MO_405512056662_005 -pair_coeff * * Si -#=== END kim_interactions ==================================== - - -mass 1 39.95 -velocity all create 200.0 232345 loop geom - -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes - -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 - -run 100 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 4.07118 - ghost atom cutoff = 4.07118 - binsize = 2.03559, bins = 41 41 41 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair kim, perpetual - attributes: full, newton off, cut 4.07118 - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Setting up Verlet run ... 
- Unit style : real - Current step : 0 - Time step : 1 -Per MPI rank memory allocation (min/avg/max) = 3.489 | 3.489 | 3.489 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 200 -126084.25 0 -107007.66 1528.8768 - 100 94.450495 -116016.03 0 -107007.07 2282.2685 -Loop time of 19.0792 on 4 procs for 100 steps with 32000 atoms - -Performance: 0.453 ns/day, 52.998 hours/ns, 5.241 timesteps/s -99.4% CPU use with 4 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 18.78 | 18.855 | 18.937 | 1.5 | 98.83 -Neigh | 0.026047 | 0.026274 | 0.0266 | 0.1 | 0.14 -Comm | 0.09039 | 0.17196 | 0.24675 | 15.9 | 0.90 -Output | 3.9e-05 | 4.975e-05 | 6.1e-05 | 0.0 | 0.00 -Modify | 0.015667 | 0.015819 | 0.016008 | 0.1 | 0.08 -Other | | 0.01008 | | | 0.05 - -Nlocal: 8000 ave 8029 max 7968 min -Histogram: 1 1 0 0 0 0 0 0 0 2 -Nghost: 4259 ave 4303 max 4202 min -Histogram: 1 0 0 0 0 0 2 0 0 1 -Neighs: 0 ave 0 max 0 min -Histogram: 4 0 0 0 0 0 0 0 0 0 -FullNghs: 112548 ave 113091 max 111995 min -Histogram: 1 0 0 1 0 0 0 1 0 1 - -Total # of neighbors = 450192 -Ave neighs/atom = 14.0685 -Neighbor list builds = 3 -Dangerous builds = 0 - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:00:20 diff --git a/examples/kim/log.7Aug19.in.kim-pm.melt.clang.1 b/examples/kim/log.7Aug19.in.kim-pm.melt.clang.1 deleted file mode 100644 index f5845d7fc4..0000000000 --- a/examples/kim/log.7Aug19.in.kim-pm.melt.clang.1 +++ /dev/null @@ -1,118 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt -# -# This example requires that the KIM Portable Model (PM) -# SW_StillingerWeber_1985_Si__MO_405512056662_005 -# is installed. This can be done with the command -# kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 -# If this command does not work, you may need to setup your PATH to find the utility. -# If you installed the kim-api using the LAMMPS CMake build, you can do the following -# (where the current working directory is assumed to be the LAMMPS build directory) -# source ./kim_build-prefix/bin/kim-api-activate -# If you installed the kim-api using the LAMMPS Make build, you can do the following -# (where the current working directory is assumed to be the LAMMPS src directory) -# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate -# (where you should relplace X.Y.Z with the appropriate kim-api version number). -# -# Or, see https://openkim.org/doc/obtaining-models for alternative options. 
-# - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 real -#=== BEGIN kim-init ========================================== -units real -#=== END kim-init ============================================ - - -lattice fcc 4.4300 -Lattice spacing in x,y,z = 4.43 4.43 4.43 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (88.6 88.6 88.6) - 1 by 1 by 1 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.003591 secs - -kim_interactions Si -#=== BEGIN kim_interactions ================================== -pair_style kim SW_StillingerWeber_1985_Si__MO_405512056662_005 -pair_coeff * * Si -#=== END kim_interactions ==================================== - - -mass 1 39.95 -velocity all create 200.0 232345 loop geom - -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes - -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 - -run 100 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 4.07118 - ghost atom cutoff = 4.07118 - binsize = 2.03559, bins = 44 44 44 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair kim, perpetual - attributes: full, newton off, cut 4.07118 - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 1 -Per MPI rank memory allocation (min/avg/max) = 10.44 | 10.44 | 10.44 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 200 -85249.847 0 -66173.259 -33302.387 - 100 253.43357 -90346.68 0 -66173.441 -14888.698 -Loop time of 74.248 on 1 procs for 100 steps with 32000 atoms - -Performance: 0.116 ns/day, 206.244 hours/ns, 1.347 timesteps/s -98.8% CPU use with 1 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 74.118 | 74.118 | 74.118 | 0.0 | 99.83 -Neigh | 0.069623 | 0.069623 | 0.069623 | 0.0 | 0.09 -Comm | 0.0137 | 0.0137 | 0.0137 | 0.0 | 0.02 -Output | 7.6e-05 | 7.6e-05 | 7.6e-05 | 0.0 | 0.00 -Modify | 0.031883 | 0.031883 | 0.031883 | 0.0 | 0.04 -Other | | 0.01433 | | | 0.02 - -Nlocal: 32000 ave 32000 max 32000 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 7760 ave 7760 max 7760 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 0 ave 0 max 0 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -FullNghs: 402352 ave 402352 max 402352 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 402352 -Ave neighs/atom = 12.5735 -Neighbor list builds = 4 -Dangerous builds = 0 - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:01:14 diff --git a/examples/kim/log.7Aug19.in.kim-pm.melt.clang.4 b/examples/kim/log.7Aug19.in.kim-pm.melt.clang.4 deleted file mode 100644 index 0b4632b999..0000000000 --- a/examples/kim/log.7Aug19.in.kim-pm.melt.clang.4 +++ /dev/null @@ -1,118 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt -# -# This example requires that the KIM Portable Model (PM) -# SW_StillingerWeber_1985_Si__MO_405512056662_005 -# is installed. 
This can be done with the command -# kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 -# If this command does not work, you may need to setup your PATH to find the utility. -# If you installed the kim-api using the LAMMPS CMake build, you can do the following -# (where the current working directory is assumed to be the LAMMPS build directory) -# source ./kim_build-prefix/bin/kim-api-activate -# If you installed the kim-api using the LAMMPS Make build, you can do the following -# (where the current working directory is assumed to be the LAMMPS src directory) -# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate -# (where you should relplace X.Y.Z with the appropriate kim-api version number). -# -# Or, see https://openkim.org/doc/obtaining-models for alternative options. -# - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 real -#=== BEGIN kim-init ========================================== -units real -#=== END kim-init ============================================ - - -lattice fcc 4.4300 -Lattice spacing in x,y,z = 4.43 4.43 4.43 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (88.6 88.6 88.6) - 1 by 2 by 2 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.000997 secs - -kim_interactions Si -#=== BEGIN kim_interactions ================================== -pair_style kim SW_StillingerWeber_1985_Si__MO_405512056662_005 -pair_coeff * * Si -#=== END kim_interactions ==================================== - - -mass 1 39.95 -velocity all create 200.0 232345 loop geom - -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes - -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 - -run 100 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 4.07118 - ghost atom cutoff = 4.07118 - binsize = 2.03559, bins = 44 44 44 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair kim, perpetual - attributes: full, newton off, cut 4.07118 - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Setting up Verlet run ... 
- Unit style : real - Current step : 0 - Time step : 1 -Per MPI rank memory allocation (min/avg/max) = 3.517 | 3.517 | 3.517 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 200 -85249.847 0 -66173.259 -33302.387 - 100 253.43357 -90346.68 0 -66173.441 -14888.698 -Loop time of 19.0287 on 4 procs for 100 steps with 32000 atoms - -Performance: 0.454 ns/day, 52.857 hours/ns, 5.255 timesteps/s -99.1% CPU use with 4 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 18.81 | 18.838 | 18.883 | 0.6 | 99.00 -Neigh | 0.018598 | 0.01914 | 0.020732 | 0.7 | 0.10 -Comm | 0.10341 | 0.1475 | 0.17393 | 7.1 | 0.78 -Output | 6e-05 | 6.225e-05 | 6.7e-05 | 0.0 | 0.00 -Modify | 0.014839 | 0.014925 | 0.015047 | 0.1 | 0.08 -Other | | 0.008997 | | | 0.05 - -Nlocal: 8000 ave 8014 max 7988 min -Histogram: 1 1 0 0 0 0 1 0 0 1 -Nghost: 3374.75 ave 3389 max 3361 min -Histogram: 1 0 1 0 0 0 0 1 0 1 -Neighs: 0 ave 0 max 0 min -Histogram: 4 0 0 0 0 0 0 0 0 0 -FullNghs: 100588 ave 100856 max 100392 min -Histogram: 1 0 1 0 1 0 0 0 0 1 - -Total # of neighbors = 402352 -Ave neighs/atom = 12.5735 -Neighbor list builds = 4 -Dangerous builds = 0 - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:00:19 diff --git a/examples/kim/log.7Aug19.in.kim-sm.melt.clang.1 b/examples/kim/log.7Aug19.in.kim-sm.melt.clang.1 deleted file mode 100644 index 1b77e58a3a..0000000000 --- a/examples/kim/log.7Aug19.in.kim-sm.melt.clang.1 +++ /dev/null @@ -1,71 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt -# -# This example requires that the KIM Simulator Model (PM) -# Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 -# is installed. This can be done with the command -# kim-api-collections-management install user Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 -# If this command does not work, you may need to setup your PATH to find the utility. -# If you installed the kim-api using the LAMMPS CMake build, you can do the following -# (where the current working directory is assumed to be the LAMMPS build directory) -# source ./kim_build-prefix/bin/kim-api-activate -# If you installed the kim-api using the LAMMPS Make build, you can do the following -# (where the current working directory is assumed to be the LAMMPS src directory) -# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate -# (where you should relplace X.Y.Z with the appropriate kim-api version number). -# -# See https://openkim.org/doc/obtaining-models for alternative options. 
-# - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -kim_init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 real -#=== BEGIN kim-init ========================================== -# Using KIM Simulator Model : Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 -# For Simulator : LAMMPS 28 Feb 2019 -# Running on : LAMMPS 7 Aug 2019 -# -units real -atom_style charge -neigh_modify one 4000 -#=== END kim-init ============================================ - - -lattice fcc 4.4300 -Lattice spacing in x,y,z = 4.43 4.43 4.43 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (88.6 88.6 88.6) - 1 by 1 by 1 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.003447 secs - -kim_interactions O -#=== BEGIN kim_interactions ================================== -pair_style reax/c /var/tmp/kim-simulator-model-parameter-file-directory-6Acs1QDbXgBx/lmp_control safezone 2.0 mincap 100 -ERROR: Unrecognized pair style 'reax/c' is part of the USER-REAXC package which is not enabled in this LAMMPS binary. (../force.cpp:262) -Last command: pair_style reax/c /var/tmp/kim-simulator-model-parameter-file-directory-6Acs1QDbXgBx/lmp_control safezone 2.0 mincap 100 --------------------------------------------------------------------------- -Primary job terminated normally, but 1 process returned -a non-zero exit code. Per user-direction, the job has been aborted. --------------------------------------------------------------------------- --------------------------------------------------------------------------- -mpirun detected that one or more processes exited with non-zero status, thus causing -the job to be terminated. The first process to do so was: - - Process name: [[33054,1],0] - Exit code: 1 --------------------------------------------------------------------------- diff --git a/examples/kim/log.7Aug19.in.kim-sm.melt.clang.4 b/examples/kim/log.7Aug19.in.kim-sm.melt.clang.4 deleted file mode 100644 index 72b62beffb..0000000000 --- a/examples/kim/log.7Aug19.in.kim-sm.melt.clang.4 +++ /dev/null @@ -1,60 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt -# -# This example requires that the KIM Simulator Model (PM) -# Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 -# is installed. This can be done with the command -# kim-api-collections-management install user Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 -# If this command does not work, you may need to setup your PATH to find the utility. -# If you installed the kim-api using the LAMMPS CMake build, you can do the following -# (where the current working directory is assumed to be the LAMMPS build directory) -# source ./kim_build-prefix/bin/kim-api-activate -# If you installed the kim-api using the LAMMPS Make build, you can do the following -# (where the current working directory is assumed to be the LAMMPS src directory) -# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate -# (where you should relplace X.Y.Z with the appropriate kim-api version number). -# -# See https://openkim.org/doc/obtaining-models for alternative options. 
-# - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -kim_init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 real -#=== BEGIN kim-init ========================================== -# Using KIM Simulator Model : Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 -# For Simulator : LAMMPS 28 Feb 2019 -# Running on : LAMMPS 7 Aug 2019 -# -units real -atom_style charge -neigh_modify one 4000 -#=== END kim-init ============================================ - - -lattice fcc 4.4300 -Lattice spacing in x,y,z = 4.43 4.43 4.43 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (88.6 88.6 88.6) - 1 by 2 by 2 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.001307 secs - -kim_interactions O -#=== BEGIN kim_interactions ================================== -pair_style reax/c /var/tmp/kim-simulator-model-parameter-file-directory-6tmKtZEXzhgv/lmp_control safezone 2.0 mincap 100 -ERROR: Unrecognized pair style 'reax/c' is part of the USER-REAXC package which is not enabled in this LAMMPS binary. (../force.cpp:262) -Last command: pair_style reax/c /var/tmp/kim-simulator-model-parameter-file-directory-6tmKtZEXzhgv/lmp_control safezone 2.0 mincap 100 diff --git a/examples/kim/log.7Aug19.in.lammps.melt.clang.1 b/examples/kim/log.7Aug19.in.lammps.melt.clang.1 deleted file mode 100644 index f697504777..0000000000 --- a/examples/kim/log.7Aug19.in.lammps.melt.clang.1 +++ /dev/null @@ -1,92 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -units real - -lattice fcc 4.4300 -Lattice spacing in x,y,z = 4.43 4.43 4.43 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (88.6 88.6 88.6) - 1 by 1 by 1 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.003037 secs - -pair_style lj/cut 8.1500 -pair_coeff 1 1 0.0104 3.4000 - -#pair_style kim LennardJones_Ar -#pair_coeff * * Ar - -mass 1 39.95 -velocity all create 200.0 232345 loop geom - -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes - -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 - -run 100 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 8.45 - ghost atom cutoff = 8.45 - binsize = 4.225, bins = 21 21 21 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair lj/cut, perpetual - attributes: half, newton on - pair build: half/bin/atomonly/newton - stencil: half/bin/3d/newton - bin: standard -Setting up Verlet run ... 
- Unit style : real - Current step : 0 - Time step : 1 -Per MPI rank memory allocation (min/avg/max) = 19.23 | 19.23 | 19.23 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 200 6290.8194 0 25367.408 6750.7421 - 100 98.747096 15900.676 0 25319.465 10184.453 -Loop time of 2.43768 on 1 procs for 100 steps with 32000 atoms - -Performance: 3.544 ns/day, 6.771 hours/ns, 41.023 timesteps/s -97.8% CPU use with 1 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 2.1895 | 2.1895 | 2.1895 | 0.0 | 89.82 -Neigh | 0.17546 | 0.17546 | 0.17546 | 0.0 | 7.20 -Comm | 0.021001 | 0.021001 | 0.021001 | 0.0 | 0.86 -Output | 7.9e-05 | 7.9e-05 | 7.9e-05 | 0.0 | 0.00 -Modify | 0.034253 | 0.034253 | 0.034253 | 0.0 | 1.41 -Other | | 0.01735 | | | 0.71 - -Nlocal: 32000 ave 32000 max 32000 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 19911 ave 19911 max 19911 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 1.96027e+06 ave 1.96027e+06 max 1.96027e+06 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 1960266 -Ave neighs/atom = 61.2583 -Neighbor list builds = 3 -Dangerous builds = 0 -Total wall time: 0:00:02 diff --git a/examples/kim/log.7Aug19.in.lammps.melt.clang.4 b/examples/kim/log.7Aug19.in.lammps.melt.clang.4 deleted file mode 100644 index 2d25348b06..0000000000 --- a/examples/kim/log.7Aug19.in.lammps.melt.clang.4 +++ /dev/null @@ -1,92 +0,0 @@ -LAMMPS (7 Aug 2019) -# 3d Lennard-Jones melt - -variable x index 1 -variable y index 1 -variable z index 1 - -variable xx equal 20*$x -variable xx equal 20*1 -variable yy equal 20*$y -variable yy equal 20*1 -variable zz equal 20*$z -variable zz equal 20*1 - -units real - -lattice fcc 4.4300 -Lattice spacing in x,y,z = 4.43 4.43 4.43 -region box block 0 ${xx} 0 ${yy} 0 ${zz} -region box block 0 20 0 ${yy} 0 ${zz} -region box block 0 20 0 20 0 ${zz} -region box block 0 20 0 20 0 20 -create_box 1 box -Created orthogonal box = (0 0 0) to (88.6 88.6 88.6) - 1 by 2 by 2 MPI processor grid -create_atoms 1 box -Created 32000 atoms - create_atoms CPU = 0.001194 secs - -pair_style lj/cut 8.1500 -pair_coeff 1 1 0.0104 3.4000 - -#pair_style kim LennardJones_Ar -#pair_coeff * * Ar - -mass 1 39.95 -velocity all create 200.0 232345 loop geom - -neighbor 0.3 bin -neigh_modify delay 0 every 1 check yes - -fix 1 all nve -#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 - -run 100 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 8.45 - ghost atom cutoff = 8.45 - binsize = 4.225, bins = 21 21 21 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair lj/cut, perpetual - attributes: half, newton on - pair build: half/bin/atomonly/newton - stencil: half/bin/3d/newton - bin: standard -Setting up Verlet run ... 
- Unit style : real - Current step : 0 - Time step : 1 -Per MPI rank memory allocation (min/avg/max) = 7.633 | 7.633 | 7.633 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 200 6290.8194 0 25367.408 6750.7421 - 100 98.747096 15900.676 0 25319.465 10184.453 -Loop time of 0.726239 on 4 procs for 100 steps with 32000 atoms - -Performance: 11.897 ns/day, 2.017 hours/ns, 137.696 timesteps/s -98.7% CPU use with 4 MPI tasks x no OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 0.57617 | 0.5835 | 0.59084 | 0.9 | 80.34 -Neigh | 0.046682 | 0.047783 | 0.048641 | 0.3 | 6.58 -Comm | 0.065469 | 0.071509 | 0.07899 | 2.3 | 9.85 -Output | 3.9e-05 | 4.6e-05 | 6.1e-05 | 0.0 | 0.01 -Modify | 0.013205 | 0.01363 | 0.014044 | 0.3 | 1.88 -Other | | 0.009775 | | | 1.35 - -Nlocal: 8000 ave 8012 max 7989 min -Histogram: 1 0 0 0 2 0 0 0 0 1 -Nghost: 9131 ave 9142 max 9119 min -Histogram: 1 0 0 0 0 2 0 0 0 1 -Neighs: 490066 ave 491443 max 489273 min -Histogram: 2 0 0 0 1 0 0 0 0 1 - -Total # of neighbors = 1960266 -Ave neighs/atom = 61.2583 -Neighbor list builds = 3 -Dangerous builds = 0 -Total wall time: 0:00:00 From 7c102a6096671026a775574d0dcf5bc1e9a357d3 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 06:02:21 -0600 Subject: [PATCH 022/116] Extend the 'kim query' command Extend the 'kim query' command to get all available models meeting certain requirements. To query for KIM models the query function is `get_available_models`. Now, the 'kim query' works with the `model` argument and can also be used with no 'kim init' call requirement. --- src/KIM/kim_query.cpp | 233 +++++++++++++++++++++++++----------------- 1 file changed, 137 insertions(+), 96 deletions(-) diff --git a/src/KIM/kim_query.cpp b/src/KIM/kim_query.cpp index ef4b7572d3..db2bd47f94 100644 --- a/src/KIM/kim_query.cpp +++ b/src/KIM/kim_query.cpp @@ -81,125 +81,161 @@ using namespace LAMMPS_NS; #if defined(LMP_KIM_CURL) +namespace { +static constexpr int kBufSize{10240}; struct WriteBuf { char *dataptr; size_t sizeleft; }; -static char *do_query(char *, char *, int, char **, int, MPI_Comm); -static size_t write_callback(void *, size_t, size_t, void *); +static char *do_query(const std::string &, const std::string &, + int, char **, int, MPI_Comm); +static size_t write_callback(void *, size_t, size_t, void *); +} // namespace #endif /* ---------------------------------------------------------------------- */ void KimQuery::command(int narg, char **arg) { - if (narg < 2) error->all(FLERR,"Illegal 'kim query' command"); + if (narg < 2) error->all(FLERR, "Illegal 'kim query' command"); - // check if we had a kim init command by finding fix STORE/KIM - // retrieve model name. 
- char *model_name; - - const int ifix = modify->find_fix("KIM_MODEL_STORE"); - if (ifix >= 0) { - FixStoreKIM *fix_store = (FixStoreKIM *) modify->fix[ifix]; - model_name = (char *)fix_store->getptr("model_name"); - } else error->all(FLERR,"Must use 'kim init' before 'kim query'"); - - char *varname = arg[0]; - - bool split = false; - if (strcmp("split",arg[1]) == 0) { - if (narg == 2) error->all(FLERR,"Illegal 'kim query' command.\nThe keyword " - "'split' must be followed by the name of " - "the query function"); - if (strcmp("list",arg[2]) == 0) - error->all(FLERR,"Illegal 'kim query' command.\nThe 'list' keyword " - "can not be used after 'split'"); - split = true; - arg++; - narg--; - } + std::string var_name{arg[0]}; + // format_arg = list, split, or index (optional): + std::string format_arg{arg[1]}; + if (format_arg == "split" || format_arg == "list" || format_arg == "index") { + if (narg == 2) { + auto msg = fmt::format("Illegal 'kim query' command.\nThe keyword '{}' " + "must be followed by the name of the query function", format_arg); + error->all(FLERR, msg); + } + ++arg; + --narg; // The “list” is the default setting - // the result is returned as a space-separated list of values in variable - if (strcmp("list",arg[1]) == 0) { - if (narg == 2) error->all(FLERR,"Illegal 'kim query' command.\nThe 'list' " - "keyword must be followed by ('split' " - "and) the name of the query function"); - arg++; - narg--; + // the result is returned as a space-separated list of values in a variable + } else format_arg = "list"; + + std::string query_function{arg[1]}; + if (query_function == "split" || query_function == "list" || + query_function == "index") { + auto msg = fmt::format("Illegal 'kim query' command.\nThe '{}' keyword " + "can not be used after '{}'", query_function, format_arg); + error->all(FLERR, msg); } - char *function = arg[1]; - for (int i = 2; i < narg; ++i) { - if (strncmp("model=",arg[i],6) == 0) - error->all(FLERR,"Illegal 'model' key in 'kim query' command"); + std::string model_name; - if (!strchr(arg[i], '=') || !strchr(arg[i], '[') || !strchr(arg[i], ']')) - error->all(FLERR,fmt::format("Illegal query format.\nInput argument of " - "`{}` to 'kim query' is wrong. The query " - "format is the keyword=[value], where value " - "is always an array of one or more " - "comma-separated items", arg[i])); + // check the query_args format (a series of keyword=value pairs) + for (int i = 2; i < narg; ++i) { + if (!strchr(arg[i], '=') || !strchr(arg[i], '[') || !strchr(arg[i], ']')) { + auto msg = fmt::format("Illegal query format.\nInput argument " + "of `{}` to 'kim query' is wrong. 
The query format is the " + "keyword=[value], where value is always an array of one or " + "more comma-separated items", arg[i]); + error->all(FLERR, msg); + } + } + + if (query_function != "get_available_models") { + for (int i = 2; i < narg; ++i) { + // check if the model is specified as an argument + if (strncmp("model=", arg[i], 6) == 0) { + ValueTokenizer values(arg[i], "=[]"); + std::string key = values.next_string(); + model_name = values.next_string(); + break; + } + } + // if the model name is not provided by the user + if (model_name.empty()) { + // check if we had a kim init command by finding fix STORE/KIM + const int ifix = modify->find_fix("KIM_MODEL_STORE"); + if (ifix >= 0) { + FixStoreKIM *fix_store = (FixStoreKIM *) modify->fix[ifix]; + char *model_name_c = (char *) fix_store->getptr("model_name"); + model_name = fmt::format("{}", model_name_c); + } else { + auto msg = fmt::format("Illegal query format.\nMust use 'kim init' " + "before 'kim query' or must provide the model name after query " + "function with the format of 'model=[model_name]'"); + error->all(FLERR, msg); + } + } } #if defined(LMP_KIM_CURL) - - char *value = do_query(function, model_name, narg-2, arg+2, comm->me, world); + char *value = do_query(query_function, model_name, + narg - 2, arg + 2, comm->me, world); // check for valid result - // on error the content of "value" is a '\0' byte - // as the first element, and then the error message - // that was returned by the web server + // on error the content of "value" is a '\0' byte as the first element, + // and then the error message that was returned by the web server if (strlen(value) == 0) { - error->all(FLERR,fmt::format("OpenKIM query failed: {}", value+1)); - } else if (strcmp(value,"EMPTY") == 0) { - error->all(FLERR,fmt::format("OpenKIM query returned no results")); + auto msg = fmt::format("OpenKIM query failed: {}", value + 1); + delete [] value; + error->all(FLERR, msg); + } else if (strcmp(value, "EMPTY") == 0) { + delete [] value; + error->all(FLERR, fmt::format("OpenKIM query returned no results")); } - input->write_echo("#=== BEGIN kim query ==================================" + input->write_echo("#=== BEGIN kim-query ==================================" "=======\n"); ValueTokenizer values(value, ","); - if (split) { + if (format_arg == "split") { int counter = 1; while (values.has_next()) { auto svalue = values.next_string(); - auto setcmd = fmt::format("{}_{} string {}", varname, counter++, svalue); + auto setcmd = fmt::format("{}_{} string {}", var_name, counter++, svalue); input->variable->set(setcmd); input->write_echo(fmt::format("variable {}\n", setcmd)); } } else { - auto svalue = values.next_string(); - std::string setcmd = fmt::format("{} string \"{}", varname, svalue); - while (values.has_next()) { - svalue = values.next_string(); - setcmd += fmt::format(" {}", svalue); + std::string setcmd; + auto svalue = utils::trim(values.next_string()); + if (format_arg == "list") { + setcmd = fmt::format("{} string \"", var_name); + setcmd += (svalue.front() == '"' && svalue.back() == '"') + ? fmt::format("{}", svalue.substr(1, svalue.size() - 2)) + : fmt::format("{}", svalue); + while (values.has_next()) { + svalue = utils::trim(values.next_string()); + setcmd += (svalue.front() == '"' && svalue.back() == '"') + ? 
fmt::format(" {}", svalue.substr(1, svalue.size() - 2)) + : fmt::format(" {}", svalue); + } + setcmd += "\""; + } else { + // format_arg == "index" + setcmd = fmt::format("{} index {}", var_name, svalue); + while (values.has_next()) { + svalue = values.next_string(); + setcmd += fmt::format(" {}", svalue); + } } - setcmd += "\""; input->variable->set(setcmd); input->write_echo(fmt::format("variable {}\n", setcmd)); } - input->write_echo("#=== END kim query ====================================" + input->write_echo("#=== END kim-query ====================================" "=======\n\n"); - delete[] value; + delete [] value; #else - error->all(FLERR,"Cannot use 'kim query' command when KIM package " - "is compiled without support for libcurl"); + error->all(FLERR, "Cannot use 'kim query' command when KIM package " + "is compiled without support for libcurl"); #endif } #if defined(LMP_KIM_CURL) - +namespace { // copy data to the user provided data structure, optionally in increments - size_t write_callback(void *data, size_t size, size_t nmemb, void *userp) { - struct WriteBuf *buf = (struct WriteBuf *)userp; + WriteBuf *buf = (WriteBuf *) userp; // copy chunks into the buffer for as long as there is space left if (buf->sizeleft) { @@ -211,25 +247,23 @@ size_t write_callback(void *data, size_t size, size_t nmemb, void *userp) buf->dataptr += copy_this_much; buf->sizeleft -= copy_this_much; - return copy_this_much; } return 0; // done } -char *do_query(char *qfunction, char * model_name, int narg, char **arg, - int rank, MPI_Comm comm) +char *do_query(const std::string &qfunction, const std::string &mname, + int narg, char **arg, int rank, MPI_Comm comm) { - char value[512]; + char value[kBufSize]; // run the web query from rank 0 only - if (rank == 0) { // set up and clear receive buffer - struct WriteBuf buf; + WriteBuf buf; buf.dataptr = value; - buf.sizeleft = 511; - memset(value,0,512); + buf.sizeleft = kBufSize - 1; + memset(value, 0, kBufSize); // create curl web query instance curl_global_init(CURL_GLOBAL_DEFAULT); @@ -237,17 +271,21 @@ char *do_query(char *qfunction, char * model_name, int narg, char **arg, if (handle) { auto url = fmt::format("https://query.openkim.org/api/{}", qfunction); - auto query = fmt::format("model=[\"{}\"]", model_name); + auto query = mname.empty() + ? fmt::format("") + : (mname.front() == '"' && mname.back() == '"') + ? 
fmt::format("model=[{}]", mname) + : fmt::format("model=[\"{}\"]", mname); for (int i = 0; i < narg; ++i) { ValueTokenizer values(arg[i], "=[]"); std::string key = values.next_string(); + if (key == "model") continue; std::string val = values.next_string(); std::string::size_type n = val.find(","); if (n == std::string::npos) { if (utils::is_integer(val) || utils::is_double(val) || - (val.front() == '"' && - val.back() == '"')) { + (val.front() == '"' && val.back() == '"')) { query += fmt::format("&{}", arg[i]); } else { query += fmt::format("&{}=[\"{}\"]", key, val); @@ -258,8 +296,7 @@ char *do_query(char *qfunction, char * model_name, int narg, char **arg, std::string sval = val.substr(0, n); if (utils::is_integer(sval) || utils::is_double(sval) || - (val.front() == '"' && - val.back() == '"')) { + (val.front() == '"' && val.back() == '"')) { query += fmt::format("{},", sval); } else { query += fmt::format("\"{}\",", sval); @@ -267,8 +304,11 @@ char *do_query(char *qfunction, char * model_name, int narg, char **arg, val = val.substr(n + 1); n = val.find(","); } - if (val.size()) query += fmt::format("\"{}\"]", val); - else query[query.size() - 1]=']'; + if (val.size()) { + query += (val.front() == '"' && val.back() == '"') + ? fmt::format("{}]", val) + : fmt::format("\"{}\"]", val); + } else query.back() = ']'; } } @@ -294,28 +334,28 @@ char *do_query(char *qfunction, char * model_name, int narg, char **arg, } } - std::string user_agent = fmt::format("kim query--LAMMPS/{} ({})", - LAMMPS_VERSION, Info::get_os_info()); + auto user_agent = fmt::format("kim query--LAMMPS/{} ({})", + LAMMPS_VERSION, Info::get_os_info()); curl_easy_setopt(handle, CURLOPT_USERAGENT, user_agent.c_str()); curl_easy_setopt(handle, CURLOPT_URL, url.c_str()); curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(handle, CURLOPT_POSTFIELDS, query.c_str()); curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, write_callback); - curl_easy_setopt(handle, CURLOPT_WRITEDATA,&buf); + curl_easy_setopt(handle, CURLOPT_WRITEDATA, &buf); // perform OpenKIM query and check for errors CURLcode res = curl_easy_perform(handle); if (res != CURLE_OK) { // on error we return an "empty" string but add error message after it value[0] = '\0'; - strcpy(value+1,curl_easy_strerror(res)); + strcpy(value + 1, curl_easy_strerror(res)); } curl_easy_cleanup(handle); } curl_global_cleanup(); } - MPI_Bcast(value, 512, MPI_CHAR, 0, comm); + MPI_Bcast(value, kBufSize, MPI_CHAR, 0, comm); // we must make a proper copy of the query, as the stack allocation // for "value" will go out of scope. a valid query has a '[' as @@ -330,27 +370,28 @@ char *do_query(char *qfunction, char * model_name, int narg, char **arg, if (value[len] == ']') { value[len] = '\0'; retval = new char[len]; - if (strcmp(value+1, "") == 0) strcpy(retval,"EMPTY"); - else strcpy(retval,value+1); + if (strcmp(value + 1, "") == 0) strcpy(retval, "EMPTY"); + else strcpy(retval, value + 1); } else { - retval = new char[len+2]; + retval = new char[len + 2]; retval[0] = '\0'; - strcpy(retval+1,value); + strcpy(retval + 1, value); } // an error message starts with a '\0' character } else if (value[0] == '\0') { - int len = strlen(value+1)+2; + int len = strlen(value + 1) + 2; retval = new char[len]; retval[0] = '\0'; - strcpy(retval+1,value+1); + strcpy(retval + 1, value + 1); // unknown response type. we should not get here. 
} else { // we return an "empty" string but add error message after it - int len = strlen(value)+2; + int len = strlen(value) + 2; retval = new char[len]; retval[0] = '\0'; - strcpy(retval+1,value); + strcpy(retval + 1, value); } return retval; } +} // namespace #endif From 90e748aa5cf7fda58c57036a5cbfde1f53a93c78 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 06:03:45 -0600 Subject: [PATCH 023/116] Update the 'kim query' unittest Update the unittest with the latest extension. Replace the discontinued model of `EAM_Dynamo_Mendelev_2007_Zr__MO_848899341753_000` and replace it with the identical version, `EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000` --- unittest/commands/test_kim_commands.cpp | 166 ++++++++++++++++++------ 1 file changed, 127 insertions(+), 39 deletions(-) diff --git a/unittest/commands/test_kim_commands.cpp b/unittest/commands/test_kim_commands.cpp index 5ea458de59..9d02cdb74c 100644 --- a/unittest/commands/test_kim_commands.cpp +++ b/unittest/commands/test_kim_commands.cpp @@ -38,7 +38,6 @@ const bool have_openmpi = false; using LAMMPS_NS::utils::split_words; namespace LAMMPS_NS { -using ::testing::ExitedWithCode; using ::testing::MatchesRegex; using ::testing::StrEq; @@ -401,26 +400,33 @@ TEST_F(KimCommandsTest, kim_query) TEST_FAILURE(".*ERROR: Illegal 'kim query' command.*", lmp->input->one("kim query");); - TEST_FAILURE(".*ERROR: Must use 'kim init' before 'kim query'.*", - lmp->input->one("kim query a0 get_lattice_constant_cubic");); - - if (!verbose) ::testing::internal::CaptureStdout(); - lmp->input->one("clear"); - lmp->input->one("kim init LennardJones612_UniversalShifted__MO_959249795837_003 real"); - if (!verbose) ::testing::internal::GetCapturedStdout(); - TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe keyword 'split' " "must be followed by the name of the query function.*", lmp->input->one("kim query a0 split");); + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe keyword 'list' " + "must be followed by the name of the query function.*", + lmp->input->one("kim query a0 list");); + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe keyword 'index' " + "must be followed by the name of the query function.*", + lmp->input->one("kim query a0 index");); TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe 'list' keyword " "can not be used after 'split'.*", lmp->input->one("kim query a0 split list");); + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe 'index' keyword " + "can not be used after 'split'.*", + lmp->input->one("kim query a0 split index");); + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe 'split' keyword " + "can not be used after 'list'.*", + lmp->input->one("kim query a0 list split");); + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe 'index' keyword " + "can not be used after 'list'.*", + lmp->input->one("kim query a0 list index");); TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe 'list' keyword " - "must be followed by \\('split' and\\) the name of the query " - "function.*", lmp->input->one("kim query a0 list");); - TEST_FAILURE(".*ERROR: Illegal 'model' key in 'kim query' command.*", - lmp->input->one("kim query a0 get_lattice_constant_cubic " - "model=[MO_959249795837_003]");); + "can not be used after 'index'.*", + lmp->input->one("kim query a0 index list");); + TEST_FAILURE(".*ERROR: Illegal 'kim query' command.\nThe 'split' keyword " + "can not be used after 'index'.*", + lmp->input->one("kim query a0 index split");); TEST_FAILURE(".*ERROR: 
Illegal query format.\nInput argument of `crystal` " "to 'kim query' is wrong. The query format is the " "keyword=\\[value\\], where value is always an array of one " @@ -428,9 +434,9 @@ TEST_F(KimCommandsTest, kim_query) lmp->input->one("kim query a0 get_lattice_constant_cubic " "crystal");); TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `" - "crystal=fcc` to 'kim query' is wrong. The query format is " - "the keyword=\\[value\\], where value is always an array of " - "one or more comma-separated items.*", + "crystal=fcc` to 'kim query' is wrong. The query format is the " + "keyword=\\[value\\], where value is always an array of one " + "or more comma-separated items.*", lmp->input->one("kim query a0 get_lattice_constant_cubic " "crystal=fcc");); TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `" @@ -448,46 +454,111 @@ TEST_F(KimCommandsTest, kim_query) std::string squery("kim query a0 get_lattice_constant_cubic "); squery += "crystal=[\"fcc\"] species=\"Al\",\"Ni\" units=[\"angstrom\"]"; - TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `species=" + TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `species=" "\"Al\",\"Ni\"` to 'kim query' is wrong. The query format is " "the keyword=\\[value\\], where value is always an array of " "one or more comma-separated items.*", lmp->input->one(squery);); squery = "kim query a0 get_lattice_constant_cubic "; - squery += "crystal=[\"fcc\"] species=\"Al\",\"Ni\", units=[\"angstrom\"]"; - TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `species=" - "\"Al\",\"Ni\",` to 'kim query' is wrong. The query format is " + squery += "crystal=[fcc] species=Al,Ni units=[angstrom]"; + TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `species=" + "Al,Ni` to 'kim query' is wrong. The query format is " "the keyword=\\[value\\], where value is always an array of " "one or more comma-separated items.*", lmp->input->one(squery);); - squery = "kim query a0 get_lattice_constant_cubic crystal=[fcc] " - "species=[Al]"; - TEST_FAILURE(".*ERROR: OpenKIM query failed:.*", lmp->input->one(squery);); + squery = "kim query a0 get_lattice_constant_cubic "; + squery += "crystal=[fcc] species=Al,Ni, units=[angstrom]"; + TEST_FAILURE(".*ERROR: Illegal query format.\nInput argument of `species=" + "Al,Ni,` to 'kim query' is wrong. 
The query format is " + "the keyword=\\[value\\], where value is always an array of " + "one or more comma-separated items.*", + lmp->input->one(squery);); + + squery = "kim query a0 get_lattice_constant_cubic crystal=[\"fcc\"] " + "species=[\"Al\"]"; + TEST_FAILURE(".*ERROR: Illegal query format.\nMust use 'kim init' before " + "'kim query' or must provide the model name after query " + "function with the format of 'model=\\[model_name\\]'.*", + lmp->input->one(squery);); squery = "kim query a0 get_lattice_constant_cubic crystal=[fcc] " - "units=[\"angstrom\"]"; - TEST_FAILURE(".*ERROR: OpenKIM query failed:.*", lmp->input->one(squery);); + "species=[Al]"; + TEST_FAILURE(".*ERROR: Illegal query format.\nMust use 'kim init' before " + "'kim query' or must provide the model name after query " + "function with the format of 'model=\\[model_name\\]'.*", + lmp->input->one(squery);); + + squery = "kim query a0 get_lattice_constant_cubic crystal=[\"fcc\"] " + "species=[Al]"; + TEST_FAILURE(".*ERROR: Illegal query format.\nMust use 'kim init' before " + "'kim query' or must provide the model name after query " + "function with the format of 'model=\\[model_name\\]'.*", + lmp->input->one(squery);); #if defined(KIM_EXTRA_UNITTESTS) if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim init EAM_Dynamo_Mendelev_2007_Zr__MO_848899341753_000 metal"); - squery = "kim query latconst split get_lattice_constant_hexagonal "; - squery += "crystal=[\"hcp\"] species=[\"Zr\"] units=[\"angstrom\"]"; + squery = "kim query latconst_1 get_lattice_constant_cubic "; + squery += "crystal=[fcc] species=[Al] units=[angstrom] "; + squery += "model=[EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005]"; lmp->input->one(squery); if (!verbose) ::testing::internal::GetCapturedStdout(); ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_1")) == - std::string("3.234055244384789"))); - ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_2")) == - std::string("5.167650199630013"))); + "4.032082033157349")); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); - lmp->input->one("kim init EAM_Dynamo_Mendelev_2007_Zr__MO_848899341753_000 metal"); + lmp->input->one("kim init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal"); + + squery = "kim query latconst_1 get_lattice_constant_cubic "; + squery += "crystal=[fcc] species=[Al] units=[angstrom]"; + lmp->input->one(squery); + + squery = "kim query latconst_2 get_lattice_constant_cubic "; + squery += "crystal=[fcc] species=[Al] units=[angstrom] "; + squery += "model=[LennardJones612_UniversalShifted__MO_959249795837_003]"; + lmp->input->one(squery); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_1")) == + "4.032082033157349")); + ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_2")) == + "3.328125931322575")); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("clear"); + lmp->input->one("kim init EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000 metal"); + + squery = "kim query latconst split get_lattice_constant_hexagonal "; + squery += "crystal=[hcp] species=[Zr] units=[angstrom]"; + lmp->input->one(squery); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_1")) == + "3.234055244384789")); + ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst_2")) == 
+ "5.167650199630013")); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("clear"); + + squery = "kim query latconst index get_lattice_constant_hexagonal "; + squery += "crystal=[hcp] species=[Zr] units=[angstrom] "; + squery += "model=[EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000]"; + lmp->input->one(squery); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst")) == + "3.234055244384789")); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("variable latconst delete"); + lmp->input->one("clear"); + lmp->input->one("kim init EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000 metal"); squery = "kim query latconst list get_lattice_constant_hexagonal "; squery += "crystal=[hcp] species=[Zr] units=[angstrom]"; @@ -495,11 +566,7 @@ TEST_F(KimCommandsTest, kim_query) if (!verbose) ::testing::internal::GetCapturedStdout(); ASSERT_TRUE((std::string(lmp->input->variable->retrieve("latconst")) == - std::string("3.234055244384789 5.167650199630013"))); - - squery = "kim query latconst list get_lattice_constant_hexagonal "; - squery += "crystal=[bcc] species=[Zr] units=[angstrom]"; - TEST_FAILURE(".*ERROR: OpenKIM query failed:.*", lmp->input->one(squery);); + "3.234055244384789 5.167650199630013")); if (!verbose) ::testing::internal::CaptureStdout(); lmp->input->one("clear"); @@ -512,7 +579,28 @@ TEST_F(KimCommandsTest, kim_query) if (!verbose) ::testing::internal::GetCapturedStdout(); ASSERT_TRUE((std::string(lmp->input->variable->retrieve("alpha")) == - std::string("1.654960564704273e-05"))); + "1.654960564704273e-05")); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("clear"); + + squery = "kim query model_list list get_available_models "; + squery += "species=[Al]"; + lmp->input->one(squery); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + std::string model_list = lmp->input->variable->retrieve("model_list"); + auto n = model_list.find("EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005"); + ASSERT_TRUE(n != std::string::npos); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("clear"); + + squery = "kim query model_name index get_available_models "; + squery += "species=[Al]"; + lmp->input->one(squery); + lmp->input->one("variable model_name delete"); + if (!verbose) ::testing::internal::GetCapturedStdout(); #endif } } // namespace LAMMPS_NS From 8f55701da85bc28794412390a65eb3a06df72d28 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Fri, 12 Feb 2021 06:05:41 -0600 Subject: [PATCH 024/116] adding 'kim query' command examples --- examples/kim/in.kim-query | 76 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 examples/kim/in.kim-query diff --git a/examples/kim/in.kim-query b/examples/kim/in.kim-query new file mode 100644 index 0000000000..a0d1379372 --- /dev/null +++ b/examples/kim/in.kim-query @@ -0,0 +1,76 @@ +# kim query example +# +# Requirement: +# +# This example requires LAMMPS to be built with the KIM package. A requirement +# of the KIM package is the KIM API library, which must be downloaded from the +# OpenKIM website and installed before LAMMPS is compiled. The 'kim query' +# command requires the libcurl library to be installed.
See the +# `https://lammps.sandia.gov/doc/Build_extras.html#kim` doc page for further +# details. +# +# This example requires that the KIM Models +# `EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005` +# and +# `EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000` +# are installed. +# +# This can be done with the commands +# `kim-api-collections-management install user `EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005` +# `kim-api-collections-management install user `EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000` +# +# If these commands do not work, you may need to set up your PATH to find the utility. +# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should replace X.Y.Z with the appropriate kim-api version number). +# +# Or, see https://openkim.org/doc/obtaining-models for alternative options. +# + +# ----------------------------------------------- +# Get an equilibrium fcc crystal lattice constant +# ----------------------------------------------- +kim init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal +kim query latconst_1 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] +print "FCC lattice constant (EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005) = ${latconst_1}" +# Get the lattice constant from a different model +kim query latconst_2 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005] +print "FCC lattice constant (EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005) = ${latconst_2}" +clear + + +# ----------------------------------------------- +# Get an equilibrium fcc crystal lattice constant +# ----------------------------------------------- +kim query latconst_1 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005] +kim query latconst_2 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005] +print "FCC lattice constant (EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005) = ${latconst_1}" +print "FCC lattice constant (EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005) = ${latconst_2}" +clear + + +# ----------------------------------------------- +# Get an equilibrium hcp crystal lattice constant +# ----------------------------------------------- +kim init EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000 metal +kim query latconst split get_lattice_constant_hexagonal crystal=["hcp"] species=["Zr"] units=["angstrom"] +print "HCP lattice constants = ${latconst_1}, ${latconst_2}" +clear + + +# ----------------------------------------------- +# Query for KIM models from openkim.org +# Get all the EAM models that support Al +# ----------------------------------------------- +kim query model index get_available_models species=[Al] potential_type=[eam] +label model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +print "FCC lattice constant (${model}) = ${latconst}" +next model
+jump SELF model_loop +clear + From 2aa326c8273c144d0bb0f8aaee27ac432db3dc7d Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 12 Feb 2021 10:56:48 -0500 Subject: [PATCH 025/116] bond/react: same-type initiators fix previously, if/when a reaction happens could depend on the order of listed initiator atoms, if they have the same type, in some cases --- src/USER-REACTION/fix_bond_react.cpp | 17 +++++++++++------ src/USER-REACTION/fix_bond_react.h | 0 2 files changed, 11 insertions(+), 6 deletions(-) mode change 100644 => 100755 src/USER-REACTION/fix_bond_react.cpp mode change 100644 => 100755 src/USER-REACTION/fix_bond_react.h diff --git a/src/USER-REACTION/fix_bond_react.cpp b/src/USER-REACTION/fix_bond_react.cpp old mode 100644 new mode 100755 index 3098a1bd67..c4ffbea4fc --- a/src/USER-REACTION/fix_bond_react.cpp +++ b/src/USER-REACTION/fix_bond_react.cpp @@ -1031,23 +1031,28 @@ void FixBondReact::post_integrate() if (finalpartner[i] == 0) continue; j = atom->map(finalpartner[i]); - // if (j < 0 || tag[i] < tag[j]) { - if (tag[i] < tag[j]) { //atom->map(std::min(tag[i],tag[j])) <= nlocal && - if (nattempt[rxnID] == maxattempt) { + if (tag[i] < tag[j]) { + if (nattempt[rxnID] > maxattempt-2) { maxattempt += DELTA; - // third column of 'attempt': bond/react integer ID + // third dim of 'attempt': bond/react integer ID memory->grow(attempt,maxattempt,2,nreacts,"bond/react:attempt"); } // to ensure types remain in same order - // unnecessary now taken from reaction map file if (iatomtype[rxnID] == type[i]) { attempt[nattempt[rxnID]][0][rxnID] = tag[i]; attempt[nattempt[rxnID]][1][rxnID] = finalpartner[i]; + nattempt[rxnID]++; + // add another attempt if initiator atoms are same type + if (iatomtype[rxnID] == jatomtype[rxnID]) { + attempt[nattempt[rxnID]][0][rxnID] = finalpartner[i]; + attempt[nattempt[rxnID]][1][rxnID] = tag[i]; + nattempt[rxnID]++; + } } else { attempt[nattempt[rxnID]][0][rxnID] = finalpartner[i]; attempt[nattempt[rxnID]][1][rxnID] = tag[i]; + nattempt[rxnID]++; } - nattempt[rxnID]++; } } } diff --git a/src/USER-REACTION/fix_bond_react.h b/src/USER-REACTION/fix_bond_react.h old mode 100644 new mode 100755 From a60853cca6c85e0dc6f5626c1d8d47f9b41c5a9d Mon Sep 17 00:00:00 2001 From: jrgissing Date: Sat, 13 Feb 2021 14:39:28 -0500 Subject: [PATCH 026/116] memory leak introduced in recent 'create atoms' feature --- src/USER-REACTION/fix_bond_react.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/USER-REACTION/fix_bond_react.cpp b/src/USER-REACTION/fix_bond_react.cpp index c4ffbea4fc..1ec29efacd 100755 --- a/src/USER-REACTION/fix_bond_react.cpp +++ b/src/USER-REACTION/fix_bond_react.cpp @@ -2710,7 +2710,7 @@ update molecule IDs, charges, types, special lists and all topology void FixBondReact::update_everything() { - int nlocal; // must be defined after create_atoms + int nlocal = atom->nlocal; // must be redefined after create atoms int *type = atom->type; int **nspecial = atom->nspecial; tagint **special = atom->special; @@ -2722,6 +2722,9 @@ void FixBondReact::update_everything() // used when deleting atoms int ndel,ndelone; int *mark; + int nmark = nlocal; + memory->create(mark,nmark,"bond/react:mark"); + for (int i = 0; i < nmark; i++) mark[i] = 0; tagint *tag = atom->tag; AtomVec *avec = atom->avec; @@ -2783,8 +2786,11 @@ void FixBondReact::update_everything() // mark to-delete atoms nlocal = atom->nlocal; - mark = new int[nlocal]; - for (int i = 0; i < nlocal; i++) mark[i] = 0; + if (nlocal > nmark) { + 
memory->grow(mark,nlocal,"bond/react:mark"); + for (int i = nmark; i < nlocal; i++) mark[i] = 0; + nmark = nlocal; + } for (int i = 0; i < update_num_mega; i++) { rxnID = update_mega_glove[0][i]; onemol = atom->molecules[unreacted_mol[rxnID]]; @@ -3233,7 +3239,7 @@ void FixBondReact::update_everything() } } } - delete [] mark; + memory->destroy(mark); MPI_Allreduce(&ndelone,&ndel,1,MPI_INT,MPI_SUM,world); From 91f74cf9569f796dad61ec0e5eadb70c492ffac0 Mon Sep 17 00:00:00 2001 From: jrgissing Date: Sat, 13 Feb 2021 20:48:31 +0000 Subject: [PATCH 027/116] permissions! --- src/USER-REACTION/fix_bond_react.cpp | 0 src/USER-REACTION/fix_bond_react.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 src/USER-REACTION/fix_bond_react.cpp mode change 100755 => 100644 src/USER-REACTION/fix_bond_react.h diff --git a/src/USER-REACTION/fix_bond_react.cpp b/src/USER-REACTION/fix_bond_react.cpp old mode 100755 new mode 100644 diff --git a/src/USER-REACTION/fix_bond_react.h b/src/USER-REACTION/fix_bond_react.h old mode 100755 new mode 100644 From e7e2d2323be2885eb2e496f08ff6fb6de47ab1f7 Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 15 Feb 2021 08:20:50 -0800 Subject: [PATCH 028/116] Feb2021 GPU Package Update - GPU Package Files --- lib/gpu/Makefile.cuda_mps | 2 +- lib/gpu/Makefile.hip | 2 +- lib/gpu/Makefile.lammps.mac_ocl | 2 +- lib/gpu/Makefile.linux_opencl | 15 +- lib/gpu/Makefile.mac_opencl | 10 +- lib/gpu/Makefile.mac_opencl_mpi | 23 + lib/gpu/Makefile.oneapi | 26 + lib/gpu/Nvidia.makefile | 13 +- lib/gpu/Opencl.makefile | 78 +- lib/gpu/README | 361 ++++--- lib/gpu/geryon/hip_device.h | 44 +- lib/gpu/geryon/hip_kernel.h | 4 +- lib/gpu/geryon/hip_texture.h | 31 + lib/gpu/geryon/nvd_device.h | 44 +- lib/gpu/geryon/nvd_kernel.h | 24 +- lib/gpu/geryon/nvd_texture.h | 42 +- lib/gpu/geryon/ocl_device.h | 320 +++++- lib/gpu/geryon/ocl_kernel.h | 115 +- lib/gpu/geryon/ocl_macros.h | 8 - lib/gpu/geryon/ocl_memory.h | 50 +- lib/gpu/geryon/ocl_texture.h | 53 + lib/gpu/geryon/ocl_timer.h | 7 +- lib/gpu/geryon/ucl_basemat.h | 10 +- lib/gpu/geryon/ucl_d_vec.h | 2 +- lib/gpu/geryon/ucl_get_devices.cpp | 4 +- lib/gpu/geryon/ucl_h_mat.h | 2 +- lib/gpu/geryon/ucl_h_vec.h | 8 +- lib/gpu/geryon/ucl_vector.h | 4 +- lib/gpu/lal_answer.cpp | 173 ++- lib/gpu/lal_answer.h | 16 +- lib/gpu/lal_atom.cpp | 4 +- lib/gpu/lal_atom.h | 75 +- lib/gpu/lal_aux_fun1.h | 599 ++++++++--- lib/gpu/lal_base_atomic.cpp | 137 ++- lib/gpu/lal_base_atomic.h | 25 +- lib/gpu/lal_base_charge.cpp | 128 ++- lib/gpu/lal_base_charge.h | 16 +- lib/gpu/lal_base_dipole.cpp | 105 +- lib/gpu/lal_base_dipole.h | 17 +- lib/gpu/lal_base_dpd.cpp | 134 ++- lib/gpu/lal_base_dpd.h | 23 +- lib/gpu/lal_base_ellipsoid.cpp | 153 ++- lib/gpu/lal_base_ellipsoid.h | 41 +- lib/gpu/lal_base_three.cpp | 220 ++-- lib/gpu/lal_base_three.h | 58 +- lib/gpu/lal_beck.cpp | 18 +- lib/gpu/lal_beck.cu | 42 +- lib/gpu/lal_beck.h | 2 +- lib/gpu/lal_beck_ext.cpp | 4 +- lib/gpu/lal_born.cpp | 18 +- lib/gpu/lal_born.cu | 44 +- lib/gpu/lal_born.h | 2 +- lib/gpu/lal_born_coul_long.cpp | 18 +- lib/gpu/lal_born_coul_long.cu | 48 +- lib/gpu/lal_born_coul_long.h | 2 +- lib/gpu/lal_born_coul_long_cs.cu | 48 +- lib/gpu/lal_born_coul_long_cs_ext.cpp | 4 +- lib/gpu/lal_born_coul_long_ext.cpp | 4 +- lib/gpu/lal_born_coul_wolf.cpp | 18 +- lib/gpu/lal_born_coul_wolf.cu | 53 +- lib/gpu/lal_born_coul_wolf.h | 2 +- lib/gpu/lal_born_coul_wolf_cs.cu | 52 +- lib/gpu/lal_born_coul_wolf_cs_ext.cpp | 4 +- lib/gpu/lal_born_coul_wolf_ext.cpp | 4 +- 
lib/gpu/lal_born_ext.cpp | 4 +- lib/gpu/lal_buck.cpp | 18 +- lib/gpu/lal_buck.cu | 44 +- lib/gpu/lal_buck.h | 2 +- lib/gpu/lal_buck_coul.cpp | 18 +- lib/gpu/lal_buck_coul.cu | 48 +- lib/gpu/lal_buck_coul.h | 2 +- lib/gpu/lal_buck_coul_ext.cpp | 4 +- lib/gpu/lal_buck_coul_long.cpp | 18 +- lib/gpu/lal_buck_coul_long.cu | 48 +- lib/gpu/lal_buck_coul_long.h | 2 +- lib/gpu/lal_buck_coul_long_ext.cpp | 4 +- lib/gpu/lal_buck_ext.cpp | 4 +- lib/gpu/lal_charmm.cpp | 166 +++ lib/gpu/lal_charmm.cu | 303 ++++++ lib/gpu/lal_charmm.h | 89 ++ lib/gpu/lal_charmm_ext.cpp | 137 +++ lib/gpu/lal_charmm_long.cpp | 18 +- lib/gpu/lal_charmm_long.cu | 47 +- lib/gpu/lal_charmm_long.h | 2 +- lib/gpu/lal_charmm_long_ext.cpp | 4 +- lib/gpu/lal_colloid.cpp | 18 +- lib/gpu/lal_colloid.cu | 44 +- lib/gpu/lal_colloid.h | 2 +- lib/gpu/lal_colloid_ext.cpp | 4 +- lib/gpu/lal_coul.cpp | 18 +- lib/gpu/lal_coul.cu | 46 +- lib/gpu/lal_coul.h | 2 +- lib/gpu/lal_coul_debye.cpp | 18 +- lib/gpu/lal_coul_debye.cu | 47 +- lib/gpu/lal_coul_debye.h | 2 +- lib/gpu/lal_coul_debye_ext.cpp | 4 +- lib/gpu/lal_coul_dsf.cpp | 18 +- lib/gpu/lal_coul_dsf.cu | 51 +- lib/gpu/lal_coul_dsf.h | 2 +- lib/gpu/lal_coul_dsf_ext.cpp | 4 +- lib/gpu/lal_coul_ext.cpp | 4 +- lib/gpu/lal_coul_long.cpp | 18 +- lib/gpu/lal_coul_long.cu | 140 +-- lib/gpu/lal_coul_long.h | 2 +- lib/gpu/lal_coul_long_cs.cu | 141 +-- lib/gpu/lal_coul_long_cs_ext.cpp | 4 +- lib/gpu/lal_coul_long_ext.cpp | 4 +- lib/gpu/lal_device.cpp | 594 ++++++++--- lib/gpu/lal_device.cu | 42 +- lib/gpu/lal_device.h | 114 +- lib/gpu/lal_dipole_lj.cpp | 18 +- lib/gpu/lal_dipole_lj.cu | 297 ++++-- lib/gpu/lal_dipole_lj.h | 2 +- lib/gpu/lal_dipole_lj_ext.cpp | 4 +- lib/gpu/lal_dipole_lj_sf.cpp | 18 +- lib/gpu/lal_dipole_lj_sf.cu | 298 ++++-- lib/gpu/lal_dipole_lj_sf.h | 2 +- lib/gpu/lal_dipole_lj_sf_ext.cpp | 4 +- lib/gpu/lal_dipole_long_lj.cpp | 18 +- lib/gpu/lal_dipole_long_lj.cu | 297 ++++-- lib/gpu/lal_dipole_long_lj.h | 2 +- lib/gpu/lal_dipole_long_lj_ext.cpp | 4 +- lib/gpu/lal_dpd.cpp | 38 +- lib/gpu/lal_dpd.cu | 88 +- lib/gpu/lal_dpd.h | 2 +- lib/gpu/lal_dpd_ext.cpp | 4 +- lib/gpu/lal_eam.cpp | 89 +- lib/gpu/lal_eam.cu | 213 ++-- lib/gpu/lal_eam.h | 6 +- lib/gpu/lal_eam_alloy_ext.cpp | 4 +- lib/gpu/lal_eam_ext.cpp | 6 +- lib/gpu/lal_eam_fs_ext.cpp | 4 +- lib/gpu/lal_ellipsoid_extra.h | 122 ++- lib/gpu/lal_ellipsoid_nbor.cu | 27 +- lib/gpu/lal_gauss.cpp | 20 +- lib/gpu/lal_gauss.cu | 51 +- lib/gpu/lal_gauss.h | 2 +- lib/gpu/lal_gauss_ext.cpp | 4 +- lib/gpu/lal_gayberne.cpp | 36 +- lib/gpu/lal_gayberne.cu | 34 +- lib/gpu/lal_gayberne.h | 2 +- lib/gpu/lal_gayberne_lj.cu | 94 +- lib/gpu/lal_lj.cpp | 39 +- lib/gpu/lal_lj.cu | 96 +- lib/gpu/lal_lj.h | 2 +- lib/gpu/lal_lj96.cpp | 18 +- lib/gpu/lal_lj96.cu | 44 +- lib/gpu/lal_lj96.h | 2 +- lib/gpu/lal_lj96_ext.cpp | 4 +- lib/gpu/lal_lj_class2_long.cpp | 18 +- lib/gpu/lal_lj_class2_long.cu | 48 +- lib/gpu/lal_lj_class2_long.h | 2 +- lib/gpu/lal_lj_class2_long_ext.cpp | 4 +- lib/gpu/lal_lj_coul.cpp | 18 +- lib/gpu/lal_lj_coul.cu | 48 +- lib/gpu/lal_lj_coul.h | 2 +- lib/gpu/lal_lj_coul_debye.cpp | 18 +- lib/gpu/lal_lj_coul_debye.cu | 48 +- lib/gpu/lal_lj_coul_debye.h | 2 +- lib/gpu/lal_lj_coul_debye_ext.cpp | 4 +- lib/gpu/lal_lj_coul_ext.cpp | 4 +- lib/gpu/lal_lj_coul_long.cpp | 18 +- lib/gpu/lal_lj_coul_long.cu | 48 +- lib/gpu/lal_lj_coul_long.h | 2 +- lib/gpu/lal_lj_coul_long_ext.cpp | 4 +- lib/gpu/lal_lj_coul_msm.cpp | 18 +- lib/gpu/lal_lj_coul_msm.cu | 53 +- lib/gpu/lal_lj_coul_msm.h | 2 +- lib/gpu/lal_lj_coul_msm_ext.cpp | 4 +- 
lib/gpu/lal_lj_cubic.cpp | 18 +- lib/gpu/lal_lj_cubic.cu | 44 +- lib/gpu/lal_lj_cubic.h | 2 +- lib/gpu/lal_lj_cubic_ext.cpp | 4 +- lib/gpu/lal_lj_dsf.cpp | 18 +- lib/gpu/lal_lj_dsf.cu | 53 +- lib/gpu/lal_lj_dsf.h | 2 +- lib/gpu/lal_lj_dsf_ext.cpp | 4 +- lib/gpu/lal_lj_expand.cpp | 18 +- lib/gpu/lal_lj_expand.cu | 44 +- lib/gpu/lal_lj_expand.h | 2 +- lib/gpu/lal_lj_expand_coul_long.cpp | 18 +- lib/gpu/lal_lj_expand_coul_long.cu | 46 +- lib/gpu/lal_lj_expand_coul_long.h | 2 +- lib/gpu/lal_lj_expand_coul_long_ext.cpp | 4 +- lib/gpu/lal_lj_expand_ext.cpp | 4 +- lib/gpu/lal_lj_ext.cpp | 4 +- lib/gpu/lal_lj_gromacs.cpp | 18 +- lib/gpu/lal_lj_gromacs.cu | 43 +- lib/gpu/lal_lj_gromacs.h | 2 +- lib/gpu/lal_lj_gromacs_ext.cpp | 4 +- lib/gpu/lal_lj_sdk.cpp | 18 +- lib/gpu/lal_lj_sdk.cu | 45 +- lib/gpu/lal_lj_sdk.h | 2 +- lib/gpu/lal_lj_sdk_ext.cpp | 4 +- lib/gpu/lal_lj_sdk_long.cpp | 18 +- lib/gpu/lal_lj_sdk_long.cu | 47 +- lib/gpu/lal_lj_sdk_long.h | 2 +- lib/gpu/lal_lj_sdk_long_ext.cpp | 4 +- lib/gpu/lal_lj_tip4p_long.cpp | 93 +- lib/gpu/lal_lj_tip4p_long.cu | 192 ++-- lib/gpu/lal_lj_tip4p_long.h | 17 +- lib/gpu/lal_lj_tip4p_long_ext.cpp | 6 +- lib/gpu/lal_mie.cpp | 18 +- lib/gpu/lal_mie.cu | 43 +- lib/gpu/lal_mie.h | 2 +- lib/gpu/lal_mie_ext.cpp | 4 +- lib/gpu/lal_morse.cpp | 18 +- lib/gpu/lal_morse.cu | 45 +- lib/gpu/lal_morse.h | 2 +- lib/gpu/lal_morse_ext.cpp | 4 +- lib/gpu/lal_neighbor.cpp | 414 ++++++-- lib/gpu/lal_neighbor.h | 120 ++- lib/gpu/lal_neighbor_cpu.cu | 4 +- lib/gpu/lal_neighbor_gpu.cu | 238 ++++- lib/gpu/lal_neighbor_shared.cpp | 44 +- lib/gpu/lal_neighbor_shared.h | 40 + lib/gpu/lal_pppm.cpp | 20 +- lib/gpu/lal_pppm.cu | 3 + lib/gpu/lal_pppm_ext.cpp | 6 +- lib/gpu/lal_pre_cuda_hip.h | 355 +++++++ lib/gpu/lal_pre_ocl_config.h | 53 + lib/gpu/lal_precision.h | 63 +- lib/gpu/lal_preprocessor.h | 778 ++++---------- lib/gpu/lal_re_squared.cpp | 39 +- lib/gpu/lal_re_squared.cu | 42 +- lib/gpu/lal_re_squared.h | 2 +- lib/gpu/lal_re_squared_lj.cu | 232 ++-- lib/gpu/lal_soft.cpp | 18 +- lib/gpu/lal_soft.cu | 42 +- lib/gpu/lal_soft.h | 2 +- lib/gpu/lal_soft_ext.cpp | 4 +- lib/gpu/lal_sw.cpp | 266 ++--- lib/gpu/lal_sw.cu | 912 ++++++++-------- lib/gpu/lal_sw.h | 40 +- lib/gpu/lal_sw_ext.cpp | 35 +- lib/gpu/lal_table.cpp | 91 +- lib/gpu/lal_table.cu | 169 +-- lib/gpu/lal_table.h | 9 +- lib/gpu/lal_table_ext.cpp | 4 +- lib/gpu/lal_tersoff.cpp | 260 ++--- lib/gpu/lal_tersoff.cu | 1108 ++++++++++---------- lib/gpu/lal_tersoff.h | 27 +- lib/gpu/lal_tersoff_ext.cpp | 6 +- lib/gpu/lal_tersoff_extra.h | 238 ++--- lib/gpu/lal_tersoff_mod.cpp | 224 ++-- lib/gpu/lal_tersoff_mod.cu | 708 ++++++------- lib/gpu/lal_tersoff_mod.h | 10 +- lib/gpu/lal_tersoff_mod_ext.cpp | 6 +- lib/gpu/lal_tersoff_zbl.cpp | 237 +++-- lib/gpu/lal_tersoff_zbl.cu | 692 ++++++------ lib/gpu/lal_tersoff_zbl.h | 10 +- lib/gpu/lal_tersoff_zbl_ext.cpp | 6 +- lib/gpu/lal_ufm.cpp | 18 +- lib/gpu/lal_ufm.cu | 44 +- lib/gpu/lal_ufm.h | 2 +- lib/gpu/lal_ufm_ext.cpp | 4 +- lib/gpu/lal_vashishta.cpp | 102 +- lib/gpu/lal_vashishta.cu | 520 ++++----- lib/gpu/lal_vashishta.h | 7 +- lib/gpu/lal_vashishta_ext.cpp | 14 +- lib/gpu/lal_yukawa.cpp | 18 +- lib/gpu/lal_yukawa.cu | 42 +- lib/gpu/lal_yukawa.h | 2 +- lib/gpu/lal_yukawa_colloid.cpp | 67 +- lib/gpu/lal_yukawa_colloid.cu | 47 +- lib/gpu/lal_yukawa_colloid.h | 2 +- lib/gpu/lal_yukawa_colloid_ext.cpp | 4 +- lib/gpu/lal_yukawa_ext.cpp | 4 +- lib/gpu/lal_zbl.cpp | 18 +- lib/gpu/lal_zbl.cu | 42 +- lib/gpu/lal_zbl.h | 2 +- lib/gpu/lal_zbl_ext.cpp | 4 +- src/GPU/Install.sh | 13 + 
src/GPU/fix_gpu.cpp | 70 +- src/GPU/fix_gpu.h | 11 + src/GPU/fix_nh_gpu.cpp | 552 ++++++++++ src/GPU/fix_nh_gpu.h | 164 +++ src/GPU/fix_npt_gpu.cpp | 68 ++ src/GPU/fix_npt_gpu.h | 52 + src/GPU/fix_nve_asphere_gpu.cpp | 440 ++++++++ src/GPU/fix_nve_asphere_gpu.h | 63 ++ src/GPU/fix_nve_gpu.cpp | 291 +++++ src/GPU/fix_nve_gpu.h | 60 ++ src/GPU/fix_nvt_gpu.cpp | 50 + src/GPU/fix_nvt_gpu.h | 52 + src/GPU/gpu_extra.h | 41 +- src/GPU/pair_beck_gpu.cpp | 9 +- src/GPU/pair_born_coul_long_cs_gpu.cpp | 21 +- src/GPU/pair_born_coul_long_gpu.cpp | 3 +- src/GPU/pair_born_coul_wolf_cs_gpu.cpp | 39 +- src/GPU/pair_born_coul_wolf_gpu.cpp | 11 +- src/GPU/pair_born_gpu.cpp | 15 +- src/GPU/pair_buck_coul_cut_gpu.cpp | 3 +- src/GPU/pair_buck_coul_long_gpu.cpp | 3 +- src/GPU/pair_buck_gpu.cpp | 7 +- src/GPU/pair_colloid_gpu.cpp | 21 +- src/GPU/pair_coul_cut_gpu.cpp | 29 +- src/GPU/pair_coul_debye_gpu.cpp | 29 +- src/GPU/pair_coul_dsf_gpu.cpp | 9 +- src/GPU/pair_coul_long_cs_gpu.cpp | 37 +- src/GPU/pair_coul_long_gpu.cpp | 3 +- src/GPU/pair_dpd_gpu.cpp | 7 +- src/GPU/pair_dpd_tstat_gpu.cpp | 25 +- src/GPU/pair_eam_alloy_gpu.cpp | 33 +- src/GPU/pair_eam_fs_gpu.cpp | 26 +- src/GPU/pair_eam_gpu.cpp | 14 +- src/GPU/pair_gauss_gpu.cpp | 18 +- src/GPU/pair_gayberne_gpu.cpp | 9 +- src/GPU/pair_lj96_cut_gpu.cpp | 3 +- src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp | 309 ++++++ src/GPU/pair_lj_charmm_coul_charmm_gpu.h | 62 ++ src/GPU/pair_lj_charmm_coul_long_gpu.cpp | 3 +- src/GPU/pair_lj_class2_coul_long_gpu.cpp | 3 +- src/GPU/pair_lj_class2_gpu.cpp | 3 +- src/GPU/pair_lj_cubic_gpu.cpp | 25 +- src/GPU/pair_lj_cut_coul_cut_gpu.cpp | 11 +- src/GPU/pair_lj_cut_coul_debye_gpu.cpp | 21 +- src/GPU/pair_lj_cut_coul_dsf_gpu.cpp | 9 +- src/GPU/pair_lj_cut_coul_long_gpu.cpp | 7 +- src/GPU/pair_lj_cut_coul_msm_gpu.cpp | 15 +- src/GPU/pair_lj_cut_dipole_cut_gpu.cpp | 3 +- src/GPU/pair_lj_cut_dipole_long_gpu.cpp | 44 +- src/GPU/pair_lj_cut_gpu.cpp | 13 +- src/GPU/pair_lj_cut_tip4p_long_gpu.cpp | 3 +- src/GPU/pair_lj_expand_coul_long_gpu.cpp | 45 +- src/GPU/pair_lj_expand_gpu.cpp | 7 +- src/GPU/pair_lj_gromacs_gpu.cpp | 20 +- src/GPU/pair_lj_sdk_coul_long_gpu.cpp | 3 +- src/GPU/pair_lj_sdk_gpu.cpp | 3 +- src/GPU/pair_lj_sf_dipole_sf_gpu.cpp | 23 +- src/GPU/pair_mie_cut_gpu.cpp | 9 +- src/GPU/pair_morse_gpu.cpp | 9 +- src/GPU/pair_resquared_gpu.cpp | 17 +- src/GPU/pair_soft_gpu.cpp | 17 +- src/GPU/pair_sw_gpu.cpp | 146 +-- src/GPU/pair_table_gpu.cpp | 4 +- src/GPU/pair_tersoff_gpu.cpp | 6 +- src/GPU/pair_tersoff_mod_gpu.cpp | 13 +- src/GPU/pair_tersoff_zbl_gpu.cpp | 6 +- src/GPU/pair_ufm_gpu.cpp | 36 +- src/GPU/pair_vashishta_gpu.cpp | 48 +- src/GPU/pair_yukawa_colloid_gpu.cpp | 36 +- src/GPU/pair_yukawa_gpu.cpp | 11 +- src/GPU/pair_zbl_gpu.cpp | 9 +- src/GPU/pppm_gpu.cpp | 34 +- 345 files changed, 13424 insertions(+), 7708 deletions(-) create mode 100644 lib/gpu/Makefile.mac_opencl_mpi create mode 100644 lib/gpu/Makefile.oneapi create mode 100644 lib/gpu/lal_charmm.cpp create mode 100644 lib/gpu/lal_charmm.cu create mode 100644 lib/gpu/lal_charmm.h create mode 100644 lib/gpu/lal_charmm_ext.cpp create mode 100644 lib/gpu/lal_pre_cuda_hip.h create mode 100644 lib/gpu/lal_pre_ocl_config.h create mode 100644 src/GPU/fix_nh_gpu.cpp create mode 100644 src/GPU/fix_nh_gpu.h create mode 100644 src/GPU/fix_npt_gpu.cpp create mode 100644 src/GPU/fix_npt_gpu.h create mode 100644 src/GPU/fix_nve_asphere_gpu.cpp create mode 100644 src/GPU/fix_nve_asphere_gpu.h create mode 100644 src/GPU/fix_nve_gpu.cpp create mode 100644 src/GPU/fix_nve_gpu.h create 
mode 100644 src/GPU/fix_nvt_gpu.cpp create mode 100644 src/GPU/fix_nvt_gpu.h create mode 100644 src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp create mode 100644 src/GPU/pair_lj_charmm_coul_charmm_gpu.h diff --git a/lib/gpu/Makefile.cuda_mps b/lib/gpu/Makefile.cuda_mps index 172640ce6a..baffe99b47 100644 --- a/lib/gpu/Makefile.cuda_mps +++ b/lib/gpu/Makefile.cuda_mps @@ -51,7 +51,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c # host code compiler and settings -CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC +CUDR_CPP = mpicxx -fopenmp -fopenmp-simd -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC CUDR_OPTS = -O2 $(LMP_INC) CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \ $(CUDPP_OPT) diff --git a/lib/gpu/Makefile.hip b/lib/gpu/Makefile.hip index e2fd3c22d7..c34823d471 100644 --- a/lib/gpu/Makefile.hip +++ b/lib/gpu/Makefile.hip @@ -17,7 +17,7 @@ LMP_INC = -DLAMMPS_SMALLBIG HIP_PRECISION = -D_SINGLE_DOUBLE HIP_OPTS = -O3 -HIP_HOST_OPTS = -Wno-deprecated-declarations +HIP_HOST_OPTS = -Wno-deprecated-declarations -fopenmp -fopenmp-simd HIP_HOST_INCLUDE = # use device sort diff --git a/lib/gpu/Makefile.lammps.mac_ocl b/lib/gpu/Makefile.lammps.mac_ocl index f6c8a36430..0073efa2ba 100644 --- a/lib/gpu/Makefile.lammps.mac_ocl +++ b/lib/gpu/Makefile.lammps.mac_ocl @@ -1,5 +1,5 @@ # Settings that the LAMMPS build will import when this package library is used -gpu_SYSINC = +gpu_SYSINC = -DFFT_SINGLE gpu_SYSLIB = -framework OpenCL gpu_SYSPATH = diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index 2aea7f5a46..c20e26b1f3 100644 --- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -1,25 +1,21 @@ # /* ---------------------------------------------------------------------- -# Generic Linux Makefile for OpenCL +# Generic Linux Makefile for OpenCL - Mixed precision # ------------------------------------------------------------------------- */ # which file will be copied to Makefile.lammps EXTRAMAKE = Makefile.lammps.opencl -# OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi -# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler -# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress -OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device - # this setting should match LAMMPS Makefile # one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL LMP_INC = -DLAMMPS_SMALLBIG -OCL_INC = -I/usr/local/cuda/include # Path to CL directory -OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) -std=c++11 -OCL_LINK = -L/usr/local/cuda/lib64 -lOpenCL +OCL_INC = +OCL_CPP = mpic++ -std=c++11 -O3 -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) +OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_DOUBLE +OCL_TUNE = -fopenmp -fopenmp-simd -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT BIN_DIR = ./ OBJ_DIR = ./ @@ -28,4 +24,3 @@ AR = ar BSH = /bin/sh include Opencl.makefile - diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl index 62b58c1cef..ae7e8ca6fd 100644 --- a/lib/gpu/Makefile.mac_opencl +++ b/lib/gpu/Makefile.mac_opencl @@ -1,19 +1,17 @@ # /* ---------------------------------------------------------------------- -# Generic Mac Makefile for OpenCL +# Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE # ------------------------------------------------------------------------- */ # which file will be copied to Makefile.lammps EXTRAMAKE = Makefile.lammps.mac_ocl -OCL_TUNE 
= -DFERMI_OCL # -- Uncomment for NVIDIA Fermi -# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler -# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress -# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device +LMP_INC = -DLAMMPS_SMALLBIG -OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT +OCL_CPP = clang++ -std=c++11 -O3 -I../../src/STUBS OCL_LINK = -framework OpenCL OCL_PREC = -D_SINGLE_SINGLE +OCL_TUNE = -DUCL_NO_EXIT BIN_DIR = ./ OBJ_DIR = ./ diff --git a/lib/gpu/Makefile.mac_opencl_mpi b/lib/gpu/Makefile.mac_opencl_mpi new file mode 100644 index 0000000000..9be9f07e93 --- /dev/null +++ b/lib/gpu/Makefile.mac_opencl_mpi @@ -0,0 +1,23 @@ +# /* ---------------------------------------------------------------------- +# Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE +# ------------------------------------------------------------------------- */ + +# which file will be copied to Makefile.lammps + +EXTRAMAKE = Makefile.lammps.mac_ocl + +LMP_INC = -DLAMMPS_SMALLBIG + +OCL_CPP = mpicxx -std=c++11 -O3 -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 +OCL_LINK = -framework OpenCL +OCL_PREC = -D_SINGLE_SINGLE +OCL_TUNE = -DUCL_NO_EXIT -DMPI_GERYON + +BIN_DIR = ./ +OBJ_DIR = ./ +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Opencl.makefile + diff --git a/lib/gpu/Makefile.oneapi b/lib/gpu/Makefile.oneapi new file mode 100644 index 0000000000..015ab47057 --- /dev/null +++ b/lib/gpu/Makefile.oneapi @@ -0,0 +1,26 @@ +# /* ---------------------------------------------------------------------- +# Generic Linux Makefile for OpenCL +# ------------------------------------------------------------------------- */ + +# which file will be copied to Makefile.lammps + +EXTRAMAKE = Makefile.lammps.opencl + +# this setting should match LAMMPS Makefile +# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL + +LMP_INC = -DLAMMPS_SMALLBIG + +OCL_INC = +OCL_CPP = mpiicpc -std=c++11 -xHost -O2 -qopenmp -qopenmp-simd -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) +OCL_LINK = -lOpenCL +OCL_PREC = -D_SINGLE_DOUBLE +OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -fp-model fast=2 -no-prec-div + +BIN_DIR = ./ +OBJ_DIR = ./ +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Opencl.makefile diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 6716388562..d3275b890f 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -1,6 +1,7 @@ # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) -NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h +NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \ + lal_pre_cuda_hip.h ALL_H = $(NVD_H) $(wildcard ./lal_*.h) # Source files @@ -39,17 +40,21 @@ BIN2C = $(CUDA_HOME)/bin/bin2c # device code compilation -$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h +$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ + lal_pre_cuda_hip.h $(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu $(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin $(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h + rm $(OBJ_DIR)/pppm_f.cubin -$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h +$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ + lal_pre_cuda_hip.h $(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h + rm 
$(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu @@ -93,7 +98,7 @@ $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda clean: - -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.linkinfo + -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.cubin *.linkinfo veryclean: clean -rm -rf *~ *.linkinfo diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 996a564998..2ff98827d4 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -1,8 +1,15 @@ +# Common headers for kernels +PRE1_H = lal_preprocessor.h lal_aux_fun1.h + # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) -OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h -PRE1_H = lal_preprocessor.h lal_aux_fun1.h -ALL_H = $(OCL_H) $(wildcard ./lal_*.h) +OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h + +# Headers for Host files +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \ + lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ + lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ + lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H) # Source files SRCS := $(wildcard ./lal_*.cpp) @@ -28,12 +35,75 @@ OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL # device code compilation +$(OBJ_DIR)/atom_cl.h: lal_atom.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh atom lal_preprocessor.h lal_atom.cu $(OBJ_DIR)/atom_cl.h + +$(OBJ_DIR)/neighbor_cpu_cl.h: lal_neighbor_cpu.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh neighbor_cpu lal_preprocessor.h lal_neighbor_cpu.cu $(OBJ_DIR)/neighbor_cpu_cl.h + +$(OBJ_DIR)/neighbor_gpu_cl.h: lal_neighbor_gpu.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh neighbor_gpu lal_preprocessor.h lal_neighbor_gpu.cu $(OBJ_DIR)/neighbor_gpu_cl.h + +$(OBJ_DIR)/device_cl.h: lal_device.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh device lal_preprocessor.h lal_device.cu $(OBJ_DIR)/device_cl.h + +$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h; + +$(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h + +$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh gayberne $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h; + +$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h; + +$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh re_squared $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h; + +$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h; + +$(OBJ_DIR)/tersoff_cl.h: lal_tersoff.cu $(PRE1_H) lal_tersoff_extra.h + $(BSH) ./geryon/file_to_cstr.sh tersoff $(PRE1_H) lal_tersoff_extra.h lal_tersoff.cu $(OBJ_DIR)/tersoff_cl.h; + +$(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.h + $(BSH) 
./geryon/file_to_cstr.sh tersoff_mod $(PRE1_H) lal_tersoff_mod_extra.h lal_tersoff_mod.cu $(OBJ_DIR)/tersoff_mod_cl.h; + +$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h + $(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h; + $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(OCL) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(OCL) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(OCL) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(OCL) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(OCL) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(OCL) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(OCL) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(OCL) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cl.h $(HOST_H) $(OCL) -o $@ -c $< -I$(OBJ_DIR) $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H) diff --git a/lib/gpu/README b/lib/gpu/README index dfa8dcf7ff..28655836f4 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -1,21 +1,110 @@ -------------------------------- LAMMPS ACCELERATOR LIBRARY -------------------------------- - + W. Michael Brown (ORNL) Trung Dac Nguyen (ORNL/Northwestern) - Peng Wang (NVIDIA) + Nitin Dhamankar (Intel) Axel Kohlmeyer (Temple) + Peng Wang (NVIDIA) + Anders Hafreager (UiO) + V. Nikolskiy (HSE) + Maurice de Koning (Unicamp/Brazil) + Rodolfo Paula Leite (Unicamp/Brazil) Steve Plimpton (SNL) Inderaj Bains (NVIDIA) -------------------------------------------------------------------- -This directory has source files to build a library that LAMMPS -links against when using the GPU package. +------------------------------------------------------------------------------ -This library must be built with a C++ compiler, before LAMMPS is -built, so LAMMPS can link against it. +This directory has source files to build a library that LAMMPS links against +when using the GPU package. + +This library must be built with a C++ compiler along with CUDA, HIP, or OpenCL +before LAMMPS is built, so LAMMPS can link against it. + +This library, libgpu.a, provides routines for acceleration of certain +LAMMPS styles and neighbor list builds using CUDA, OpenCL, or ROCm HIP. + +Pair styles supported by this library are marked in the list of Pair style +potentials with a "g". See the online version at: + +https://lammps.sandia.gov/doc/Commands_pair.html + +In addition the (plain) pppm kspace style is supported as well. + +------------------------------------------------------------------------------ + DEVICE QUERY +------------------------------------------------------------------------------ +The gpu library includes binaries to check for available GPUs and their +properties. It is a good idea to run this on first use to make sure the +system and build are set up properly. Additionally, the GPU numbering for +specific selection of devices should be taken from this output.
The GPU +library may split some accelerators into separate virtual accelerators for +efficient use with MPI. + +After building the GPU library, for OpenCL: + ./ocl_get_devices +and for CUDA + ./nvc_get_devices + +------------------------------------------------------------------------------ + QUICK START +------------------------------------------------------------------------------ +OpenCL: Mac without MPI: + make -f Makefile.mac_opencl -j; cd ../../src/; make mpi-stubs + make g++_serial -j + ./lmp_g++_serial -in ../bench/in.lj -log none -sf gpu + +OpenCL: Mac with MPI: + make -f Makefile.mac_opencl_mpi -j; cd ../../src/; make g++_openmpi -j + mpirun -np $NUM_MPI ./lmp_g++_openmpi -in ../bench/in.lj -log none -sf gpu + +OpenCL: Linux with Intel oneAPI: + make -f Makefile.oneapi -j; cd ../../src; make oneapi -j + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_oneapi -in ../bench/in.lj -log none -sf gpu + +OpenCL: Linux with MPI: + make -f Makefile.linux_opencl -j; cd ../../src; make omp -j + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu + +NVIDIA CUDA: + make -f Makefile.cuda_mps -j; cd ../../src; make omp -j + export CUDA_MPS_LOG_DIRECTORY=/tmp; export CUDA_MPS_PIPE_DIRECTORY=/tmp + nvidia-smi -i 0 -c EXCLUSIVE_PROCESS + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu + echo quit | /usr/bin/nvidia-cuda-mps-control + +AMD HIP: + make -f Makefile.hip -j; cd ../../src; make omp -j + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu + +------------------------------------------------------------------------------ + Installing oneAPI, OpenCL, CUDA, or ROCm +------------------------------------------------------------------------------ +The easiest approach is to use the Linux package manager to perform the +installation from Intel, NVIDIA, etc. repositories. All are available for +free. The oneAPI installation includes Intel optimized MPI and C++ compilers, +along with many libraries. Alternatively, Intel OpenCL can also be installed +separately from the Intel repository. + +NOTE: Installation of the CUDA SDK is not required, only the CUDA toolkit. + +See: + +https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit.html + +https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html + +https://github.com/RadeonOpenCompute/ROCm + +------------------------------------------------------------------------------ + Build Intro +------------------------------------------------------------------------------ You can type "make lib-gpu" from the src directory to see help on how to build this library via make commands, or you can do the same thing @@ -25,13 +114,13 @@ do it manually by following the instructions below. Build the library using one of the provided Makefile.* files or create your own, specific to your compiler and system. For example: -make -f Makefile.linux +make -f Makefile.linux_opencl When you are done building this library, two files should exist in this directory: -libgpu.a the library LAMMPS will link against -Makefile.lammps settings the LAMMPS Makefile will import +libgpu.a the library LAMMPS will link against +Makefile.lammps settings the LAMMPS Makefile will import Makefile.lammps is created by the make command, by copying one of the Makefile.lammps.* files. See the EXTRAMAKE setting at the top of the @@ -45,77 +134,52 @@ IMPORTANT: If you re-build the library, e.g. 
for a different precision Makefile.linux clean, to insure all previous derived files are removed before the new build is done. -Makefile.lammps has settings for 3 variables: - -user-gpu_SYSINC = leave blank for this package -user-gpu_SYSLIB = CUDA libraries needed by this package -user-gpu_SYSPATH = path(s) to where those libraries are - -Because you have the CUDA compilers on your system, you should have -the needed libraries. If the CUDA development tools were installed -in the standard manner, the settings in the Makefile.lammps.standard -file should work. - -------------------------------------------------------------------- - - GENERAL NOTES - -------------------------------- - -This library, libgpu.a, provides routines for GPU acceleration -of certain LAMMPS styles and neighbor list builds. Compilation of this -library requires installing the CUDA GPU driver and CUDA toolkit for -your operating system. Installation of the CUDA SDK is not necessary. -In addition to the LAMMPS library, the binary nvc_get_devices will also -be built. This can be used to query the names and properties of GPU -devices on your system. A Makefile for OpenCL and ROCm HIP compilation -is provided, but support for it is not currently provided by the developers. -Details of the implementation are provided in: - ----- - -Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing -Molecular Dynamics on Hybrid High Performance Computers - Short Range -Forces. Computer Physics Communications. 2011. 182: p. 898-911. - -and - -Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing -Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle -Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459. - -and - -Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High -Performance Computers - Three-Body Potentials. Computer Physics Communications. -2013. 184: p. 2785–2793. - ----- - -NOTE: Installation of the CUDA SDK is not required, only the CUDA -toolkit itself or an OpenCL 1.2 compatible header and library. - -Pair styles supporting GPU acceleration this this library -are marked in the list of Pair style potentials with a "g". -See the online version at: https://lammps.sandia.gov/doc/Commands_pair.html - -In addition the (plain) pppm kspace style is supported as well. +NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG, + or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in + src/MAKE/Makefile.foo) should be consistent with that specified + when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar). - MULTIPLE LAMMPS PROCESSES - -------------------------------- - -Multiple LAMMPS MPI processes can share GPUs on the system, but multiple -GPUs cannot be utilized by a single MPI process. In many cases, the -best performance will be obtained by running as many MPI processes as -CPU cores available with the condition that the number of MPI processes -is an integer multiple of the number of GPUs being used. See the -LAMMPS user manual for details on running with GPU acceleration. +------------------------------------------------------------------------------ + PRECISION MODES +------------------------------------------------------------------------------ +The GPU library supports 3 precision modes: single, double, and mixed, with +the latter being the default for most Makefiles aside from Mac specific +Makefiles due to the more restrictive nature of the Apple OpenCL for some +devices. 
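+
+For example (using one of the provided Makefiles purely as an illustration),
+the OpenCL Makefiles in this directory select mixed precision with a single
+variable such as
+
+  OCL_PREC = -D_SINGLE_DOUBLE
+
+in the chosen lib/gpu Makefile; the full set of precision flags and the
+matching CUDA and HIP variables are listed in the next paragraph.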
+ +To specify the precision mode (output to the screen before LAMMPS runs for +verification), set either CUDA_PRECISION, OCL_PREC, or HIP_PRECISION to one +of -D_SINGLE_SINGLE, -D_DOUBLE_DOUBLE, or -D_SINGLE_DOUBLE. + +Some accelerators or OpenCL implementations only support single precision. +This mode should be used with care and appropriate validation as the errors +can scale with system size in this implementation. This can be useful for +accelerating test runs when setting up a simulation for production runs on +another machine. In the case where only single precision is supported, either +LAMMPS must be compiled with -DFFT_SINGLE to use PPPM with GPU acceleration +or GPU acceleration should be disabled for PPPM (e.g. suffix off or pair/only +as described in the LAMMPS documentation). - BUILDING AND PRECISION MODES - -------------------------------- +------------------------------------------------------------------------------ + CUDA BUILD NOTES +------------------------------------------------------------------------------ +NOTE: when compiling with CMake, all of the considerations listed below +are considered within the CMake configuration process, so no separate +compilation of the gpu library is required. Also this will build in support +for all compute architectures that are supported by the CUDA toolkit version +used to build the gpu library. -To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of +If you do not want to use a fat binary that supports multiple CUDA +architectures, the CUDA_ARCH must be set to match the GPU architecture. This +is reported by the nvc_get_devices executable created by the build process and +a detailed list of GPU architectures and CUDA compatible GPUs can be found +e.g. here: https://en.wikipedia.org/wiki/CUDA#GPUs_supported + +The CUDA_HOME variable should be set to the location of the CUDA toolkit. + +To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of the Makefiles. CUDA_ARCH should be set based on the compute capability of your GPU. This can be verified by running the nvc_get_devices executable after the build is complete. Additionally, the GPU package must be installed and @@ -123,82 +187,93 @@ compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the LAMMPS makefile. Please note that the GPU library accesses the CUDA driver library directly, -so it needs to be linked not only to the CUDA runtime library (libcudart.so) -that ships with the CUDA toolkit, but also with the CUDA driver library -(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS -on the head node of a GPU cluster, this library may not be installed, -so you may need to copy it over from one of the compute nodes (best into -this directory). Recent CUDA toolkits starting from CUDA 9 provide a dummy -libcuda.so library (typically under $(CUDA_HOME)/lib64/stubs), that can be used for -linking. +so it needs to be linked with the CUDA driver library (libcuda.so) that ships +with the Nvidia driver. If you are compiling LAMMPS on the head node of a GPU +cluster, this library may not be installed, so you may need to copy it over +from one of the compute nodes (best into this directory). Recent CUDA toolkits +starting from CUDA 9 provide a dummy libcuda.so library (typically under +$(CUDA_HOME)/lib64/stubs) that can be used for linking. 
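+
+For orientation only (the exact library names and paths depend on your CUDA
+installation and are given here as an illustrative sketch, not as the shipped
+defaults), the settings imported into the LAMMPS build through Makefile.lammps
+typically look similar to
+
+  gpu_SYSINC  =
+  gpu_SYSLIB  = -lcudart -lcuda
+  gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs
+
+so that both the CUDA runtime and the (possibly stub) libcuda.so driver
+library can be resolved at link time.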
-The gpu library supports 3 precision modes as determined by -the CUDA_PRECISION variable: +Best performance with the GPU library is typically with multiple MPI processes +sharing the same GPU cards. For NVIDIA, this is most efficient with CUDA +MPS enabled. To prevent runtime errors for GPUs configured in exclusive process +mode with MPS, the GPU library should be built with either of the equivalent +-DCUDA_MPS_SUPPORT or -DCUDA_PROXY flags. - CUDA_PRECISION = -D_SINGLE_SINGLE # Single precision for all calculations - CUDA_PRECISION = -D_DOUBLE_DOUBLE # Double precision for all calculations - CUDA_PRECISION = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double +------------------------------------------------------------------------------ + HIP BUILD NOTES +------------------------------------------------------------------------------ -As of CUDA 7.5 only GPUs with compute capability 2.0 (Fermi) or newer are -supported and as of CUDA 9.0 only compute capability 3.0 (Kepler) or newer -are supported. There are some limitations of this library for GPUs older -than that, which require additional preprocessor flag, and limit features, -but they are kept for historical reasons. There is no value in trying to -use those GPUs for production calculations. - -You have to make sure that you set a CUDA_ARCH line suitable for your -hardware and CUDA toolkit version: e.g. -arch=sm_35 for Tesla K20 or K40 -or -arch=sm_52 GeForce GTX Titan X. A detailed list of GPU architectures -and CUDA compatible GPUs can be found e.g. here: -https://en.wikipedia.org/wiki/CUDA#GPUs_supported - -NOTE: when compiling with CMake, all of the considerations listed below -are considered within the CMake configuration process, so no separate -compilation of the gpu library is required. Also this will build in support -for all compute architecture that are supported by the CUDA toolkit version -used to build the gpu library. - -Please note the CUDA_CODE settings in Makefile.linux_multi, which allows -to compile this library with support for multiple GPUs. This list can be -extended for newer GPUs with newer CUDA toolkits and should allow to build -a single GPU library compatible with all GPUs that are worth using for -GPU acceleration and supported by the current CUDA toolkits and drivers. - -NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG, - or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in - src/MAKE/Makefile.foo) should be consistent with that specified - when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar). - - BUILDING FOR HIP FRAMEWORK - -------------------------------- -1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm). -2. GPU sorting requires installing hipcub +1. GPU sorting requires installing hipcub (https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend additionally requires cub (https://nvlabs.github.io/cub). Download and extract the cub directory to lammps/lib/gpu/ or specify an appropriate path in lammps/lib/gpu/Makefile.hip. -3. In Makefile.hip it is possible to specify the target platform via -export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target +2. In Makefile.hip it is possible to specify the target platform via +export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target architecture (gfx803, gfx900, gfx906 etc.) -4. If your MPI implementation does not support `mpicxx --showme` command, +3. 
If your MPI implementation does not support `mpicxx --showme` command, it is required to specify the corresponding MPI compiler and linker flags in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip. -5. Building the GPU library (libgpu.a): - cd lammps/lib/gpu; make -f Makefile.hip -j -6. Building the LAMMPS executable (lmp_hip): - cd ../../src; make hip -j - EXAMPLE CONVENTIONAL BUILD PROCESS - -------------------------------- - -cd ~/lammps/lib/gpu -emacs Makefile.linux -make -f Makefile.linux -./nvc_get_devices -cd ../../src -emacs ./MAKE/Makefile.linux -make yes-asphere -make yes-kspace -make yes-gpu -make linux +------------------------------------------------------------------------------ + OPENCL BUILD NOTES +------------------------------------------------------------------------------ +If GERYON_NUMA_FISSION is defined at build time, LAMMPS will consider separate +NUMA nodes on GPUs or accelerators as separate devices. For example, a 2-socket +CPU would appear as two separate devices for OpenCL (and LAMMPS would require +two MPI processes to use both sockets with the GPU library - each with its +own device ID as output by ocl_get_devices). + +For a debug build, use "-DUCL_DEBUG -DGERYON_KERNEL_DUMP" and remove +"-DUCL_NO_EXIT" and "-DMPI_GERYON" from the build options. + +------------------------------------------------------------------------------ + ALL PREPROCESSOR OPTIONS (For Advanced Users) +------------------------------------------------------------------------------ +_SINGLE_SINGLE Build library for single precision mode +_SINGLE_DOUBLE Build library for mixed precision mode +_DOUBLE_DOUBLE Build library for double precision mode +CUDA_MPS_SUPPORT Do not generate errors for exclusive mode for CUDA +CUDA_PROXY Same as above +MPI_GERYON Library should use MPI_Abort for unhandled errors +GERYON_NUMA_FISSION Accelerators with main memory NUMA are split into + multiple virtual accelerators for each NUMA node +LAL_USE_OMP=0 Disable OpenMP in lib, regardless of compiler setting +LAL_USE_OMP_SIMD=0 Disable OpenMP SIMD in lib, regardless of compiler set +GERYON_OCL_FLUSH For OpenCL, flush queue after every enqueue +LAL_NO_OCL_EV_JIT Turn off JIT specialization for kernels in OpenCL +LAL_USE_OLD_NEIGHBOR Use old neighbor list algorithm +USE_CUDPP Enable GPU binning in neighbor builds (not recommended) +USE_HIP_DEVICE_SORT Enable GPU binning for HIP builds + (only w/ LAL_USE_OLD_NEIGHBOR) +LAL_NO_BLOCK_REDUCE Use host for energy/virial accumulation +LAL_OCL_EXTRA_ARGS Supply extra args for OpenCL compiler delimited with : +UCL_NO_EXIT LAMMPS should handle errors instead of Geryon lib +UCL_DEBUG Debug build for Geryon +GERYON_KERNEL_DUMP Dump all compiled OpenCL programs with compiler + flags and build logs +GPU_CAST Casting performed on GPU, untested recently +THREE_CONCURRENT Concurrent 3-body calcs in separate queues, untested + + +------------------------------------------------------------------------------ + References for Details +------------------------------------------------------------------------------ + +Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing +Molecular Dynamics on Hybrid High Performance Computers - Short Range +Forces. Computer Physics Communications. 2011. 182: p. 898-911. + +and + +Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing +Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle +Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459. 
+ +and + +Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High +Performance Computers - Three-Body Potentials. Computer Physics Communications. +2013. 184: p. 2785–2793. diff --git a/lib/gpu/geryon/hip_device.h b/lib/gpu/geryon/hip_device.h index d2fb1919b7..373b3783b0 100644 --- a/lib/gpu/geryon/hip_device.h +++ b/lib/gpu/geryon/hip_device.h @@ -24,6 +24,8 @@ namespace ucl_hip { // -------------------------------------------------------------------------- typedef hipStream_t command_queue; +inline void ucl_flush(command_queue &cq) {} + inline void ucl_sync(hipStream_t &stream) { CU_SAFE_CALL(hipStreamSynchronize(stream)); } @@ -143,15 +145,26 @@ class UCL_Device { inline std::string device_type_name(const int i) { return "GPU"; } /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type() { return device_type(_device); } + inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type(const int i) { return UCL_GPU; } + inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + /// Returns preferred vector width + inline int preferred_fp32_width() { return preferred_fp32_width(_device); } + /// Returns preferred vector width + inline int preferred_fp32_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns preferred vector width + inline int preferred_fp64_width() { return preferred_fp64_width(_device); } + /// Returns preferred vector width + inline int preferred_fp64_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device @@ -215,7 +228,19 @@ class UCL_Device { /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].maxThreadsPerBlock; } - + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int dim) + { return group_size_dim(_device, dim); } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int i, const int dim) + { return _properties[i].maxThreadsDim[dim];} + + /// Get the shared local memory size in bytes + inline size_t slm_size() { return slm_size(_device); } + /// Get the shared local memory size in bytes + inline size_t slm_size(const int i) + { return _properties[i].sharedMemPerBlock; } + /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes @@ -255,11 +280,20 @@ class UCL_Device { inline int max_sub_devices(const int i) { return 0; } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support() + { return has_shuffle_support(_device); } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support(const int i) + { return arch(i)>=3.0; } + /// List all devices along with all properties inline void print_all(std::ostream &out); - /// Select the platform that has accelerators (for 
compatibility with OpenCL) - inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } + /// For compatability with OCL API + inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU, + const std::string vendor="") + { return set_platform(0); } inline int load_module(const void* program, hipModule_t& module, std::string *log=nullptr){ auto it = _loaded_modules.emplace(program, hipModule_t()); diff --git a/lib/gpu/geryon/hip_kernel.h b/lib/gpu/geryon/hip_kernel.h index c5014b52e7..10bc9f1334 100644 --- a/lib/gpu/geryon/hip_kernel.h +++ b/lib/gpu/geryon/hip_kernel.h @@ -14,6 +14,7 @@ #include #include #include +#include namespace ucl_hip { @@ -64,7 +65,7 @@ class UCL_Program { } /// Load a program from a string and compile with flags - inline int load_string(const void *program, const char *flags="", std::string *log=nullptr) { + inline int load_string(const void *program, const char *flags="", std::string *log=nullptr, FILE* foutput=nullptr) { return _device_ptr->load_module(program, _module, log); } @@ -73,6 +74,7 @@ class UCL_Program { hipModule_t _module; hipStream_t _cq; friend class UCL_Texture; + friend class UCL_Const; }; /// Class for dealing with CUDA Driver kernels diff --git a/lib/gpu/geryon/hip_texture.h b/lib/gpu/geryon/hip_texture.h index ae16bee900..9117adc879 100644 --- a/lib/gpu/geryon/hip_texture.h +++ b/lib/gpu/geryon/hip_texture.h @@ -107,6 +107,37 @@ class UCL_Texture { } }; +/// Class storing a const global memory reference +class UCL_Const { + public: + UCL_Const() {} + ~UCL_Const() {} + /// Construct with a specified global reference + inline UCL_Const(UCL_Program &prog, const char *global_name) + { get_global(prog,global_name); } + /// Set the global reference for this object + inline void get_global(UCL_Program &prog, const char *global_name) { + _cq=prog.cq(); + CU_SAFE_CALL(hipModuleGetGlobal(&_global, &_global_bytes, prog._module, + global_name)); + } + /// Copy from array on host to const memory + template + inline void update_device(UCL_H_Vec &src, const int numel) { + CU_SAFE_CALL(hipMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp), + _cq)); + } + /// Get device ptr associated with object + inline const void* begin() const { return &_global; } + inline void clear() {} + + private: + hipStream_t _cq; + void* _global; + size_t _global_bytes; + friend class UCL_Kernel; +}; + } // namespace #endif diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 42f176bcbf..52b2ed478e 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -37,6 +37,8 @@ namespace ucl_cudadr { // -------------------------------------------------------------------------- typedef CUstream command_queue; +inline void ucl_flush(command_queue &cq) {} + inline void ucl_sync(CUstream &stream) { CU_SAFE_CALL(cuStreamSynchronize(stream)); } @@ -156,15 +158,26 @@ class UCL_Device { inline std::string device_type_name(const int i) { return "GPU"; } /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type() { return device_type(_device); } + inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type(const int i) { return UCL_GPU; } + inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is 
efficiently addressable from device inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + /// Returns preferred vector width + inline int preferred_fp32_width() { return preferred_fp32_width(_device); } + /// Returns preferred vector width + inline int preferred_fp32_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns preferred vector width + inline int preferred_fp64_width() { return preferred_fp64_width(_device); } + /// Returns preferred vector width + inline int preferred_fp64_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device @@ -228,6 +241,18 @@ class UCL_Device { /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].maxThreadsPerBlock; } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int dim) + { return group_size_dim(_device, dim); } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int i, const int dim) + { return _properties[i].maxThreadsDim[dim]; } + + /// Get the shared local memory size in bytes + inline size_t slm_size() { return slm_size(_device); } + /// Get the shared local memory size in bytes + inline size_t slm_size(const int i) + { return _properties[i].sharedMemPerBlock; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } @@ -268,11 +293,22 @@ class UCL_Device { inline int max_sub_devices(const int i) { return 0; } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support() + { return has_shuffle_support(_device); } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support(const int i) + { return arch(i)>=3.0; } + /// List all devices along with all properties inline void print_all(std::ostream &out); - /// Select the platform that has accelerators (for compatibility with OpenCL) - inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } + /// For compatability with OCL API + inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU, + const std::string vendor="", + const int ndevices=-1, + const int first_device=-1) + { return set_platform(0); } private: int _device, _num_devices; diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h index d74b0e2dc1..c31b8cdf9b 100644 --- a/lib/gpu/geryon/nvd_kernel.h +++ b/lib/gpu/geryon/nvd_kernel.h @@ -26,6 +26,7 @@ #include "nvd_device.h" #include +#include namespace ucl_cudadr { @@ -77,7 +78,7 @@ class UCL_Program { /// Load a program from a string and compile with flags inline int load_string(const void *program, const char *flags="", - std::string *log=nullptr) { + std::string *log=nullptr, FILE* foutput=nullptr) { if (std::string(flags)=="BINARY") return load_binary((const char *)program); const unsigned int num_opts=2; @@ -100,12 +101,25 @@ class UCL_Program { if (err != CUDA_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << std::endl + std::cerr << std::endl << std::endl << "----------------------------------------------------------\n" << " UCL Error: Error compiling PTX Program...\n" << "----------------------------------------------------------\n"; - std::cerr << log << std::endl; + std::cerr << log << std::endl + << 
"----------------------------------------------------------\n\n"; #endif + if (foutput != NULL) { + fprintf(foutput,"\n\n"); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput," UCL Error: Error compiling PTX Program...\n"); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"%s\n",log); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"\n\n"); + } return UCL_COMPILE_ERROR; } @@ -139,11 +153,15 @@ class UCL_Program { return UCL_SUCCESS; } + /// Return the default command queue/stream associated with this data + inline command_queue & cq() { return _cq; } + friend class UCL_Kernel; private: CUmodule _module; CUstream _cq; friend class UCL_Texture; + friend class UCL_Const; }; /// Class for dealing with CUDA Driver kernels diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h index c766af826c..65f4ad9ef5 100644 --- a/lib/gpu/geryon/nvd_texture.h +++ b/lib/gpu/geryon/nvd_texture.h @@ -38,8 +38,11 @@ class UCL_Texture { inline UCL_Texture(UCL_Program &prog, const char *texture_name) { get_texture(prog,texture_name); } /// Set the texture reference for this object - inline void get_texture(UCL_Program &prog, const char *texture_name) - { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); } + inline void get_texture(UCL_Program &prog, const char *texture_name) { + #if (CUDA_VERSION < 11000) + CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); + #endif + } /// Bind a float array where each fetch grabs a vector of length numel template @@ -72,11 +75,14 @@ class UCL_Texture { } private: + #if (CUDA_VERSION < 11000) CUtexref _tex; + #endif friend class UCL_Kernel; template inline void _bind_float(mat_typ &vec, const unsigned numel) { + #if (CUDA_VERSION < 11000) #ifdef UCL_DEBUG assert(numel!=0 && numel<5); #endif @@ -90,10 +96,42 @@ class UCL_Texture { else CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2)); } + #endif } }; +/// Class storing a const global memory reference +class UCL_Const { + public: + UCL_Const() {} + ~UCL_Const() {} + /// Construct with a specified global reference + inline UCL_Const(UCL_Program &prog, const char *global_name) + { get_global(prog,global_name); } + /// Set the global reference for this object + inline void get_global(UCL_Program &prog, const char *global_name) { + _cq=prog.cq(); + CU_SAFE_CALL(cuModuleGetGlobal(&_global, &_global_bytes, prog._module, + global_name)); + } + /// Copy from array on host to const memory + template + inline void update_device(UCL_H_Vec &src, const int numel) { + CU_SAFE_CALL(cuMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp), + _cq)); + } + /// Get device ptr associated with object + inline const CUdeviceptr * begin() const { return &_global; } + inline void clear() {} + + private: + CUstream _cq; + CUdeviceptr _global; + size_t _global_bytes; + friend class UCL_Kernel; +}; + } // namespace #endif diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index de4def0bc1..b0a3e3d583 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -28,14 +28,6 @@ #include #include -/* We default to OpenCL 1.2 as target version for now as - * there are known issues with OpenCL 2.0 and later. 
- * This is also to silence warnings from generic OpenCL headers */ - -#if !defined(CL_TARGET_OPENCL_VERSION) -#define CL_TARGET_OPENCL_VERSION 120 -#endif - #ifdef __APPLE__ #include #include @@ -55,17 +47,36 @@ namespace ucl_opencl { typedef cl_command_queue command_queue; typedef cl_context context_type; +inline void ucl_flush(command_queue &cq) { CL_SAFE_CALL(clFlush(cq)); } + inline void ucl_sync(cl_command_queue &cq) { CL_SAFE_CALL(clFinish(cq)); } -inline bool _shared_mem_device(cl_device_type &device_type) { +#if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON) +inline bool _shared_mem_device(cl_device_id &device) { return true; } +#elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF) +inline bool _shared_mem_device(cl_device_id &device) { return false; } +#else +inline bool _shared_mem_device(cl_device_id &device) { + #ifdef CL_VERSION_1_2 + cl_bool br; + CL_SAFE_CALL(clGetDeviceInfo(device, CL_DEVICE_HOST_UNIFIED_MEMORY, + sizeof(cl_bool), &br,NULL)); + return (br == CL_TRUE); + #else + cl_device_type device_type; + CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, + sizeof(device_type),&device_type,NULL)); return (device_type==CL_DEVICE_TYPE_CPU); + #endif } +#endif struct OCLProperties { std::string name; cl_device_type device_type; + bool is_subdevice; cl_ulong global_mem; cl_ulong shared_mem; cl_ulong const_mem; @@ -74,12 +85,16 @@ struct OCLProperties { size_t work_group_size; size_t work_item_size[3]; bool double_precision; + int preferred_vector_width32, preferred_vector_width64; int alignment; size_t timer_resolution; bool ecc_support; std::string c_version; bool partition_equal, partition_counts, partition_affinity; cl_uint max_sub_devices; + int cl_device_version; + bool has_subgroup_support; + bool has_shuffle_support; }; /// Class for looking at data parallel device properties @@ -182,16 +197,27 @@ class UCL_Device { inline std::string device_type_name(const int i); /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type() { return device_type(_device); } + inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type(const int i); + inline enum UCL_DEVICE_TYPE device_type(const int i); /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) - { return _shared_mem_device(_properties[i].device_type); } + { return _shared_mem_device(_cl_devices[i]); } + /// Returns preferred vector width + inline int preferred_fp32_width() { return preferred_fp32_width(_device); } + /// Returns preferred vector width + inline int preferred_fp32_width(const int i) + {return _properties[i].preferred_vector_width32;} + /// Returns preferred vector width + inline int preferred_fp64_width() { return preferred_fp64_width(_device); } + /// Returns preferred vector width + inline int preferred_fp64_width(const int i) + {return _properties[i].preferred_vector_width64;} + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device @@ -242,6 +268,18 @@ class UCL_Device { /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].work_group_size; } + /// Get the maximum number of 
threads per block in dimension 'dim' + inline size_t group_size_dim(const int dim) + { return group_size_dim(_device, dim); } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int i, const int dim) + { return _properties[i].work_item_size[dim]; } + + /// Get the shared local memory size in bytes + inline size_t slm_size() { return slm_size(_device); } + /// Get the shared local memory size in bytes + inline size_t slm_size(const int i) + { return _properties[i].shared_mem; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } @@ -256,6 +294,12 @@ class UCL_Device { inline bool sharing_supported(const int i) { return true; } + /// True if the device is a sub-device + inline bool is_subdevice() + { return is_subdevice(_device); } + /// True if the device is a sub-device + inline bool is_subdevice(const int i) + { return _properties[i].is_subdevice; } /// True if splitting device into equal subdevices supported inline bool fission_equal() { return fission_equal(_device); } @@ -274,6 +318,18 @@ class UCL_Device { /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity(const int i) { return _properties[i].partition_affinity; } + /// True if the device has subgroup support + inline bool has_subgroup_support() + { return has_subgroup_support(_device); } + /// True if the device has subgroup support + inline bool has_subgroup_support(const int i) + { return _properties[i].has_subgroup_support; } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support() + { return has_shuffle_support(_device); } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support(const int i) + { return _properties[i].has_shuffle_support; } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices() @@ -281,6 +337,12 @@ class UCL_Device { /// Maximum number of subdevices allowed from device fission inline int max_sub_devices(const int i) { return _properties[i].max_sub_devices; } + /// OpenCL version supported by the device + inline int cl_device_version() + { return cl_device_version(_device); } + /// OpenCL version supported by the device + inline int cl_device_version(const int i) + { return _properties[i].cl_device_version; } /// List all devices along with all properties inline void print_all(std::ostream &out); @@ -288,8 +350,14 @@ class UCL_Device { /// Return the OpenCL type for the device inline cl_device_id & cl_device() { return _cl_device; } - /// Select the platform that has accelerators - inline int set_platform_accelerator(int pid=-1); + /// Automatically set the platform by type, vendor, and/or CU count + /** If first_device is positive, search restricted to platforms containing + * this device IDs. 
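// Caller-side sketch (not part of this patch; the vendor string and device
// index are illustrative assumptions) of one way the new auto_set_platform()
// interface could be used: pick the platform whose name matches a vendor and
// which exposes GPU devices, then activate a device on that platform.
//
//   ucl_opencl::UCL_Device dev;
//   if (dev.auto_set_platform(UCL_GPU, "NVIDIA") == UCL_SUCCESS)
//     dev.set(0);   // create the context and queue on the first device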
If ndevices is positive, search is restricted + * to platforms with at least that many devices **/ + inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU, + const std::string vendor="", + const int ndevices=-1, + const int first_device=-1); private: int _num_platforms; // Number of platforms @@ -322,8 +390,7 @@ UCL_Device::UCL_Device() { return; } else _num_platforms=static_cast(nplatforms); - // note that platform 0 may not necessarily be associated with accelerators - set_platform_accelerator(); + set_platform(0); } UCL_Device::~UCL_Device() { @@ -332,6 +399,14 @@ UCL_Device::~UCL_Device() { void UCL_Device::clear() { _properties.clear(); + + #ifdef GERYON_NUMA_FISSION + #ifdef CL_VERSION_1_2 + for (int i=0; i<_cl_devices.size(); i++) + CL_DESTRUCT_CALL(clReleaseDevice(_cl_devices[i])); + #endif + #endif + _cl_devices.clear(); if (_device>-1) { for (size_t i=0; i<_cq.size(); i++) { @@ -341,6 +416,7 @@ void UCL_Device::clear() { CL_DESTRUCT_CALL(clReleaseContext(_context)); } _device=-1; + _num_devices=0; } int UCL_Device::set_platform(int pid) { @@ -370,11 +446,51 @@ int UCL_Device::set_platform(int pid) { CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list, &n)); + #ifndef GERYON_NUMA_FISSION // --- Store properties for each device for (int i=0; i<_num_devices; i++) { _cl_devices.push_back(device_list[i]); add_properties(device_list[i]); } + #else + // --- Create sub-devices for anything partitionable by NUMA and store props + int num_unpart = _num_devices; + _num_devices = 0; + for (int i=0; i 1) { + subdevice_list = new cl_device_id[num_subdevices]; + CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices, + subdevice_list, &num_subdevices)); + } + #endif + + for (int j=0; j 1) delete[] subdevice_list; + } // for i + #endif + delete[] device_list; return UCL_SUCCESS; } @@ -429,11 +545,18 @@ void UCL_Device::add_properties(cl_device_id device_list) { sizeof(cl_uint),&op.alignment,nullptr)); op.alignment/=8; + cl_uint float_width; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, + sizeof(float_width),&float_width,nullptr)); + op.preferred_vector_width32=float_width; + // Determine if double precision is supported cl_uint double_width; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(double_width),&double_width,nullptr)); + op.preferred_vector_width64=double_width; if (double_width==0) op.double_precision=false; else @@ -452,9 +575,14 @@ void UCL_Device::add_properties(cl_device_id device_list) { op.ecc_support=true; op.c_version=""; + op.is_subdevice=false; op.partition_equal=false; op.partition_counts=false; op.partition_affinity=false; + op.max_sub_devices=1; + op.cl_device_version=0; + op.has_subgroup_support=false; + op.has_shuffle_support=false; #ifdef CL_VERSION_1_2 size_t return_bytes; @@ -463,6 +591,13 @@ void UCL_Device::add_properties(cl_device_id device_list) { op.c_version=buffer; cl_device_partition_property pinfo[4]; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_TYPE, + 4*sizeof(cl_device_partition_property), + &pinfo, &return_bytes)); + if (return_bytes == 0) op.is_subdevice=false; + else if (pinfo[0]) op.is_subdevice=true; + else op.is_subdevice=false; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_PROPERTIES, 4*sizeof(cl_device_partition_property), @@ -480,6 +615,46 @@ void UCL_Device::add_properties(cl_device_id device_list) { CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, 
sizeof(cl_uint),&op.max_sub_devices,nullptr)); + + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_VERSION,1024,buffer,nullptr)); + int cl_version_maj = buffer[7] - '0'; + int cl_version_min = buffer[9] - '0'; + op.cl_device_version = cl_version_maj * 100 + cl_version_min * 10; + + size_t ext_str_size_ret; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, 0, nullptr, + &ext_str_size_ret)); + char buffer2[ext_str_size_ret]; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, + ext_str_size_ret, buffer2, nullptr)); + #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0) + if (op.cl_device_version >= 210) { + if ((std::string(buffer2).find("cl_khr_subgroups") != std::string::npos) || + (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos)) + op.has_subgroup_support=true; + if (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos) + op.has_shuffle_support=true; + } + #endif + if (std::string(buffer2).find("cl_nv_device_attribute_query") != + std::string::npos) { + #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV + #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 + #endif + #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV + #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 + #endif + cl_uint major, minor; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &major, nullptr)); + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &minor, nullptr)); + double arch = static_cast(minor)/10+major; + if (arch >= 3.0) + op.has_shuffle_support=true; + } #endif _properties.push_back(op); @@ -516,7 +691,7 @@ std::string UCL_Device::device_type_name(const int i) { } // Get a string telling the type of the device -int UCL_Device::device_type(const int i) { +enum UCL_DEVICE_TYPE UCL_Device::device_type(const int i) { if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) return UCL_CPU; else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) @@ -529,14 +704,8 @@ int UCL_Device::device_type(const int i) { // Set the CUDA device to the specified device number int UCL_Device::set(int num) { - cl_device_id *device_list = new cl_device_id[_num_devices]; - cl_uint n; - CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, - device_list,&n)); - _device=num; - _cl_device=device_list[_device]; - delete[] device_list; + _cl_device=_cl_devices[_device]; return create_context(); } @@ -555,6 +724,11 @@ void UCL_Device::print_all(std::ostream &out) { out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n"; out << " Type of device: " << device_type_name(i).c_str() << std::endl; + out << " Is a subdevice: "; + if (is_subdevice(i)) + out << "Yes\n"; + else + out << "No\n"; out << " Double precision support: "; if (double_precision(i)) out << "Yes\n"; @@ -613,33 +787,93 @@ void UCL_Device::print_all(std::ostream &out) { out << "No\n"; out << " Maximum subdevices from fission: " << max_sub_devices(i) << std::endl; + out << " Shared memory system: "; + if (shared_memory(i)) + out << "Yes\n"; + else + out << "No\n"; } } } -// Select the platform that is associated with accelerators -// if pid < 0, select the first platform -int UCL_Device::set_platform_accelerator(int pid) { - if (pid < 0) { - int found = 0; - for (int n=0; n<_num_platforms; n++) { - set_platform(n); - for (int i=0; i -1) { + if (ndevices) + last_device = first_device + ndevices - 1; + else + last_device = first_device; + } + + bool vendor_match=false; + bool 
type_match=false; + int max_cus=0; + int best_platform=0; + + std::string vendor_upper=vendor; + for (int i=0; i='a') + vendor_upper[i]=toupper(vendor_upper[i]); + + for (int n=0; n<_num_platforms; n++) { + set_platform(n); + if (last_device > -1 && last_device >= num_devices()) continue; + if (ndevices > num_devices()) continue; + + int first_id=0; + int last_id=num_devices()-1; + if (last_device > -1) { + first_id=first_device; + last_id=last_device; + } + + if (vendor_upper!="") { + std::string pname = platform_name(); + for (int i=0; i='a') + pname[i]=toupper(pname[i]); + + if (pname.find(vendor_upper)!=std::string::npos) { + if (vendor_match == false) { + best_platform=n; + max_cus=0; + vendor_match=true; + } + } else if (vendor_match) + continue; + } + + if (type != UCL_DEFAULT) { + bool ptype_matched=false; + for (int d=first_id; d<=last_id; d++) { + if (type==device_type(d)) { + if (type_match == false) { + best_platform=n; + max_cus=0; + type_match=true; + ptype_matched=true; + } + } + } + if (type_match==true && ptype_matched==false) + continue; + } + + for (int d=first_id; d<=last_id; d++) { + if (cus(d) > max_cus) { + best_platform=n; + max_cus=cus(d); } - if (found) return UCL_SUCCESS; } - return UCL_ERROR; - } else { - return set_platform(pid); } + return set_platform(best_platform); } -} // namespace ucl_opencl +} // namespace ucl_opencl #endif diff --git a/lib/gpu/geryon/ocl_kernel.h b/lib/gpu/geryon/ocl_kernel.h index 77593f4515..23f9baa09e 100644 --- a/lib/gpu/geryon/ocl_kernel.h +++ b/lib/gpu/geryon/ocl_kernel.h @@ -2,6 +2,7 @@ ocl_kernel.h ------------------- W. Michael Brown + Nitin Dhamankar (Intel) Utilities for dealing with OpenCL kernels @@ -26,6 +27,7 @@ #include "ocl_device.h" #include +#include namespace ucl_opencl { @@ -93,7 +95,7 @@ class UCL_Program { /// Load a program from a string and compile with flags inline int load_string(const void *program, const char *flags="", - std::string *log=nullptr) { + std::string *log=nullptr, FILE* foutput=nullptr) { cl_int error_flag; const char *prog=(const char *)program; _program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag); @@ -107,27 +109,66 @@ class UCL_Program { sizeof(cl_build_status),&build_status, nullptr)); - if (build_status != CL_SUCCESS || log!=nullptr) { + #ifdef GERYON_KERNEL_DUMP + { size_t ms; - CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, - nullptr, &ms)); + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + 0,NULL,&ms)); char *build_log = new char[ms]; - CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms, - build_log, nullptr)); + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + ms,build_log, NULL)); + std::cout << std::endl << std::endl + << "--------------------------------------------------------\n" + << " UCL PROGRAM DUMP\n" + << "--------------------------------------------------------\n" + << flags << std::endl + << "--------------------------------------------------------\n" + << prog << std::endl + << "--------------------------------------------------------\n" + << build_log + << "--------------------------------------------------------\n" + << std::endl << std::endl; + } + #endif + + if (build_status != CL_SUCCESS || log!=NULL) { + size_t ms; + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + 0,NULL,&ms)); + char *build_log = new char[ms]; + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + ms,build_log, NULL)); if 
(log!=nullptr) *log=std::string(build_log); if (build_status != CL_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << std::endl - << "----------------------------------------------------------\n" - << " UCL Error: Error compiling OpenCL Program (" - << build_status << ") ...\n" - << "----------------------------------------------------------\n"; + std::cerr << std::endl << std::endl + << "----------------------------------------------------------\n" + << " UCL Error: Error compiling OpenCL Program (" + << build_status << ") ...\n" + << "----------------------------------------------------------\n"; std::cerr << build_log << std::endl; + std::cerr << + "----------------------------------------------------------\n" + << std::endl << std::endl; #endif - delete[] build_log; + if (foutput != NULL) { + fprintf(foutput,"\n\n"); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput, + " UCL Error: Error compiling OpenCL Program (%d) ...\n", + build_status); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"%s\n",build_log); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"\n\n"); + } + delete[] build_log; return UCL_COMPILE_ERROR; } else delete[] build_log; } @@ -141,6 +182,7 @@ class UCL_Program { inline void cq(command_queue &cq_in) { _cq=cq_in; } friend class UCL_Kernel; + friend class UCL_Const; private: bool _init_done; cl_program _program; @@ -322,9 +364,45 @@ class UCL_Kernel { inline void cq(command_queue &cq_in) { _cq=cq_in; } #include "ucl_arg_kludge.h" + #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0) + inline size_t max_subgroup_size(const size_t block_size_x) { + size_t block_size = block_size_x; + CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sizeof(block_size), (void *) &block_size, + sizeof(size_t), (void *) &_mx_subgroup_sz, + NULL)); + return _mx_subgroup_sz; + } + + inline size_t max_subgroup_size(const size_t block_size_x, + const size_t block_size_y) { + size_t block_size[2] { block_size_x, block_size_y }; + CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sizeof(block_size), (void *) &block_size, + sizeof(size_t), (void *) &_mx_subgroup_sz, + NULL)); + return _mx_subgroup_sz; + } + + inline size_t max_subgroup_size(const size_t block_size_x, + const size_t block_size_y, + const size_t block_size_z) { + size_t block_size[3] { block_size_x, block_size_y, block_size_z }; + CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sizeof(block_size), (void *) &block_size, + sizeof(size_t), (void *) &_mx_subgroup_sz, + NULL)); + return _mx_subgroup_sz; + } + #endif + private: cl_kernel _kernel; cl_program _program; + cl_device_id _device; cl_uint _dimensions; size_t _block_size[3]; size_t _num_blocks[3]; @@ -338,6 +416,11 @@ class UCL_Kernel { unsigned _kernel_info_nargs; //std::string _kernel_info_args[256]; #endif + + #ifdef CL_VERSION_2_1 + size_t _mx_subgroup_sz; // Maximum sub-group size for this kernel + #endif + }; inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) { @@ -347,6 +430,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) CL_SAFE_CALL(clRetainCommandQueue(_cq)); _program=program._program; CL_SAFE_CALL(clRetainProgram(_program)); + _device=program._device; cl_int error_flag; 
_kernel=clCreateKernel(program._program,function,&error_flag); @@ -380,8 +464,11 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) } void UCL_Kernel::run() { - CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,nullptr, - _num_blocks,_block_size,0,nullptr,nullptr)); + CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL, + _num_blocks,_block_size,0,NULL,NULL)); + #ifdef GERYON_OCL_FLUSH + ucl_flush(_cq); + #endif } } // namespace diff --git a/lib/gpu/geryon/ocl_macros.h b/lib/gpu/geryon/ocl_macros.h index aeff689859..5fb7665817 100644 --- a/lib/gpu/geryon/ocl_macros.h +++ b/lib/gpu/geryon/ocl_macros.h @@ -4,14 +4,6 @@ #include #include -/* We default to OpenCL 1.2 as target version for now as - * there are known issues with OpenCL 2.0 and later. - * This is also to silence warnings from generic OpenCL headers */ - -#if !defined(CL_TARGET_OPENCL_VERSION) -#define CL_TARGET_OPENCL_VERSION 120 -#endif - #ifdef __APPLE__ #include #else diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index 740020ab18..8937d4145a 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -106,9 +106,9 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, mat.cbegin()=clCreateBuffer(context,buffer_perm,n,nullptr,&error_flag); if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, - map_perm,0,n,0,nullptr,nullptr,nullptr); + *mat.host_ptr() = (typename mat_type::data_type*) + clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, + map_perm,0,n,0,NULL,NULL,NULL); mat.cq()=cm.cq(); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -116,18 +116,15 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, } template -inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { +inline int _host_view(mat_type &mat, copy_type &cm, const size_t o, + const size_t n) { cl_int error_flag; - cl_context context; - CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), - &context,nullptr)); - cl_mem_flags orig_flags; - CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags), - &orig_flags,nullptr)); - orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR; - - mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n, - *mat.host_ptr(), &error_flag); + cl_buffer_region subbuffer; + subbuffer.origin = o; + subbuffer.size = n; + mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0, + CL_BUFFER_CREATE_TYPE_REGION, &subbuffer, + &error_flag); CL_CHECK_ERR(error_flag); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -470,6 +467,9 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { size_t kn=n/sizeof(typename mat_type::data_type); CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0)); #endif + #ifdef GERYON_OCL_FLUSH + ucl_flush(cq); + #endif } // -------------------------------------------------------------------------- @@ -585,7 +585,10 @@ template <> struct _ucl_memcpy<1,0> { std::cerr << "UCL_COPY 1NS\n"; #endif CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n, - dst.begin(),0,nullptr,nullptr)); + dst.begin(),0,NULL,NULL)); + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -617,6 +620,9 @@ template <> struct _ucl_memcpy<1,0> { src_offset+=spitch; dst_offset+=dpitch; } + #ifdef GERYON_OCL_FLUSH + if 
(block==CL_FALSE) ucl_flush(cq); + #endif } }; @@ -637,7 +643,10 @@ template <> struct _ucl_memcpy<0,1> { std::cerr << "UCL_COPY 3NS\n"; #endif CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n, - src.begin(),0,nullptr,nullptr)); + src.begin(),0,NULL,NULL)); + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -669,6 +678,9 @@ template <> struct _ucl_memcpy<0,1> { src_offset+=spitch; dst_offset+=dpitch; } + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } }; @@ -690,6 +702,9 @@ template struct _ucl_memcpy { #endif if (block==CL_TRUE) ucl_sync(cq); + #ifdef GERYON_OCL_FLUSH + else ucl_flush(cq); + #endif } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -720,6 +735,9 @@ template struct _ucl_memcpy { #endif if (block==CL_TRUE) ucl_sync(cq); + #ifdef GERYON_OCL_FLUSH + else ucl_flush(cq); + #endif } }; diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 0e60045f55..43de4b258c 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -53,6 +53,59 @@ class UCL_Texture { friend class UCL_Kernel; }; +/// Class storing a const global memory reference +class UCL_Const { + public: + UCL_Const() : _global_bytes(0), _active(false) {} + ~UCL_Const() { clear(); } + /// Construct with a specified global reference + inline UCL_Const(UCL_Program &prog, const char *global_name) + { get_global(prog,global_name); } + /// Set the global reference for this object + inline void get_global(UCL_Program &prog, const char *global_name) { + if (_active) { + CL_DESTRUCT_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); + } + _active = true; + _context = prog._context; + _cq = prog._cq; + CL_SAFE_CALL(clRetainContext(_context)); + CL_SAFE_CALL(clRetainCommandQueue(_cq)); + } + /// Copy from array on host to const memory + template + inline void update_device(UCL_H_Vec &src, const int numel) { + const int bytes=numel*sizeof(numtyp); + if (_global_bytes < bytes) { + if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global)); + cl_int e; + _global = clCreateBuffer(_context, CL_MEM_READ_ONLY, bytes, NULL, &e); + CL_SAFE_CALL(e); + } + CL_SAFE_CALL(clEnqueueWriteBuffer(_cq, _global, CL_FALSE, 0, bytes, + (void *)src.begin(), 0, NULL, NULL)); + } + /// Get device ptr associated with object + inline const cl_mem * begin() const { return &_global; } + inline void clear() { + if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global)); + if (_active) { + CL_DESTRUCT_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); + } + _global_bytes=0; + _active=false; + } + + private: + cl_mem _global; + size_t _global_bytes; + cl_context _context; + cl_command_queue _cq; + bool _active; +}; + } // namespace #endif diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 8e8ffa929e..ca74312d51 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -61,7 +61,6 @@ class UCL_Timer { /// Initialize command queue for timing inline void init(UCL_Device &dev, command_queue &cq) { clear(); - t_factor=dev.timer_resolution()/1000000000.0; _cq=cq; clRetainCommandQueue(_cq); _initialized=true; @@ -124,17 +123,17 @@ class UCL_Timer { clReleaseEvent(start_event); clReleaseEvent(stop_event); has_measured_time = false; - return (tend-tstart)*t_factor; + return (tend-tstart)*1e-6; } /// Return the time (s) of last start to stop - 
Forces synchronization - inline double seconds() { return time()/1000.0; } + inline double seconds() { return time()*1e-3; } /// Return the total time in ms inline double total_time() { return _total_time; } /// Return the total time in seconds - inline double total_seconds() { return _total_time/1000.0; } + inline double total_seconds() { return _total_time*1e-3; } private: cl_event start_event, stop_event; diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h index 07e23aebe7..51fd33d623 100644 --- a/lib/gpu/geryon/ucl_basemat.h +++ b/lib/gpu/geryon/ucl_basemat.h @@ -69,17 +69,17 @@ class UCL_BaseMat { /// Return the type/permissions of memory allocation /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED * or UCL_VIEW **/ + /// Assert that any ops in associate command queue have been issued to device + inline void flush() { ucl_flush(_cq); } + inline enum UCL_MEMOPT kind() const { return _kind; } inline bool shared_mem_device() { #ifdef _OCL_MAT cl_device_id device; CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE, - sizeof(cl_device_id),&device,nullptr)); - cl_device_type device_type; - CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, - sizeof(device_type),&device_type,nullptr)); - return _shared_mem_device(device_type); + sizeof(cl_device_id),&device,NULL)); + return _shared_mem_device(device); #else return false; #endif diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index cd2a90fe2d..e791f18f29 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -39,7 +39,7 @@ class UCL_D_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_D_Vec() : _cols(0) {} + UCL_D_Vec() : _cols(0), _row_bytes(0) {} ~UCL_D_Vec() { _device_free(*this); } /// Construct with n columns diff --git a/lib/gpu/geryon/ucl_get_devices.cpp b/lib/gpu/geryon/ucl_get_devices.cpp index b8dfc6f7b1..5654bb40bd 100644 --- a/lib/gpu/geryon/ucl_get_devices.cpp +++ b/lib/gpu/geryon/ucl_get_devices.cpp @@ -44,10 +44,8 @@ using namespace ucl_hip; int main(int argc, char** argv) { UCL_Device cop; std::cout << "Found " << cop.num_platforms() << " platform(s).\n"; - if (cop.num_platforms()>0) { - std::cout << "Using platform: " << cop.platform_name() << std::endl; + if (cop.num_platforms()>0) cop.print_all(std::cout); - } return 0; } diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index 1df3c2de4b..41dad2b285 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -241,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat { _array=input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_view(*this,input,_row_bytes*_rows); + _host_view(*this,input,offset*sizeof(numtyp),_row_bytes*_rows); #endif } diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index a9d64349d9..5de0c312b0 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -39,7 +39,7 @@ class UCL_H_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Vec() : _cols(0) { + UCL_H_Vec() : _cols(0), _row_bytes(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif @@ -135,7 +135,7 @@ class UCL_H_Vec : public UCL_BaseMat { _cols=cols; _row_bytes=_cols*sizeof(numtyp); this->_cq=input.cq(); - _array=input.begin(); + _array=(numtyp *)input.begin(); _end=_array+_cols; #ifdef _OCL_MAT _carray=input.cbegin(); @@ -240,10 +240,10 @@ class UCL_H_Vec : public UCL_BaseMat { _cols=cols; _row_bytes=_cols*sizeof(numtyp); this->_cq=input.cq(); - _array=input.begin()+offset; + _array=(numtyp 
*)input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_view(*this,input,_row_bytes); + _host_view(*this,input,offset*sizeof(numtyp),_row_bytes); #endif } diff --git a/lib/gpu/geryon/ucl_vector.h b/lib/gpu/geryon/ucl_vector.h index 7fe2604de6..c03fd31fce 100644 --- a/lib/gpu/geryon/ucl_vector.h +++ b/lib/gpu/geryon/ucl_vector.h @@ -162,7 +162,9 @@ class UCL_Vector { inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); } /// Block until command_queue associated with matrix is complete inline void sync() { host.sync(); } - + /// Assert that any ops in associate command queue have been issued to device + inline void flush() { ucl_flush(host.cq()); } + ///Get the size of a row on the host (including any padding) in elements inline size_t row_size() const { return host.row_size(); } /// Get the size of a row on the host(including any padding) in bytes diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index 803b781286..e2478a64e5 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -14,6 +14,9 @@ ***************************************************************************/ #include "lal_answer.h" +#if (LAL_USE_OMP == 1) +#include +#endif namespace LAMMPS_AL { #define AnswerT Answer @@ -56,7 +59,7 @@ bool AnswerT::alloc(const int inum) { template bool AnswerT::init(const int inum, const bool charge, const bool rot, - UCL_Device &devi) { + UCL_Device &devi) { clear(); bool success=true; @@ -81,6 +84,10 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot, _time_cast=0.0; _time_cpu_idle=0.0; + success=success && (error_flag.alloc(1,*dev,UCL_READ_WRITE, + UCL_WRITE_ONLY)==UCL_SUCCESS); + if (success) error_flag.zero(); + return success && alloc(ef_inum); } @@ -111,6 +118,7 @@ bool AnswerT::add_fields(const bool charge, const bool rot) { template void AnswerT::clear() { _gpu_bytes=0; + error_flag.clear(); if (!_allocated) return; _allocated=false; @@ -138,12 +146,21 @@ double AnswerT::host_memory_usage() const { template void AnswerT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom) { + const bool ef_atom, const bool vf_atom, + const int red_blocks) { time_answer.start(); _eflag=eflag; _vflag=vflag; _ef_atom=ef_atom; _vf_atom=vf_atom; + #ifdef LAL_NO_BLOCK_REDUCE + _ev_stride=_inum; + #else + if (ef_atom || vf_atom) + _ev_stride=_inum; + else + _ev_stride=red_blocks; + #endif int csize=_ev_fields; if (!eflag) @@ -152,20 +169,24 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag, csize-=6; if (csize>0) - engv.update_host(_inum*csize,true); + engv.update_host(_ev_stride*csize,true); if (_rot) force.update_host(_inum*4*2,true); else force.update_host(_inum*4,true); time_answer.stop(); + + #ifndef GERYON_OCL_FLUSH + force.flush(); + #endif } template void AnswerT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom, - int *ilist) { + const bool ef_atom, const bool vf_atom, + int *ilist, const int red_blocks) { _ilist=ilist; - copy_answers(eflag,vflag,ef_atom,vf_atom); + copy_answers(eflag,vflag,ef_atom,vf_atom,red_blocks); } template @@ -177,21 +198,24 @@ double AnswerT::energy_virial(double *eatom, double **vatom, double evdwl=0.0; int vstart=0; if (_eflag) { - for (int i=0; i<_inum; i++) + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd reduction(+:evdwl) + #endif + for (int i=0; i<_ev_stride; i++) evdwl+=engv[i]; if (_ef_atom) { if (_ilist==nullptr) { - for (int i=0; i<_inum; i++) + for (int i=0; i<_ev_stride; i++) eatom[i]+=engv[i]; } 
else { - for (int i=0; i<_inum; i++) + for (int i=0; i<_ev_stride; i++) eatom[_ilist[i]]+=engv[i]; } } - vstart=_inum; + vstart=_ev_stride; } if (_vflag) { - int iend=vstart+_inum; + int iend=vstart+_ev_stride; for (int j=0; j<6; j++) { for (int i=vstart; i void AnswerT::get_answers(double **f, double **tor) { - int fl=0; if (_ilist==nullptr) { - for (int i=0; i<_inum; i++) { - f[i][0]+=force[fl]; - f[i][1]+=force[fl+1]; - f[i][2]+=force[fl+2]; - fl+=4; - } - if (_rot) { - for (int i=0; i<_inum; i++) { - tor[i][0]+=force[fl]; - tor[i][1]+=force[fl+1]; - tor[i][2]+=force[fl+2]; - fl+=4; + typedef struct { double x,y,z; } vec3d; + typedef struct { acctyp x,y,z,w; } vec4d_t; + vec3d *fp=reinterpret_cast(&(f[0][0])); + vec4d_t *forcep=reinterpret_cast(&(force[0])); + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = omp_get_num_threads(); + const int tid = omp_get_thread_num(); + const int idelta = _inum / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = std::min(ifrom + idelta, _inum); + #else + const int tid = 0; + const int ifrom = 0; + const int ito = _inum; + #endif + + for (int i=ifrom; i(&(tor[0][0])); + forcep=reinterpret_cast(&(force[_inum*4])); + for (int i=ifrom; i force; /// Energy and virial per-atom storage UCL_Vector engv; + /// Error flag + UCL_Vector error_flag; /// Device timers UCL_Timer time_answer; @@ -162,7 +166,7 @@ class Answer { bool alloc(const int inum); bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; - int _max_local, _inum, _e_fields, _ev_fields, _ans_fields; + int _max_local, _inum, _e_fields, _ev_fields, _ans_fields, _ev_stride; int *_ilist; double _time_cast, _time_cpu_idle; diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 7ce3e3e7ff..cda4d383b5 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -414,9 +414,9 @@ const char *atom=0; template void AtomT::compile_kernels(UCL_Device &dev) { - std::string flags = "-D"+std::string(OCL_VENDOR); + std::string flags = ""; atom_program=new UCL_Program(dev); - atom_program->load_string(atom,flags); + atom_program->load_string(atom,flags,nullptr,screen); k_cast_x.set_function(*atom_program,"kernel_cast_x"); _compiled=true; } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index e39740d6c8..3cf97d94a0 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -24,6 +24,9 @@ #include "geryon/ocl_mat.h" #include "geryon/ocl_kernel.h" using namespace ucl_opencl; +#ifndef LAL_NO_OCL_EV_JIT +#define LAL_OCL_EV_JIT +#endif #elif defined(USE_CUDART) #include "geryon/nvc_timer.h" #include "geryon/nvc_mat.h" @@ -178,7 +181,7 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); ucl_copy(dev_v,view,false); } @@ -197,7 +200,26 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom type constants into 2 vectors and copy to device + template + inline void type_pack2(const int n, UCL_D_Vec &dev_v, + UCL_H_Vec &buffer, t1 ***one, t2 ***two) { + int ii=0; + for (int i=0; i(one[i][j][k]); + buffer[ii*2+1]=static_cast(two[i][j][k]); + ii++; + } + } + } + UCL_H_Vec view; + view.view_offset(0,buffer,n*n*n); ucl_copy(dev_v,view,false); } @@ -217,7 +239,7 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + 
view.view_offset(0,buffer,m_size*m_size); ucl_copy(dev_v,view,false); } @@ -238,7 +260,7 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); ucl_copy(dev_v,view,false); } @@ -251,7 +273,7 @@ class Atom { buffer[i*2+1]=static_cast(two[i][i]); } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),n,*dev); + view.view_offset(0,buffer,n); ucl_copy(dev_v,view,false); } @@ -261,6 +283,9 @@ class Atom { inline void data_unavail() { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } + typedef struct { double x,y,z; } vec3d; + typedef struct { numtyp x,y,z,w; } vec4d_t; + /// Cast positions and types to write buffer inline void cast_x_data(double **host_ptr, const int *host_type) { if (_x_avail==false) { @@ -269,13 +294,16 @@ class Atom { memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); #else - int wl=0; + vec3d *host_p=reinterpret_cast(&(host_ptr[0][0])); + vec4d_t *xp=reinterpret_cast(&(x[0])); + #if (LAL_USE_OMP == 1) + #pragma omp parallel for schedule(static) + #endif for (int i=0; i<_nall; i++) { - x[wl]=host_ptr[i][0]; - x[wl+1]=host_ptr[i][1]; - x[wl+2]=host_ptr[i][2]; - x[wl+3]=host_type[i]; - wl+=4; + xp[i].x=host_p[i].x; + xp[i].y=host_p[i].y; + xp[i].z=host_p[i].z; + xp[i].w=host_type[i]; } #endif _time_cast+=MPI_Wtime()-t; @@ -320,6 +348,11 @@ class Atom { } else if (sizeof(numtyp)==sizeof(double)) memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp)); else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif for (int i=0; i<_nall; i++) q[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } @@ -346,6 +379,11 @@ class Atom { } else if (sizeof(numtyp)==sizeof(double)) memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp)); else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } @@ -370,13 +408,16 @@ class Atom { memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int)); #else - int wl=0; + vec3d *host_p=reinterpret_cast(&(host_ptr[0][0])); + vec4d_t *vp=reinterpret_cast(&(v[0])); + #if (LAL_USE_OMP == 1) + #pragma omp parallel for schedule(static) + #endif for (int i=0; i<_nall; i++) { - v[wl]=host_ptr[i][0]; - v[wl+1]=host_ptr[i][1]; - v[wl+2]=host_ptr[i][2]; - v[wl+3]=host_tag[i]; - wl+=4; + vp[i].x=host_p[i].x; + vp[i].y=host_p[i].y; + vp[i].z=host_p[i].z; + vp[i].w=host_tag[i]; } #endif _time_cast+=MPI_Wtime()-t; diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index 5b7150d950..be00abbcef 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -40,170 +40,521 @@ nbor_begin+=offset; \ } -#if (ARCH < 300) +#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ + i, numj, stride, nbor_end, nbor_begin) \ + i=nbor_mem[ii]; \ + nbor_begin=ii+nbor_stride; \ + numj=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_stride+ii*(t_per_atom-1); \ + stride=fast_mul(t_per_atom,nbor_stride); \ + nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & \ + (t_per_atom-1)); \ + nbor_begin+=offset; -#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ - if 
(t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ +#if (SHUFFLE_AVAIL == 0) + +#define simd_reduce_add1(width, local, offset, tid, one) \ + local[0][tid]=one; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) local[0][tid] += local[0][tid+s]; \ + } \ + if (offset==0) one=local[0][tid]; + +#define simd_reduce_add2(width, local, offset, tid, one, two) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ } \ } \ if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ + one=local[0][tid]; \ + two=local[1][tid]; \ + } + +#define simd_reduce_add3(width, local, offset, tid, one, two, three) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + local[2][tid]=three; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + local[2][tid] += local[2][tid+s]; \ } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ + } \ + if (offset==0) { \ + one=local[0][tid]; \ + two=local[1][tid]; \ + three=local[2][tid]; \ + } + +#define simd_reduce_add6(width, local, offset, tid, one, two, three, \ + four, five, six) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + local[2][tid]=three; \ + local[3][tid]=four; \ + local[4][tid]=five; \ + local[5][tid]=six; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + local[2][tid] += local[2][tid+s]; \ + local[3][tid] += local[3][tid+s]; \ + local[4][tid] += local[4][tid+s]; \ + local[5][tid] += local[5][tid+s]; \ + } \ + } \ + if (offset==0) { \ + one=local[0][tid]; \ + two=local[1][tid]; \ + three=local[2][tid]; \ + four=local[3][tid]; \ + five=local[4][tid]; \ + six=local[5][tid]; \ + } + +#define simd_reduce_arr(trip, width, local, offset, tid, arr) \ + for (int r=0; r0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; rwidth/2; s>>=1) { \ + __syncthreads(); \ + if (tid < s) local[0][tid] += local[0][tid+s]; \ + } \ + if (tid0; s>>=1) { \ + simdsync(); \ + if (tid < s) local[0][tid] += local[0][tid+s]; \ + } \ + if (tid==0) one=local[0][tid]; \ + } + +#define block_reduce_add2(width, local, tid, one, two) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \ + __syncthreads(); \ + if (tid < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + } \ + } \ + if (tid0; s>>=1) { \ + simdsync(); \ + if (tid < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + } \ + } \ + if (tid==0) { \ + 
one=local[0][tid]; \ + two=local[1][tid]; \ + } \ + } + +#define block_reduce_arr(trip, width, local, tid, arr) \ + for (int r=0; rwidth/2; s>>=1) { \ + __syncthreads(); \ + if (tid < s) { \ + for (int r=0; r0; s>>=1) { \ + simdsync(); \ + if (tid < s) { \ + for (int r=0; r1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - red_acc[4][tid]=e_coul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<5; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - e_coul=red_acc[4][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii0; s>>=1) one += shfl_down(one, s, width); + +#define simd_reduce_add2(width, one, two) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + one += shfl_down(one, s, width); \ + two += shfl_down(two, s, width); \ + } + +#define simd_reduce_add3(width, one, two, three) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + one += shfl_down(one, s, width); \ + two += shfl_down(two, s, width); \ + three += shfl_down(three, s, width); \ + } + +#define simd_reduce_add6(width, one, two, three, four, five, six) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + one += shfl_down(one, s, width); \ + two += shfl_down(two, s, width); \ + three += shfl_down(three, s, width); \ + four += shfl_down(four, s, width); \ + five += shfl_down(five, s, width); \ + six += shfl_down(six, s, width); \ + } + +#define simd_reduce_arr(trip, width, arr) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + for (int r=0; r1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - 
engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) \ + 
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii global_device; template -BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0) { +BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0), _onetype(0) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseAtomicT::~BaseAtomic() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ -49,7 +56,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name) { + const char *k_name, const int onetype) { screen=_screen; int gpu_nbor=0; @@ -64,28 +71,29 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); int success=device->init(*ans,false,false,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; atom=&device->atom; _block_size=device->pair_block_size(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name,onetype); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -102,8 +110,8 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, } template -void BaseAtomicT::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +void BaseAtomicT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); } template @@ -164,8 +172,8 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -177,13 +185,27 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum, // --------------------------------------------------------------------------- template void BaseAtomicT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success) { + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int 
**firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -207,8 +229,8 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -218,14 +240,28 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, // --------------------------------------------------------------------------- template int ** BaseAtomicT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -254,8 +290,8 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -270,19 +306,46 @@ double BaseAtomicT::host_memory_usage_atomic() const { template void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { - if (_compiled) + const char *kname, const int onetype) { + if (_compiled && _onetype==onetype) return; + _onetype=onetype; std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + if (pair_program_noev) 
delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseAtomic; diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h index c97f42c50e..701675390f 100644 --- a/lib/gpu/lal_base_atomic.h +++ b/lib/gpu/lal_base_atomic.h @@ -53,10 +53,11 @@ class BaseAtomic { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const void *pair_program, const char *k_name, + const int onetype=0); /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); + void estimate_gpu_overhead(const int add_kernels=0); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ @@ -100,7 +101,7 @@ class BaseAtomic { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -179,23 +180,31 @@ class BaseAtomic { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; protected: bool _compiled; - int _block_size, _threads_per_atom; + int _block_size, _threads_per_atom, _onetype; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k, + const int onetype); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index d5a6e06222..b0d08e4df7 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -27,6 +27,9 @@ BaseChargeT::BaseCharge() : _compiled(false), _max_bytes(0) { nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseChargeT::~BaseCharge() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ 
-64,21 +71,11 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_charge(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); int success=device->init(*ans,true,false,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; @@ -88,6 +85,17 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name); + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -104,8 +112,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, } template -void BaseChargeT::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +void BaseChargeT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); } template @@ -166,8 +174,8 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -179,14 +187,28 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, // --------------------------------------------------------------------------- template void BaseChargeT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -215,8 +237,8 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + 
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -226,15 +248,29 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, // --------------------------------------------------------------------------- template int** BaseChargeT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -269,8 +305,8 @@ int** BaseChargeT::compute(const int ago, const int inum_full, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -292,13 +328,37 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseCharge; diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index b6d3e9e3f8..6b8761092a 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -57,7 +57,7 @@ class BaseCharge { const void *pair_program, const char 
*k_name); /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); + void estimate_gpu_overhead(const int add_kernels=0); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ @@ -103,7 +103,7 @@ class BaseCharge { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -177,9 +177,15 @@ class BaseCharge { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -194,7 +200,7 @@ class BaseCharge { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 57773a3b80..9781065b13 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -27,6 +27,9 @@ BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) { nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseDipoleT::~BaseDipole() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ -65,30 +72,30 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_charge(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); int success=device->init(*ans,true,true,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; atom=&device->atom; _block_size=device->pair_block_size(); - _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name); + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -168,8 +175,8 @@ inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, 
nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -183,12 +190,26 @@ template void BaseDipoleT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, double **host_mu, const int nlocal, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -219,8 +240,8 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -232,14 +253,28 @@ template int** BaseDipoleT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -277,8 +312,8 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -300,14 +335,38 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); mu_tex.get_texture(*pair_program,"mu_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (pair_program_noev) delete pair_program_noev; + 
pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseDipole; diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h index 856b69b56b..f7cefd9066 100644 --- a/lib/gpu/lal_base_dipole.h +++ b/lib/gpu/lal_base_dipole.h @@ -102,7 +102,7 @@ class BaseDipole { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -176,9 +176,16 @@ class BaseDipole { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -187,14 +194,14 @@ class BaseDipole { protected: bool _compiled; - int _block_size, _block_bio_size, _threads_per_atom; + int _block_size, _threads_per_atom; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index e4fd80fcc3..4b6a964bfb 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -27,6 +27,9 @@ BaseDPDT::BaseDPD() : _compiled(false), _max_bytes(0) { nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseDPDT::~BaseDPD() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ -47,9 +54,9 @@ int BaseDPDT::bytes_per_atom_atomic(const int max_nbors) const { template int BaseDPDT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const void *pair_program, const char *k_name) { + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name, const int onetype) { screen=_screen; int gpu_nbor=0; @@ -63,31 +70,30 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; - _threads_per_atom=device->threads_per_charge(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - 
_nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); + _threads_per_atom=device->threads_per_atom(); int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; atom=&device->atom; _block_size=device->pair_block_size(); - _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name,onetype); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -167,8 +173,8 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -179,16 +185,30 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void BaseDPDT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, tagint *tag, double **host_v, - const double dtinvsqrt, const int seed, const int timestep, +void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag_in, + const bool vflag_in, const bool eatom, + const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -218,8 +238,8 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, _seed = seed; _timestep = timestep; - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -231,8 +251,8 @@ template int** BaseDPDT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - 
int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, + int **nspecial, tagint **special, const bool eflag_in, + const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, @@ -240,6 +260,20 @@ int** BaseDPDT::compute(const int ago, const int inum_full, const int seed, const int timestep, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -275,8 +309,8 @@ int** BaseDPDT::compute(const int ago, const int inum_full, _seed = seed; _timestep = timestep; - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -291,20 +325,48 @@ double BaseDPDT::host_memory_usage_atomic() const { template void BaseDPDT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { - if (_compiled) + const char *kname, const int onetype) { + if (_compiled && _onetype==onetype) return; + _onetype=onetype; + std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); vel_tex.get_texture(*pair_program,"vel_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseDPD; diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 5d1573c1a9..9eb56993af 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -52,7 +52,8 @@ class BaseDPD { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const void *pair_program, const char *k_name, + const int onetype=0); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -101,7 
+102,7 @@ class BaseDPD { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -177,9 +178,16 @@ class BaseDPD { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -191,13 +199,14 @@ class BaseDPD { protected: bool _compiled; - int _block_size, _block_bio_size, _threads_per_atom; + int _block_size, _threads_per_atom, _onetype; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *k, const int onetype); + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 524705ed41..87bfe14751 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -29,7 +29,8 @@ const char *ellipsoid_nbor=0; extern Device global_device; template -BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { +BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0), + host_olist_size(0) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -37,6 +38,10 @@ BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { ellipsoid_program=nullptr; lj_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + ellipsoid_program_noev=nullptr; + lj_program_noev=nullptr; + #endif } template @@ -53,6 +58,14 @@ BaseEllipsoidT::~BaseEllipsoid() { if (nbor_program) delete nbor_program; if (ellipsoid_program) delete ellipsoid_program; if (lj_program) delete lj_program; + #if defined(LAL_OCL_EV_JIT) + k_ellipsoid_noev.clear(); + k_ellipsoid_sphere_noev.clear(); + k_sphere_ellipsoid_noev.clear(); + k_lj_fast.clear(); + if (ellipsoid_program_noev) delete ellipsoid_program_noev; + if (lj_program_noev) delete lj_program_noev; + #endif } template @@ -89,11 +102,6 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,true,1); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; @@ -102,6 +110,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, _block_size=device->block_ellipse(); compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere); + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,true,1); + if (success!=0) + return success; + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -133,12 +146,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, if (_multiple_forms && gpu_nbor!=0) return -9; - if 
(_multiple_forms) + if (_multiple_forms) { ans->force.zero(); - - // Memory for ilist ordered by particle type - if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS) - return -3; + host_olist_size = nbor->max_atoms(); + host_olist = new int[nbor->max_atoms()]; + } _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); @@ -160,7 +172,10 @@ template void BaseEllipsoidT::clear_base() { // Output any timing information output_times(); - host_olist.clear(); + if (host_olist_size) { + host_olist_size = 0; + delete []host_olist; + } time_nbor1.clear(); time_ellipsoid.clear(); @@ -206,10 +221,14 @@ void BaseEllipsoidT::output_times() { MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); double max_mb=mpi_max_bytes/(1024*1024); - double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; + + #ifdef USE_OPENCL + // Workaround for timing issue on Intel OpenCL + if (times[3] > 80e6) times[3]=0.0; + #endif if (device->replica_me()==0) - if (screen && times[5]>0.0) { + if (screen && times[7]>0.0) { int replica_size=device->replica_size(); fprintf(screen,"\n\n-------------------------------------"); @@ -218,9 +237,8 @@ void BaseEllipsoidT::output_times() { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (device->procs_per_gpu()==1 && t_time>0) { + if (device->procs_per_gpu()==1 && times[3]>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); if (nbor->gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); @@ -229,13 +247,15 @@ void BaseEllipsoidT::output_times() { fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } - if (nbor->gpu_nbor()==2) - fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size); if (times[6]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Vector width: %d.\n", device->simd_size()); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + if (nbor->gpu_nbor()==2) + fprintf(screen,"CPU Neighbor: %.4f s.\n",times[9]/replica_size); + fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[5]/replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); fprintf(screen,"-------------------------------------"); @@ -256,11 +276,13 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, if (shared_types) { k_nbor_fast.set_size(GX,BX); k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start, - &inum, &nbor->dev_packed, &form_low, &form_high); + &inum, &nbor->dev_packed, &form_low, &form_high, + &_threads_per_atom); } else { k_nbor.set_size(GX,BX); k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride, - &start, &inum, &nbor->dev_packed, &form_low, &form_high); + &start, &inum, &nbor->dev_packed, &form_low, &form_high, + &_threads_per_atom); } } @@ -298,7 +320,7 @@ void BaseEllipsoidT::reset_nbors(const int nall, const int inum, p++; } } - nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size()); + nbor->get_host(inum,host_olist,numj,firstneigh,block_size()); nbor->copy_unpacked(inum,mn); return; } @@ -330,8 +352,8 @@ inline void 
BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); nbor->copy_unpacked(inum,mn); _last_ellipse=inum; _max_last_ellipse=inum; @@ -348,11 +370,18 @@ template int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; zero_timers(); @@ -373,7 +402,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, } int *list; if (_multiple_forms) - list=host_olist.begin(); + list=host_olist; else list=ilist; @@ -384,7 +413,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, atom->add_quat_data(); loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,list); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,list,inum); device->add_ans_object(ans); hd_balancer.stop_timer(); return list; @@ -394,15 +423,23 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, +int** BaseEllipsoidT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; zero_timers(); @@ -435,7 +472,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall *jnum=nbor->host_acc.begin(); loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -462,25 +499,26 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, std::string s_lj=kns+"_lj"; std::string s_lj_fast=kns+"_lj_fast"; - std::string flags=device->compile_string(); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; if (nbor_program) delete nbor_program; nbor_program=new UCL_Program(dev); - nbor_program->load_string(ellipsoid_nbor,flags.c_str()); + nbor_program->load_string(ellipsoid_nbor,oclstring.c_str(),nullptr,screen); k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast"); k_nbor.set_function(*nbor_program,"kernel_nbor"); neigh_tex.get_texture(*nbor_program,"pos_tex"); if (ellipsoid_program) delete ellipsoid_program; 
ellipsoid_program=new UCL_Program(dev); - ellipsoid_program->load_string(ellipsoid_string,flags.c_str()); + ellipsoid_program->load_string(ellipsoid_string,oclstring.c_str(), + nullptr,screen); k_ellipsoid.set_function(*ellipsoid_program,kname); pos_tex.get_texture(*ellipsoid_program,"pos_tex"); quat_tex.get_texture(*ellipsoid_program,"quat_tex"); if (lj_program) delete lj_program; lj_program=new UCL_Program(dev); - lj_program->load_string(lj_string,flags.c_str()); + lj_program->load_string(lj_string,oclstring.c_str(),nullptr,screen); k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str()); k_lj_fast.set_function(*lj_program,s_lj_fast.c_str()); k_lj.set_function(*lj_program,s_lj.c_str()); @@ -489,7 +527,52 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, lj_pos_tex.get_texture(*lj_program,"pos_tex"); lj_quat_tex.get_texture(*lj_program,"quat_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (ellipsoid_program_noev) delete ellipsoid_program_noev; + ellipsoid_program_noev=new UCL_Program(dev); + ellipsoid_program_noev->load_string(ellipsoid_string,oclstring.c_str(), + nullptr,screen); + k_ellipsoid_noev.set_function(*ellipsoid_program_noev,kname); + + if (lj_program_noev) delete lj_program_noev; + lj_program_noev=new UCL_Program(dev); + lj_program_noev->load_string(lj_string,oclstring.c_str(),nullptr,screen); + k_sphere_ellipsoid_noev.set_function(*lj_program_noev, + s_sphere_ellipsoid.c_str()); + k_lj_fast_noev.set_function(*lj_program_noev,s_lj_fast.c_str()); + if (e_s) + k_ellipsoid_sphere_noev.set_function(*lj_program_noev, + s_ellipsoid_sphere.c_str()); + #else + k_elps_sel = &k_ellipsoid; + k_elps_sphere_sel = &k_ellipsoid_sphere; + k_sphere_elps_sel = &k_sphere_ellipsoid; + k_lj_sel = &k_lj_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_lj_fast.max_subgroup_size(_block_size); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid.max_subgroup_size(_block_size)); + if (e_s) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere.max_subgroup_size(_block_size)); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_lj_fast_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid_noev.max_subgroup_size(_block_size)); + if (e_s) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseEllipsoid; diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index dc1e624a2f..f30a0062d2 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -88,10 +88,10 @@ class BaseEllipsoid { ans->resize(nlocal, success); if (_multiple_forms) ans->force.zero(); - if (olist_size>static_cast(host_olist.numel())) { - host_olist.clear(); - int new_size=static_cast(static_cast(olist_size)*1.10); - success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); + if (olist_size>host_olist_size) { + if (host_olist_size) delete []host_olist; + 
host_olist_size=static_cast(static_cast(olist_size)*1.10); + host_olist = new int[host_olist_size]; } nbor->resize(nlocal,host_inum,max_nbors,success); @@ -116,7 +116,7 @@ class BaseEllipsoid { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_nbor1.add_to_total(); time_ellipsoid.add_to_total(); if (_multiple_forms) { @@ -223,14 +223,40 @@ class BaseEllipsoid { /// Neighbor data Neighbor *nbor; /// ilist with particles sorted by type - UCL_H_Vec host_olist; + int *host_olist; + int host_olist_size; // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *nbor_program, *ellipsoid_program, *lj_program; + UCL_Program *ellipsoid_program_noev, *lj_program_noev; UCL_Kernel k_nbor_fast, k_nbor; UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid; UCL_Kernel k_lj_fast, k_lj; + UCL_Kernel k_ellipsoid_noev, k_ellipsoid_sphere_noev; + UCL_Kernel k_sphere_ellipsoid_noev, k_lj_fast_noev; + UCL_Kernel *k_elps_sel, *k_elps_sphere_sel, *k_sphere_elps_sel, *k_lj_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (_multiple_forms == false) { + if (eflag || vflag) k_elps_sel = &k_ellipsoid; + else k_elps_sel = &k_ellipsoid_noev; + } else { + if (eflag || vflag) { + k_elps_sel = &k_ellipsoid; + k_elps_sphere_sel = &k_ellipsoid_sphere; + k_sphere_elps_sel = &k_sphere_ellipsoid; + k_lj_sel = &k_lj_fast; + } else { + k_elps_sel = &k_ellipsoid_noev; + k_elps_sphere_sel = &k_ellipsoid_sphere_noev; + k_sphere_elps_sel = &k_sphere_ellipsoid_noev; + k_lj_sel = &k_lj_fast_noev; + } + } + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex; @@ -240,7 +266,6 @@ class BaseEllipsoid { int _block_size, _threads_per_atom; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; - UCL_D_Vec *_nbor_data; // True if we want to use fast GB-sphere or sphere-sphere calculations bool _multiple_forms; @@ -250,7 +275,7 @@ class BaseEllipsoid { void compile_kernels(UCL_Device &dev, const void *ellipsoid_string, const void *lj_string, const char *kname,const bool e_s); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index cfc138aea2..660385eb56 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -20,7 +20,7 @@ namespace LAMMPS_AL { extern Device global_device; template -BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { +BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _onetype(-1) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -29,6 +29,9 @@ BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { #endif pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -44,12 +47,18 @@ BaseThreeT::~BaseThree() { k_pair.clear(); k_short_nbor.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_three_center_noev.clear(); + k_three_end_noev.clear(); + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template int BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const { int b=device->atom.bytes_per_atom()+ans->bytes_per_atom()+ - nbor->bytes_per_atom(max_nbors); + 
nbor->bytes_per_atom(max_nbors); #ifdef THREE_CONCURRENT b+=ans2->bytes_per_atom(); #endif @@ -62,7 +71,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, const char *two, const char *three_center, - const char *three_end, const char *short_nbor) { + const char *three_end, const char *short_nbor, + const int onetype, const int onetype3, + const int spq, const int tpa_override) { screen=_screen; int gpu_nbor=0; @@ -77,24 +88,16 @@ int BaseThreeT::init_three(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; - _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1 - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else // neigh yes or tpa == 1 - _nbor_data=&(nbor->dev_nbor); - if (_threads_per_atom*_threads_per_atom>device->warp_size()) - return -10; + // Allow forcing threads per atom to 1 for tersoff due to subg sync issue + if (tpa_override) + _threads_per_atom=tpa_override; + else + _threads_per_atom=device->threads_per_three(); int success=device->init(*ans,false,false,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; @@ -110,7 +113,19 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _block_pair=device->pair_block_size(); _block_size=device->block_ellipse(); - compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor); + compile_kernels(*ucl_device,pair_program,two,three_center,three_end, + short_nbor,onetype,onetype3,spq); + + while (_threads_per_atom*_threads_per_atom>device->simd_size()) + _threads_per_atom = _threads_per_atom / 2; + + if (_threads_per_atom*_threads_per_atom>device->simd_size()) + return -10; + + success = device->init_nbor(nbor,nall,host_nlocal,nall,maxspecial, + _gpu_host,max_nbors,cell_size,true,1,true); + if (success!=0) + return success; // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -121,22 +136,21 @@ int BaseThreeT::init_three(const int nlocal, const int nall, pos_tex.bind_float(atom->x,4); + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); #ifdef THREE_CONCURRENT _max_an_bytes+=ans2->gpu_bytes(); #endif - int ef_nall=nall; - if (ef_nall==0) - ef_nall=2000; - dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); - return 0; } template -void BaseThreeT::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +void BaseThreeT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(4+add_kernels,_gpu_overhead,_driver_overhead); } template @@ -152,7 +166,6 @@ void BaseThreeT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); - dev_short_nbor.clear(); nbor->clear(); ans->clear(); #ifdef THREE_CONCURRENT @@ -186,6 +199,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist, // now the requirement is removed, allowing to work within pair hybrid nbor->get_host(nlist,ilist,numj,firstneigh,block_size()); + nbor->copy_unpacked(nlist,mn); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); #ifdef THREE_CONCURRENT @@ -201,24 +215,32 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const 
int nlist, // Build neighbor list on device // --------------------------------------------------------------------------- template -inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, - const int nall, double **host_x, - int *host_type, double *sublo, - double *subhi, tagint *tag, - int **nspecial, tagint **special, - bool &success) { +inline void BaseThreeT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + bool &success) { success=true; resize_atom(inum,nall,success); resize_local(nall,host_inum,nbor->max_nbors(),success); if (!success) - return 0; + return; atom->cast_copy_x(host_x,host_type); _nall = nall; + // Increase the effective sub-domain size for neighbors of ghosts + // This is still inefficient because we are calculating neighbors for more + // ghosts than necessary due to increased ghost cutoff + const double ncut=nbor->cutoff()*2.0; + for (int i=0; i<3; i++) sublo[i]-=ncut; + for (int i=0; i<3; i++) subhi[i]+=ncut; + int mn; - nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + nbor->copy_unpacked(nall,mn); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); #ifdef THREE_CONCURRENT @@ -226,7 +248,6 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, #endif if (bytes>_max_an_bytes) _max_an_bytes=bytes; - return mn; } // --------------------------------------------------------------------------- @@ -236,10 +257,24 @@ template void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, const int nlist, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -260,19 +295,12 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; - _max_nbors = nbor->max_nbor_loop(nlist,numj,ilist); } atom->cast_x_data(host_x,host_type); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); - // re-allocate dev_short_nbor if necessary - if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - dev_short_nbor.resize((2+_max_nbors)*_nmax); - } - // _ainum to be used in loop() for short neighbor list build _ainum = nlist; @@ -282,11 +310,11 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, #ifdef THREE_CONCURRENT ucl_device->sync(); #endif - loop(eflag,vflag,evatom); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag,evatom,success); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); #ifdef 
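// Editor's illustrative sketch (not part of the patch): the build_nbor_list()
// hunk above grows the local sub-domain bounds by twice the neighbor cutoff
// before the device neighbor build, so ghost atoms also receive neighbor
// lists. The same idea in isolation, with hypothetical names:
inline void expand_subdomain(double sublo[3], double subhi[3],
                             double neighbor_cutoff) {
  const double ncut = 2.0 * neighbor_cutoff;   // reach needed for ghosts of ghosts
  for (int i = 0; i < 3; i++) {
    sublo[i] -= ncut;
    subhi[i] += ncut;
  }
}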
THREE_CONCURRENT - ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); + ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans2); #endif hd_balancer.stop_timer(); @@ -296,15 +324,29 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int ** BaseThreeT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { +int ** BaseThreeT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag_in, + const bool vflag_in, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -323,7 +365,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return nullptr; @@ -336,12 +378,6 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); - // re-allocate dev_short_nbor if necessary - if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - dev_short_nbor.resize((2+_max_nbors)*_nmax); - } - // _ainum to be used in loop() for short neighbor list build _ainum = nall; @@ -351,11 +387,11 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, #ifdef THREE_CONCURRENT ucl_device->sync(); #endif - loop(eflag,vflag,evatom); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag,evatom,success); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); #ifdef THREE_CONCURRENT - ans2->copy_answers(eflag,vflag,eatom,vatom); + ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans2); #endif hd_balancer.stop_timer(); @@ -372,14 +408,24 @@ double BaseThreeT::host_memory_usage_atomic() const { template void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *two, const char *three_center, - const char *three_end, const char* short_nbor) { - if (_compiled) + const char *three_end, const char* short_nbor, + const int onetype, const int onetype3, + const int spq) { + if (_compiled && _onetype==onetype && _onetype3==onetype3 && _spq==spq) return; + _onetype=onetype; + _onetype3=onetype3; + _spq=spq; + std::string vatom_name=std::string(three_end)+"_vatom"; if (pair_program) delete pair_program; pair_program=new 
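// Editor's illustrative sketch (not part of the patch): the integer
// energy/virial flags that the refactored compute()/loop() path passes to the
// kernels, assuming the convention 0 = skip, 1 = global accumulation,
// 2 = per-atom accumulation; LAL_NO_BLOCK_REDUCE forces per-atom accumulation.
inline void encode_ev_flags(bool eflag_in, bool vflag_in, bool eatom,
                            bool vatom, int &eflag, int &vflag) {
  eflag = eatom ? 2 : (eflag_in ? 1 : 0);
  vflag = vatom ? 2 : (vflag_in ? 1 : 0);
#ifdef LAL_NO_BLOCK_REDUCE
  // without block-level reduction, any accumulation must be done per atom
  if (eflag) eflag = 2;
  if (vflag) vflag = 2;
#endif
}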
UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+ + " -DONETYPE3="+device->toa(_onetype3); + if (_spq) oclstring+=" -DSPQ="+device->toa(_spq); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_three_center.set_function(*pair_program,three_center); k_three_end.set_function(*pair_program,three_end); k_three_end_vatom.set_function(*pair_program,vatom_name.c_str()); @@ -387,12 +433,50 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, k_short_nbor.set_function(*pair_program,short_nbor); pos_tex.get_texture(*pair_program,"pos_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+ + " -DONETYPE3="+device->toa(_onetype3); + if (_spq) oclstring+=" -DSPQ="+device->toa(_spq); + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_three_center_noev.set_function(*pair_program_noev,three_center); + k_three_end_noev.set_function(*pair_program_noev,three_end); + k_pair_noev.set_function(*pair_program_noev,two); + #else + k_sel = &k_pair; + k_3center_sel = &k_three_center; + k_3end_sel = &k_three_end; + #endif + #ifdef THREE_CONCURRENT k_three_end.cq(ucl_device->cq(_end_command_queue)); k_three_end_vatom.cq(ucl_device->cq(_end_command_queue)); + #if defined(LAL_OCL_EV_JIT) + k_three_end_noev.cq(ucl_device->cq(_end_command_queue)); + #endif #endif _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair.max_subgroup_size(_block_size); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_vatom.max_subgroup_size(_block_size)); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseThree; diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h index 36129e6168..3e830d4217 100644 --- a/lib/gpu/lal_base_three.h +++ b/lib/gpu/lal_base_three.h @@ -59,10 +59,12 @@ class BaseThree { const double gpu_split, FILE *screen, const void *pair_program, const char *k_two, const char *k_three_center, const char *k_three_end, - const char *k_short_nbor=nullptr); + const char *k_short_nbor=nullptr, const int onetype=-1, + const int onetype3=-1, const int spq=0, + const int tpa_override=0); /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); + void estimate_gpu_overhead(const int add_kernels=0); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ @@ -109,7 +111,7 @@ class BaseThree { /// Accumulate timers inline void acc_timers() { if 
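// Editor's illustrative sketch (hypothetical types and names, not library
// code): the LAL_OCL_EV_JIT path above builds two otherwise identical device
// programs, one with -DEVFLAG=1 and one with -DEVFLAG=0, and set_kernel()
// picks which compiled variant to launch for the current step.
#include <string>

template <class Kernel>
struct EVKernelSet {
  Kernel with_ev;      // compiled with -DEVFLAG=1 (energy/virial code kept)
  Kernel without_ev;   // compiled with -DEVFLAG=0 (EV code removed at JIT time)
  Kernel *selected = nullptr;

  static std::string flags(const std::string &base, bool ev) {
    return base + (ev ? " -DEVFLAG=1" : " -DEVFLAG=0");
  }
  void select(int eflag, int vflag) {
    selected = (eflag || vflag) ? &with_ev : &without_ev;
  }
};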
(device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -134,9 +136,9 @@ class BaseThree { int *numj, int **firstneigh, bool &success); /// Build neighbor list on device - int build_nbor_list(const int inum, const int host_inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + void build_nbor_list(const int inum, const int host_inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success); /// Pair loop with host neighboring @@ -147,12 +149,12 @@ class BaseThree { int &host_start, const double cpu_time, bool &success); /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); + int ** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **numj, const double cpu_time, bool &success); // -------------------------- DEVICE DATA ------------------------- @@ -188,14 +190,29 @@ class BaseThree { /// Neighbor data Neighbor *nbor; - UCL_D_Vec dev_short_nbor; UCL_Kernel k_short_nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; + UCL_Program *pair_program, *pair_program_noev; UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom; + UCL_Kernel k_pair_noev, k_three_center_noev, k_three_end_noev; + UCL_Kernel *k_sel, *k_3center_sel, *k_3end_sel; inline int block_pair() { return _block_pair; } inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) { + k_sel = &k_pair; + k_3center_sel = &k_three_center; + k_3end_sel = &k_three_end; + } else { + k_sel = &k_pair_noev; + k_3center_sel = &k_three_center_noev; + k_3end_sel = &k_three_end_noev; + } + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -203,18 +220,19 @@ class BaseThree { protected: bool _compiled; int _block_pair, _block_size, _threads_per_atom, _end_command_queue; - int _gpu_nbor; + int _gpu_nbor, _onetype, _onetype3, _spq; double _max_bytes, _max_an_bytes; - int _max_nbors, _ainum, _nall; + int _ainum, _nall; double _gpu_overhead, _driver_overhead; - UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, const char *two, const char *three_center, - const char *three_end, const char* short_nbor); + const char *three_end, const char* short_nbor, + const int onetype, const int onetype3, + const int spq); - virtual void loop(const bool _eflag, const bool _vflag, - const int evatom) = 0; + virtual int loop(const int eflag, const int vflag, const int evatom, + bool &success) = 0; }; } diff --git a/lib/gpu/lal_beck.cpp b/lib/gpu/lal_beck.cpp index be1722c32c..57551d9787 100644 --- a/lib/gpu/lal_beck.cpp +++ b/lib/gpu/lal_beck.cpp @@ -113,20 +113,9 @@ double BeckT::host_memory_usage() const { // Calculate energies, 
forces, and torques // --------------------------------------------------------------------------- template -void BeckT::loop(const bool _eflag, const bool _vflag) { +int BeckT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &beck1, &beck2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &beck1, &beck2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -147,6 +136,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Beck; diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index f24132b9a2..a2a15e4d21 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -39,22 +39,25 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp term6 = pow(term1,(numtyp)-3); numtyp term1inv = ucl_recip(term1); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -116,9 +119,9 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_beck_fast(const __global numtyp4 *restrict x_, @@ -137,6 +140,9 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, __local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp term6 = pow(term1,(numtyp)-3); numtyp term1inv = ucl_recip(term1); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -218,8 +224,8 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, 
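// Editor's illustrative sketch (not part of the patch): the block/grid sizing
// used by the refactored loop() methods, which now return the grid size so
// copy_answers() knows how many per-block partial results to reduce.
#include <cmath>

inline int pair_grid_size(int inum, int block_size, int threads_per_atom) {
  return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                    (block_size / threads_per_atom)));
}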
- ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_beck.h b/lib/gpu/lal_beck.h index 638f1bf626..c6413ed766 100644 --- a/lib/gpu/lal_beck.h +++ b/lib/gpu/lal_beck.h @@ -72,7 +72,7 @@ class Beck : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp index dcba4e4f40..ab65237e27 100644 --- a/lib/gpu/lal_beck_ext.cpp +++ b/lib/gpu/lal_beck_ext.cpp @@ -55,7 +55,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, int init_ok=0; if (world_me==0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, - AA, BB, special_lj, inum, nall, 300, + AA, BB, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, } if (gpu_rank==i && world_me!=0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 4a6b789687..c4796b3450 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -138,20 +138,9 @@ double BornT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BornT::loop(const bool _eflag, const bool _vflag) { +int BornT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -159,8 +148,8 @@ void BornT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1,&coeff2, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1,&coeff2, &cutsq_sigma, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -176,6 +165,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Born; diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu index f9fea6d618..825175af8f 100644 --- a/lib/gpu/lal_born.cu +++ b/lib/gpu/lal_born.cu @@ -40,22 +40,25 @@ __kernel void k_born(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] 
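// Editor's illustrative sketch (plain C++ stand-in for the kernel code, not
// part of the patch): the accumulation pattern introduced above. EVFLAG is a
// compile-time constant, so the -DEVFLAG=0 build removes the energy/virial
// code entirely instead of testing eflag/vflag at run time.
#ifndef EVFLAG
#define EVFLAG 1
#endif

inline void accumulate_ev(double delx, double dely, double delz,
                          double force, double pair_energy,
                          int eflag, int vflag,
                          double &energy, double (&virial)[6]) {
  if (EVFLAG && eflag)
    energy += pair_energy;
  if (EVFLAG && vflag) {
    virial[0] += delx * delx * force;
    virial[1] += dely * dely * force;
    virial[2] += delz * delz * force;
    virial[3] += delx * dely * force;
    virial[4] += delx * delz * force;
    virial[5] += dely * delz * force;
  }
}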
+= delz*delz*force; @@ -108,9 +111,9 @@ __kernel void k_born(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_born_fast(const __global numtyp4 *restrict x_, @@ -130,27 +133,30 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -203,8 +209,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h index 2a7f355d69..3f5277b682 100644 --- a/lib/gpu/lal_born.h +++ b/lib/gpu/lal_born.h @@ -82,7 +82,7 @@ class Born : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp index 1b147395f6..8c7084f4a4 100644 --- a/lib/gpu/lal_born_coul_long.cpp +++ b/lib/gpu/lal_born_coul_long.cpp @@ -129,20 +129,9 @@ double BornCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { +int BornCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -150,8 +139,8 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, @@ -170,6 +159,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BornCoulLong; diff --git a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu index 14e644b45a..d38a101c30 100644 --- a/lib/gpu/lal_born_coul_long.cu +++ 
b/lib/gpu/lal_born_coul_long.cu @@ -48,6 +48,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -57,18 +60,18 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cutsq_sigma[mtype].y) { @@ -133,7 +136,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -144,9 +147,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, @@ -169,28 +172,31 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cutsq_sigma[mtype].y) { @@ -255,7 +261,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -266,8 +272,8 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_born_coul_long.h b/lib/gpu/lal_born_coul_long.h index e383d18e0c..a33b8f436a 100644 --- a/lib/gpu/lal_born_coul_long.h +++ b/lib/gpu/lal_born_coul_long.h @@ -80,7 +80,7 @@ class BornCoulLong : public BaseCharge { protected: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_born_coul_long_cs.cu b/lib/gpu/lal_born_coul_long_cs.cu index 6f04fcea94..077ec2f74f 100644 --- a/lib/gpu/lal_born_coul_long_cs.cu +++ b/lib/gpu/lal_born_coul_long_cs.cu @@ -63,6 +63,9 
@@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -72,18 +75,18 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -167,7 +170,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -178,9 +181,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, @@ -203,28 +206,31 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -308,7 +314,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -319,8 +325,8 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_born_coul_long_cs_ext.cpp b/lib/gpu/lal_born_coul_long_cs_ext.cpp index badc8b0808..fc6b89692f 100644 --- a/lib/gpu/lal_born_coul_long_cs_ext.cpp +++ b/lib/gpu/lal_born_coul_long_cs_ext.cpp @@ -60,7 +60,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, 
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -80,7 +80,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp index d0825529b1..9d17f2fa7d 100644 --- a/lib/gpu/lal_born_coul_long_ext.cpp +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -60,7 +60,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -80,7 +80,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index 1624dd9d50..e6caebbab8 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -131,20 +131,9 @@ double BornCoulWolfT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { +int BornCoulWolfT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -152,8 +141,8 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -171,6 +160,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BornCoulWolf; diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu index 0eeda48ec0..aefcac8127 100644 --- a/lib/gpu/lal_born_coul_wolf.cu +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -51,6 +51,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + 
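// Editor's illustrative sketch (hypothetical helper, not part of the patch):
// the *_ext.cpp wrappers in the hunks above stop hard-coding a neighbor bound
// of 300 and instead forward the max_nbors value supplied by the caller, so
// dense systems are not silently limited to 300 neighbors per atom. The
// fallback below is illustrative only.
inline int effective_max_nbors(int requested_max_nbors) {
  const int legacy_bound = 300;   // the old hard-coded value
  return requested_max_nbors > 0 ? requested_max_nbors : legacy_bound;
}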
sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -60,18 +63,18 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -137,7 +140,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -149,7 +152,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -160,9 +163,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, @@ -186,28 +189,31 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -273,7 +279,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -285,7 +291,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -296,8 +302,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_born_coul_wolf.h b/lib/gpu/lal_born_coul_wolf.h index fa53f48939..0aad07dfa5 100644 --- a/lib/gpu/lal_born_coul_wolf.h +++ b/lib/gpu/lal_born_coul_wolf.h @@ -81,7 +81,7 @@ class 
BornCoulWolf : public BaseCharge { protected: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_born_coul_wolf_cs.cu b/lib/gpu/lal_born_coul_wolf_cs.cu index b957b8be69..866d256f33 100644 --- a/lib/gpu/lal_born_coul_wolf_cs.cu +++ b/lib/gpu/lal_born_coul_wolf_cs.cu @@ -52,6 +52,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -61,18 +64,18 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -139,7 +142,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { acctyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -151,7 +154,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -162,9 +165,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, @@ -188,28 +191,31 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -276,7 +282,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { acctyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -288,7 +294,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += 
delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -299,8 +305,8 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_born_coul_wolf_cs_ext.cpp b/lib/gpu/lal_born_coul_wolf_cs_ext.cpp index e2211644af..ae162a7c52 100644 --- a/lib/gpu/lal_born_coul_wolf_cs_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_cs_ext.cpp @@ -60,7 +60,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); @@ -81,7 +81,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index d664f30212..bc38db1b9c 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -60,7 +60,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); @@ -81,7 +81,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 63991889d9..2321a1264d 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -58,7 +58,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BORNMF.device->world_barrier(); @@ -77,7 +77,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BORNMF.device->gpu_barrier(); diff --git 
a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index 5a335a1e51..01411775e1 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -130,20 +130,9 @@ double BuckT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BuckT::loop(const bool _eflag, const bool _vflag) { +int BuckT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,8 +140,8 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -165,6 +154,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Buck; diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index 0f9044cefc..958c7bdd4d 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -39,22 +39,25 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -106,9 +109,9 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_buck_fast(const __global numtyp4 *restrict x_, @@ -127,27 +130,30 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { 
virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -199,8 +205,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h index 7a09fae5dd..5755dea230 100644 --- a/lib/gpu/lal_buck.h +++ b/lib/gpu/lal_buck.h @@ -77,7 +77,7 @@ class Buck : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp index 25607eae17..c3c70e6d4d 100644 --- a/lib/gpu/lal_buck_coul.cpp +++ b/lib/gpu/lal_buck_coul.cpp @@ -122,20 +122,9 @@ double BuckCoulT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BuckCoulT::loop(const bool _eflag, const bool _vflag) { +int BuckCoulT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -143,8 +132,8 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -158,6 +147,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BuckCoul; diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index 163c8e4362..2aaa9c9b3d 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -47,6 +47,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < cutsq[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -137,9 +140,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - 
vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, @@ -162,29 +165,32 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < cutsq[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -254,8 +260,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_buck_coul.h b/lib/gpu/lal_buck_coul.h index eebba78eb0..bd2afcf9d8 100644 --- a/lib/gpu/lal_buck_coul.h +++ b/lib/gpu/lal_buck_coul.h @@ -78,7 +78,7 @@ class BuckCoul : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index 2a089e2040..9cf8f9b00e 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -58,7 +58,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -78,7 +78,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } if (gpu_rank==i && world_me!=0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp index 1c0288c2d8..60205a2ad6 100644 --- a/lib/gpu/lal_buck_coul_long.cpp +++ b/lib/gpu/lal_buck_coul_long.cpp @@ -126,20 +126,9 @@ double BuckCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { +int BuckCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - 
eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +136,8 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -163,6 +152,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BuckCoulLong; diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index b1bbf67bc2..f5ce3a7d11 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -48,6 +48,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -57,18 +60,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < coeff1[mtype].w) { @@ -134,7 +137,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -145,9 +148,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, @@ -171,28 +174,31 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < coeff1[mtype].w) { @@ -258,7 +264,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, 
energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -269,8 +275,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_buck_coul_long.h b/lib/gpu/lal_buck_coul_long.h index e2d69475cf..fa978a70be 100644 --- a/lib/gpu/lal_buck_coul_long.h +++ b/lib/gpu/lal_buck_coul_long.h @@ -78,7 +78,7 @@ class BuckCoulLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index c7e1cd1e35..393ccc3feb 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -59,7 +59,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -78,7 +78,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } if (gpu_rank==i && world_me!=0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index cc8b77c0a9..738b33337d 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -56,7 +56,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->world_barrier(); @@ -74,7 +74,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } if (gpu_rank==i && world_me!=0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_charmm.cpp b/lib/gpu/lal_charmm.cpp new file mode 100644 index 0000000000..811a431cc7 --- /dev/null +++ b/lib/gpu/lal_charmm.cpp @@ -0,0 +1,166 @@ +/*************************************************************************** + charmm.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the charmm/coul pair style. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "charmm_cl.h" +#elif defined(USE_CUDART) +const char *charmm_long=0; +#else +#include "charmm_cubin.h" +#endif + +#include "lal_charmm.h" +#include +namespace LAMMPS_AL { +#define CHARMMT CHARMM + +extern Device device; + +template +CHARMMT::CHARMM() : BaseCharge(), + _allocated(false) { +} + +template +CHARMMT::~CHARMM() { + clear(); +} + +template +int CHARMMT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int CHARMMT::init(const int ntypes, double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, + FILE *_screen, double host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double cut_lj_innersq, + const double cut_coul_innersq, const double denom_lj, + const double denom_coul, double **epsilon, + double **sigma, const bool mix_arithmetic) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,charmm,"k_charmm"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_bio_shared_types=this->device->max_bio_shared_types(); + if (this->_block_bio_size>=64 && mix_arithmetic && + lj_types<=max_bio_shared_types) + shared_types=true; + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + int h_size=lj_types*lj_types; + if (h_size host_write(h_size*32,*(this->ucl_device), + UCL_WRITE_ONLY); + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_lj3,host_lj4); + + if (shared_types) { + ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma); + } + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_bothsq = host_cut_bothsq; + _cut_coulsq = host_cut_coulsq; + _cut_ljsq = host_cut_ljsq; + _cut_lj_innersq = cut_lj_innersq; + _cut_coul_innersq = cut_coul_innersq; + _qqrd2e=qqrd2e; + _denom_lj=denom_lj; + _denom_coul=denom_coul; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void CHARMMT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + ljd.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double CHARMMT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CHARMM); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int CHARMMT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int 
BX=this->_block_bio_size; + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj, + &this->nbor->dev_nbor, this->_nbor_data, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_coul_innersq, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &ljd, &sp_lj, + &this->nbor->dev_nbor, this->_nbor_data, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_coul_innersq, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class CHARMM; +} diff --git a/lib/gpu/lal_charmm.cu b/lib/gpu/lal_charmm.cu new file mode 100644 index 0000000000..42fb810796 --- /dev/null +++ b/lib/gpu/lal_charmm.cu @@ -0,0 +1,303 @@ +// ************************************************************************** +// charmm.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for acceleration of the charmm/coul pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_charmm(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict ljd, + const __global numtyp *restrict sp_lj, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp denom_lj, + const numtyp denom_coul, + const numtyp cut_bothsq, + const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const numtyp cut_coul_innersq, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_bio(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii cut_lj_innersq) { + switch1 = (cut_ljsq-rsq); + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)* + denom_lj; + switch1 *= switch1; + switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)* + denom_lj; + switch2 *= lj3-lj4; + force_lj = force_lj*switch1+switch2; + } + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp rinv = ucl_rsqrt(rsq); + fetch(forcecoul,j,q_tex); + forcecoul *= factor_coul * qqrd2e * qtmp * rinv; + if (rsq > cut_coul_innersq) { + numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + 
(numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) * + denom_coul; + forcecoul *= switch3; + } + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + e_coul += forcecoul; + if (rsq < cut_ljsq) { + numtyp e=lj3-lj4; + if (rsq > cut_lj_innersq) + e *= switch1; + energy+=factor_lj*e; + } + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); +} + +__kernel void k_charmm_fast(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict ljd_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp denom_lj, + const numtyp denom_coul, + const numtyp cut_bothsq, + const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const numtyp cut_coul_innersq, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; + __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_bio(); + + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid cut_lj_innersq) { + switch1 = (cut_ljsq-rsq); + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)* + denom_lj; + switch1 *= switch1; + switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)* + denom_lj; + switch2 *= lj3-lj4; + force_lj = force_lj*switch1+switch2; + } + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp rinv = ucl_rsqrt(rsq); + fetch(forcecoul,j,q_tex); + forcecoul *= factor_coul * qqrd2e * qtmp * rinv; + if (rsq > cut_coul_innersq) { + numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + (numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) * + denom_coul; + forcecoul *= switch3; + } + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + e_coul += forcecoul; + if (rsq < cut_ljsq) { + numtyp e=lj3-lj4; + if (rsq > cut_lj_innersq) + e *= switch1; + energy+=factor_lj*e; + } + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); +} diff --git a/lib/gpu/lal_charmm.h b/lib/gpu/lal_charmm.h new file mode 100644 index 0000000000..0793d7ca0f --- /dev/null +++ b/lib/gpu/lal_charmm.h @@ -0,0 +1,89 @@ +/*************************************************************************** + charmm.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the charmm/coul pair style. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_CHARMM_ +#define LAL_CHARMM_ + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template +class CHARMM : public BaseCharge { + public: + CHARMM(); + ~CHARMM(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double host_cut_bothsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double cut_lj_innersq, + const double cut_coul_innersq, const double denom_lj, + const double denom_coul, double **epsilon, double **sigma, + const bool mix_arithmetic); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// x = lj1, y = lj2, z = lj3, w = lj4 + UCL_D_Vec lj1; + /// x = epsilon, y = sigma + UCL_D_Vec ljd; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e, _denom_lj, _denom_coul; + + numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq; + numtyp _cut_coul_innersq; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_charmm_ext.cpp b/lib/gpu/lal_charmm_ext.cpp new file mode 100644 index 0000000000..bed2f21933 --- /dev/null +++ b/lib/gpu/lal_charmm_ext.cpp @@ -0,0 +1,137 @@ +/*************************************************************************** + charmm_long_ext.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to charmm/coul/long acceleration routines. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_charmm.h" + +using namespace std; +using namespace LAMMPS_AL; + +static CHARMM CRMMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double cut_lj_innersq, const double cut_coul_innersq, + const double denom_lj, const double denom_coul, + double **epsilon, double **sigma, + const bool mix_arithmetic) { + CRMMF.clear(); + gpu_mode=CRMMF.device->gpu_mode(); + double gpu_split=CRMMF.device->particle_split(); + int first_gpu=CRMMF.device->first_device(); + int last_gpu=CRMMF.device->last_device(); + int world_me=CRMMF.device->world_me(); + int gpu_rank=CRMMF.device->gpu_rank(); + int procs_per_gpu=CRMMF.device->procs_per_gpu(); + + CRMMF.device->init_message(screen,"lj/charmm/coul/charmm",first_gpu, + last_gpu); + + bool message=false; + if (CRMMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + host_special_coul, qqrd2e, cut_lj_innersq, cut_coul_innersq, + denom_lj, denom_coul, epsilon, sigma, mix_arithmetic); + + CRMMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CRMMF.estimate_gpu_overhead(); + + return init_ok; +} + +void crm_gpu_clear() { + CRMMF.clear(); +} + +int** crm_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void crm_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + 
CRMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double crm_gpu_bytes() { + return CRMMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index a78996a7d5..8008b1fbb3 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -131,20 +131,9 @@ double CHARMMLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { +int CHARMMLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->_block_bio_size; - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -152,8 +141,8 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -171,6 +160,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CHARMMLong; diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index 4e9802f368..77793d0e83 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -47,18 +47,21 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; + int n_stride; + local_allocate_store_bio(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cut_ljsq) { @@ -132,7 +135,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -143,9 +146,9 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, @@ -168,6 +171,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_bio(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cut_ljsq) { @@ -268,7 +274,7 @@ __kernel void k_charmm_long_fast(const __global 
numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -277,10 +283,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_charmm_long.h b/lib/gpu/lal_charmm_long.h index 5d9d9ea50b..69f1a0734a 100644 --- a/lib/gpu/lal_charmm_long.h +++ b/lib/gpu/lal_charmm_long.h @@ -79,7 +79,7 @@ class CHARMMLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_charmm_long_ext.cpp b/lib/gpu/lal_charmm_long_ext.cpp index 743b510825..13565f5682 100644 --- a/lib/gpu/lal_charmm_long_ext.cpp +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -60,7 +60,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, int init_ok=0; if (world_me==0) CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, cell_size, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,sigma,mix_arithmetic); @@ -80,7 +80,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon, diff --git a/lib/gpu/lal_colloid.cpp b/lib/gpu/lal_colloid.cpp index c441d50968..fec7a3ad5f 100644 --- a/lib/gpu/lal_colloid.cpp +++ b/lib/gpu/lal_colloid.cpp @@ -140,20 +140,9 @@ double ColloidT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void ColloidT::loop(const bool _eflag, const bool _vflag) { +int ColloidT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -161,8 +150,8 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &colloid1, &colloid2, &form, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -176,6 +165,7 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Colloid; diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index 4983142aa0..8a20f0c400 100644 --- 
a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -42,22 +42,25 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=(numtyp)0.0; if (form[mtype]==0) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -160,7 +163,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -171,9 +174,9 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, @@ -198,6 +201,9 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, __local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=(numtyp)0.0; if (form[mtype]==0) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -325,7 +331,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, } energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -336,8 +342,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_colloid.h b/lib/gpu/lal_colloid.h index 35426007d8..43f14cd354 100644 --- a/lib/gpu/lal_colloid.h +++ b/lib/gpu/lal_colloid.h @@ -81,7 +81,7 @@ class Colloid : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp index 961ad75925..dcfd1a6d34 100644 --- a/lib/gpu/lal_colloid_ext.cpp +++ b/lib/gpu/lal_colloid_ext.cpp @@ -60,7 +60,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, - host_sigma6, host_form, inum, nall, 300, + host_sigma6, host_form, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); 
COLLMF.device->world_barrier(); @@ -80,7 +80,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, - inum, nall, 300, maxspecial, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); COLLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_coul.cpp b/lib/gpu/lal_coul.cpp index 3e29215c91..df9eeae667 100644 --- a/lib/gpu/lal_coul.cpp +++ b/lib/gpu/lal_coul.cpp @@ -125,20 +125,9 @@ double CoulT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulT::loop(const bool _eflag, const bool _vflag) { +int CoulT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Coul; diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu index 03fc568c77..c4da81a3a2 100644 --- a/lib/gpu/lal_coul.cu +++ b/lib/gpu/lal_coul.cu @@ -46,22 +46,25 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[8]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -112,9 +115,9 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_fast(const __global numtyp4 *restrict x_, @@ -134,25 +137,28 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { e_coul += forcecoul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += 
dely*dely*force; virial[2] += delz*delz*force; @@ -203,8 +209,8 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_coul.h b/lib/gpu/lal_coul.h index 38472375fb..7298536dea 100644 --- a/lib/gpu/lal_coul.h +++ b/lib/gpu/lal_coul.h @@ -75,7 +75,7 @@ class Coul : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_debye.cpp b/lib/gpu/lal_coul_debye.cpp index 08ceb99300..1107708ca8 100644 --- a/lib/gpu/lal_coul_debye.cpp +++ b/lib/gpu/lal_coul_debye.cpp @@ -126,20 +126,9 @@ double CoulDebyeT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { +int CoulDebyeT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +136,8 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, @@ -162,6 +151,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CoulDebye; diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu index e7f0b97e23..ba922f04a6 100644 --- a/lib/gpu/lal_coul_debye.cu +++ b/lib/gpu/lal_coul_debye.cu @@ -47,22 +47,25 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -116,9 +119,9 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, @@ -140,6 +143,9 @@ 
__kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -213,8 +219,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_coul_debye.h b/lib/gpu/lal_coul_debye.h index 13e4c5b0c6..9054df1995 100644 --- a/lib/gpu/lal_coul_debye.h +++ b/lib/gpu/lal_coul_debye.h @@ -76,7 +76,7 @@ class CoulDebye : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_debye_ext.cpp b/lib/gpu/lal_coul_debye_ext.cpp index af54746def..516dca5df8 100644 --- a/lib/gpu/lal_coul_debye_ext.cpp +++ b/lib/gpu/lal_coul_debye_ext.cpp @@ -54,7 +54,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, 300, + init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->world_barrier(); @@ -71,7 +71,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, 300, + init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp index fe1fbfede7..1a56e84b52 100644 --- a/lib/gpu/lal_coul_dsf.cpp +++ b/lib/gpu/lal_coul_dsf.cpp @@ -110,20 +110,9 @@ double CoulDSFT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulDSFT::loop(const bool _eflag, const bool _vflag) { +int CoulDSFT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -131,8 +120,8 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -148,6 +137,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + 
return GX; } template class CoulDSF; diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu index 190fb5b7fd..5241cb5097 100644 --- a/lib/gpu/lal_coul_dsf.cu +++ b/lib/gpu/lal_coul_dsf.cu @@ -48,30 +48,33 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -111,11 +114,11 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -126,9 +129,9 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, @@ -147,30 +150,33 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -210,11 +216,11 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -225,8 +231,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_coul_dsf.h b/lib/gpu/lal_coul_dsf.h index 3d57898f81..a33e98f836 100644 --- a/lib/gpu/lal_coul_dsf.h +++ b/lib/gpu/lal_coul_dsf.h @@ -70,7 +70,7 @@ class CoulDSF : public BaseCharge { private: bool _allocated; numtyp _e_shift, _f_shift, _alpha, _cut_coulsq; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_dsf_ext.cpp 
b/lib/gpu/lal_coul_dsf_ext.cpp index 2d18f9f94d..e21c70ae4b 100644 --- a/lib/gpu/lal_coul_dsf_ext.cpp +++ b/lib/gpu/lal_coul_dsf_ext.cpp @@ -55,7 +55,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); @@ -73,7 +73,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); diff --git a/lib/gpu/lal_coul_ext.cpp b/lib/gpu/lal_coul_ext.cpp index 9779526d62..370c186123 100644 --- a/lib/gpu/lal_coul_ext.cpp +++ b/lib/gpu/lal_coul_ext.cpp @@ -54,7 +54,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, int init_ok=0; if (world_me==0) - init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, 300, + init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->world_barrier(); @@ -71,7 +71,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, 300, + init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp index 02097a2c61..36c1cd751f 100644 --- a/lib/gpu/lal_coul_long.cpp +++ b/lib/gpu/lal_coul_long.cpp @@ -116,20 +116,9 @@ double CoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulLongT::loop(const bool _eflag, const bool _vflag) { +int CoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -137,8 +126,8 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -153,6 +142,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CoulLong; diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 7adcdbbabc..f8a33e90a2 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -29,100 +29,6 @@ _texture( q_tex,int2); #define q_tex q_ #endif -#if (ARCH < 300) - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) 
\ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=e_coul; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - e_coul=red_acc[3][tid]; \ - \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#else - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#endif - __kernel void k_coul_long(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, @@ -140,22 +46,25 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp e_coul, virial[6]; + if (EVFLAG) { + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -211,9 +120,11 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, @@ -233,24 +144,27 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { 
+ if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -306,8 +220,10 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h index 0668e0fd02..a89b8e447c 100644 --- a/lib/gpu/lal_coul_long.h +++ b/lib/gpu/lal_coul_long.h @@ -74,7 +74,7 @@ class CoulLong : public BaseCharge { protected: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_long_cs.cu b/lib/gpu/lal_coul_long_cs.cu index 85c9d84bdb..dfbc771adc 100644 --- a/lib/gpu/lal_coul_long_cs.cu +++ b/lib/gpu/lal_coul_long_cs.cu @@ -43,100 +43,6 @@ _texture( q_tex,int2); #define EPS_EWALD (acctyp)(1.0e-6) #define EPS_EWALD_SQR (acctyp)(1.0e-12) -#if (ARCH < 300) - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=e_coul; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - e_coul=red_acc[3][tid]; \ - \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#else - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#endif - __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, @@ -154,22 +60,25 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp 
e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp e_coul, virial[6]; + if (EVFLAG) { + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -245,9 +154,11 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, @@ -267,24 +178,27 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -360,8 +274,9 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_coul_long_cs_ext.cpp b/lib/gpu/lal_coul_long_cs_ext.cpp index ae57eb2038..df92619f2f 100644 --- a/lib/gpu/lal_coul_long_cs_ext.cpp +++ b/lib/gpu/lal_coul_long_cs_ext.cpp @@ -54,7 +54,7 @@ int clcs_gpu_init(const int ntypes, double **host_scale, int init_ok=0; if (world_me==0) - init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -72,7 +72,7 @@ int clcs_gpu_init(const int ntypes, double **host_scale, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index 653b4be4f3..1d9dcfdeca 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -54,7 +54,7 @@ int cl_gpu_init(const int ntypes, double **host_scale, int init_ok=0; if (world_me==0) - init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -72,7 +72,7 @@ int cl_gpu_init(const int ntypes, double **host_scale, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, 
cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 911cdda383..5ba9185e6f 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -18,12 +18,18 @@ #include #include #include -#ifdef _OPENMP +#if (LAL_USE_OMP == 1) #include #endif #if defined(USE_OPENCL) #include "device_cl.h" + +#ifdef LAL_OCL_EXTRA_ARGS +#define LAL_DM_STRINGIFY(x) #x +#define LAL_PRE_STRINGIFY(x) LAL_DM_STRINGIFY(x) +#endif + #elif defined(USE_CUDART) const char *device=0; #else @@ -45,40 +51,48 @@ DeviceT::~Device() { } template -int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, +int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, + const int first_gpu_id, const int gpu_mode, const double p_split, const int nthreads, - const int t_per_atom, const double cell_size, - char *ocl_vendor, const int block_pair) { + const int t_per_atom, const double user_cell_size, + char *ocl_args, const int ocl_platform, + char *device_type_flags, const int block_pair) { _nthreads=nthreads; - #ifdef _OPENMP + #if (LAL_USE_OMP == 1) omp_set_num_threads(nthreads); #endif _threads_per_atom=t_per_atom; _threads_per_charge=t_per_atom; + _threads_per_three=t_per_atom; if (_device_init) return 0; _device_init=true; _comm_world=replica; //world; _comm_replica=replica; - _first_device=first_gpu; - _last_device=last_gpu; + int ndevices=ngpu; + _first_device=first_gpu_id; _gpu_mode=gpu_mode; _particle_split=p_split; - _cell_size=cell_size; + _user_cell_size=user_cell_size; _block_pair=block_pair; - // support selecting platform though "package device" keyword. - // "0:generic" will select platform 0 and tune for generic device - // "1:fermi" will select platform 1 and tune for Nvidia Fermi gpu - if (ocl_vendor) { - char *sep = nullptr; - if ((sep = strstr(ocl_vendor,":"))) { - *sep = '\0'; - _platform_id = atoi(ocl_vendor); - ocl_vendor = sep+1; - } - } + + // support selecting OpenCL platform id with "package platform" keyword + if (ocl_platform >= 0) + _platform_id = ocl_platform; + + gpu=new UCL_Device(); + + // ---------------------- OpenCL Compiler Args ------------------------- + std::string extra_args=""; + if (ocl_args) extra_args+=":"+std::string(ocl_args); + #ifdef LAL_OCL_EXTRA_ARGS + extra_args+=":" LAL_PRE_STRINGIFY(LAL_OCL_EXTRA_ARGS); + #endif + for (int i=0; i procs_per_node) + ndevices = procs_per_node; + + // --------------------- OCL Platform Selection ----------------------- + + // Setup OpenCL platform and parameters based on platform + // and device type specifications + std::string ocl_vstring=""; + if (device_type_flags != nullptr) ocl_vstring=device_type_flags; + + // Setup the OpenCL platform + // If multiple platforms and no user platform specified, + // try to match platform from config matching any user specified + // device type. Give preference to platforms with GPUs. + // Priority under these conditions to platform with device with + // highest compute unit count. 
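  // The selection priority spelled out above (prefer a device that matches the
  // requested accelerator type, then fall back to the raw compute-unit count)
  // can be read in isolation as a small helper. This is only an illustrative
  // sketch: DeviceInfo, ndev, and pick_best_device() are hypothetical names
  // and are not part of this patch or of the Geryon API; the real logic below
  // operates on the gpu->device_type(i) and gpu->cus(i) queries instead.
  //
  //   struct DeviceInfo { bool matches_requested_type; int compute_units; };
  //
  //   static int pick_best_device(const DeviceInfo *dev, int ndev) {
  //     int best = 0;
  //     bool best_matches = dev[0].matches_requested_type;
  //     int best_cus = dev[0].compute_units;
  //     for (int i = 1; i < ndev; i++) {
  //       // a non-matching device can never displace a matching one
  //       if (best_matches && !dev[i].matches_requested_type) continue;
  //       // the first matching device displaces any non-matching best
  //       if (!best_matches && dev[i].matches_requested_type) {
  //         best = i; best_matches = true; best_cus = dev[i].compute_units;
  //         continue;
  //       }
  //       // otherwise prefer the device with the most compute units
  //       if (dev[i].compute_units > best_cus) {
  //         best = i; best_cus = dev[i].compute_units;
  //       }
  //     }
  //     return best;
  //   }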
+ int pres; + enum UCL_DEVICE_TYPE type=UCL_GPU; + #ifndef USE_OPENCL + pres=gpu->set_platform(0); + #else + if (_platform_id>=0) + pres=gpu->set_platform(_platform_id); + else { + std::string vendor=""; + if (device_type_flags!=nullptr) { + if (ocl_vstring=="intelgpu") + vendor="intel"; + else if (ocl_vstring=="intelcpu") { + vendor="intel"; + type=UCL_CPU; + } else if (ocl_vstring=="nvidiagpu") + vendor="nvidia"; + else if (ocl_vstring=="amdgpu") + vendor="amd"; + else if (ocl_vstring=="applegpu") + vendor="apple"; + } + pres=gpu->auto_set_platform(type,vendor,ndevices,_first_device); + } + #endif + if (pres != UCL_SUCCESS) + return -12; + + // ------------------------ Device Selection --------------------------- + if (_first_device > -1 && _first_device >= gpu->num_devices()) + return -2; + if (ndevices > gpu->num_devices()) + return -2; + if (_first_device + ndevices > gpu->num_devices()) + return -2; + if (gpu->num_devices()==0) + return -2; + + // Fully specified deviceIDs + if (_first_device > -1 && ndevices > 0) + _last_device = _first_device + ndevices - 1; + + // Find deviceID with most CUs (priority given to the accelerator type) + if (_first_device < 0) { + int best_device = 0; + int best_cus = gpu->cus(0); + bool type_match = (gpu->device_type(0) == type); + for (int i = 1; i < gpu->num_devices(); i++) { + if (type_match==true && gpu->device_type(i)!=type) + continue; + if (type_match == false && gpu->device_type(i) == type) { + type_match = true; + best_cus = gpu->cus(i); + best_device = i; + } + if (gpu->cus(i) > best_cus) { + best_cus = gpu->cus(i); + best_device = i; + } + } + _first_device = _last_device = best_device; + type = gpu->device_type(_first_device); + + if (ndevices > 0) { + // Expand range to meet specified number of devices + while (_last_device - _first_device < ndevices - 1) { + if (_last_device + 1 == gpu->num_devices()) + _first_device--; + else if (_first_device == 0) + _last_device++; + else { + if (gpu->device_type(_last_device+1)==type && + gpu->device_type(_first_device-1)!=type) + _last_device++; + else if (gpu->device_type(_last_device+1)!=type && + gpu->device_type(_first_device-1)==type) + _first_device--; + else if (gpu->cus(_last_device+1) > gpu->cus(_first_device-1)) + _last_device++; + else + _first_device--; + } + } + } + } + + // If ngpus not specified, expand range to include matching devices + if (ndevices == 0) { + for (int i = _first_device; i < gpu->num_devices(); i++) { + if (gpu->device_type(i)==gpu->device_type(_first_device) && + gpu->cus(i)==gpu->cus(_first_device)) + _last_device = i; + else + break; + } + ndevices = _last_device - _first_device + 1; + if (ndevices > procs_per_node) { + ndevices = procs_per_node; + _last_device=_first_device + ndevices - 1; + } + } + + // ------------------------ MPI Device ID Setup ----------------------- + // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ - (last_gpu-first_gpu+1))); - int my_gpu=node_rank/_procs_per_gpu+first_gpu; + ndevices)); + int my_gpu=node_rank/_procs_per_gpu+_first_device; // Time on the device only if 1 proc per gpu _time_device=true; @@ -146,27 +282,51 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); - gpu=new UCL_Device(); - if (my_gpu>=gpu->num_devices()) - return -2; - - #ifndef CUDA_PROXY + #if !defined(CUDA_PROXY) && !defined(CUDA_MPS_SUPPORT) if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; 
#endif - if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS) - return -12; + // --------------- Device Configuration and Setup ------------------------- if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; - gpu->push_command_queue(); - gpu->set_command_queue(1); + #if !defined(USE_OPENCL) && !defined(USE_HIP) + if (gpu->arch()<7.0) { + gpu->push_command_queue(); + gpu->set_command_queue(1); + } + #endif _long_range_precompute=0; - if (set_ocl_params(ocl_vendor)!=0) + // If OpenCL parameters not specified by user, try to auto detect + // best option from the platform config + #ifdef USE_OPENCL + if (device_type_flags==nullptr) { + std::string pname = gpu->platform_name(); + for (int i=0; i='a') + pname[i]=toupper(pname[i]); + if (pname.find("NVIDIA")!=std::string::npos) + ocl_vstring="nvidiagpu"; + else if (pname.find("INTEL")!=std::string::npos) { + if (gpu->device_type()==UCL_GPU) + ocl_vstring="intelgpu"; + else if (gpu->device_type()==UCL_CPU) + ocl_vstring="intelcpu"; + } else if (pname.find("AMD")!=std::string::npos) { + if (gpu->device_type()==UCL_GPU) + ocl_vstring="amdgpu"; + } else if (pname.find("APPLE")!=std::string::npos) { + if (gpu->device_type()==UCL_GPU) + ocl_vstring="applegpu"; + } + } + #endif + + if (set_ocl_params(ocl_vstring, extra_args)!=0) return -11; int flag=0; @@ -175,71 +335,90 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, flag=compile_kernels(); gpu_barrier(); } + + // Setup auto bin size calculation for calls from atom::sort + // - This is repeated in neighbor init with additional info + if (_user_cell_size<0.0) { + #ifndef LAL_USE_OLD_NEIGHBOR + _neighbor_shared.setup_auto_cell_size(true,0,_simd_size); + #else + _neighbor_shared.setup_auto_cell_size(false,0,_simd_size); + #endif + } else + _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,_simd_size); + return flag; } template -int DeviceT::set_ocl_params(char *ocl_vendor) { +int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) { #ifdef USE_OPENCL - std::string s_vendor=OCL_DEFAULT_VENDOR; - if (ocl_vendor!=nullptr) - s_vendor=ocl_vendor; - if (s_vendor=="none") - s_vendor="generic"; - if (s_vendor=="kepler") { - _ocl_vendor_name="NVIDIA Kepler"; - #if defined (__APPLE__) || defined(MACOSX) - _ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX"; - #else - _ocl_vendor_string="-DKEPLER_OCL"; - #endif - } else if (s_vendor=="fermi") { - _ocl_vendor_name="NVIDIA Fermi"; - _ocl_vendor_string="-DFERMI_OCL"; - } else if (s_vendor=="cypress") { - _ocl_vendor_name="AMD Cypress"; - _ocl_vendor_string="-DCYPRESS_OCL"; - } else if (s_vendor=="phi") { - _ocl_vendor_name="Intel Phi"; - _ocl_vendor_string="-DPHI_OCL"; - } else if (s_vendor=="intel") { - _ocl_vendor_name="Intel CPU"; - _ocl_vendor_string="-DINTEL_OCL"; - } else if (s_vendor=="generic") { - _ocl_vendor_name="GENERIC"; - _ocl_vendor_string="-DGENERIC_OCL"; - } else { - _ocl_vendor_name="CUSTOM"; - _ocl_vendor_string="-DUSE_OPENCL"; - int token_count=0; - std::string params[13]; - char *pch = strtok(ocl_vendor,","); + #include "lal_pre_ocl_config.h" + + if (s_config=="" || s_config=="none") + s_config="generic"; + + int config_index=-1; + for (int i=0; ihas_subgroup_support()) + _ocl_compile_string+=" -DUSE_OPENCL_SUBGROUPS"; + #ifdef LAL_USE_OLD_NEIGHBOR + _ocl_compile_string+=" -DLAL_USE_OLD_NEIGHBOR"; + #endif + + _ocl_compile_string += " -DCONFIG_ID="+params[0]+ + " -DSIMD_SIZE="+params[1]+ + " -DMEM_THREADS="+params[2]; + if (gpu->has_shuffle_support()==false) + _ocl_compile_string+=" 
-DSHUFFLE_AVAIL=0"; + else + _ocl_compile_string+=" -DSHUFFLE_AVAIL="+params[3]; + _ocl_compile_string += " -DFAST_MATH="+params[4]+ + + " -DTHREADS_PER_ATOM="+params[5]+ + " -DTHREADS_PER_CHARGE="+params[6]+ + " -DTHREADS_PER_THREE="+params[7]+ + + " -DBLOCK_PAIR="+params[8]+ + " -DBLOCK_BIO_PAIR="+params[9]+ + " -DBLOCK_ELLIPSE="+params[10]+ + " -DPPPM_BLOCK_1D="+params[11]+ + " -DBLOCK_NBOR_BUILD="+params[12]+ + " -DBLOCK_CELL_2D="+params[13]+ + " -DBLOCK_CELL_ID="+params[14]+ + + " -DMAX_SHARED_TYPES="+params[15]+ + " -DMAX_BIO_SHARED_TYPES="+params[16]+ + " -DPPPM_MAX_SPLINE="+params[17]; + _ocl_compile_string += extra_args; #endif return 0; } @@ -269,8 +448,10 @@ int DeviceT::init(Answer &ans, const bool charge, else if (_gpu_mode==Device::GPU_HYB_NEIGH) gpu_nbor=2; #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT) - if (gpu_nbor==1) - gpu_nbor=2; + if (gpu_nbor==1) gpu_nbor=2; + #endif + #ifndef LAL_USE_OLD_NEIGHBOR + if (gpu_nbor==1) gpu_nbor=2; #endif if (_init_count==0) { @@ -328,14 +509,15 @@ int DeviceT::init(Answer &ans, const int nlocal, template int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, - const int host_nlocal, const int nall, - const int maxspecial, const int gpu_host, - const int max_nbors, const double cell_size, - const bool pre_cut, const int threads_per_atom) { + const int host_nlocal, const int nall, + const int maxspecial, const int gpu_host, + const int max_nbors, const double cutoff, + const bool pre_cut, const int threads_per_atom, + const bool ilist_map) { int ef_nlocal=nlocal; if (_particle_split<1.0 && _particle_split>0.0) ef_nlocal=static_cast(_particle_split*nlocal); - + int gpu_nbor=0; if (_gpu_mode==Device::GPU_NEIGH) gpu_nbor=1; @@ -345,16 +527,27 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, if (gpu_nbor==1) gpu_nbor=2; #endif + #ifndef LAL_USE_OLD_NEIGHBOR + if (gpu_nbor==1) + gpu_nbor=2; + #endif if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, - _warp_size, _time_device, compile_string())) + _simd_size, _time_device, compile_string(), ilist_map)) return -3; - if (_cell_size<0.0) - nbor->cell_size(cell_size,cell_size); - else - nbor->cell_size(_cell_size,cell_size); + + if (_user_cell_size<0.0) { + #ifndef LAL_USE_OLD_NEIGHBOR + _neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size()); + #else + _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size()); + #endif + } else + _neighbor_shared.setup_auto_cell_size(false,_user_cell_size, + nbor->simd_size()); + nbor->set_cutoff(cutoff); return 0; } @@ -389,13 +582,21 @@ void DeviceT::init_message(FILE *screen, const char *name, fprintf(screen,"-------------------------------------\n"); fprintf(screen,"- Using acceleration for %s:\n",name); fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); - #ifdef _OPENMP + #if (LAL_USE_OMP == 1) fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); #endif #ifdef USE_OPENCL - fprintf(screen,"- with OpenCL Parameters for: %s\n", - _ocl_vendor_name.c_str()); + fprintf(screen,"- with OpenCL Parameters for: %s (%d)\n", + _ocl_config_name.c_str(),_config_id); #endif + if (shuffle_avail()) + fprintf(screen,"- Horizontal vector operations: ENABLED\n"); + else + fprintf(screen,"- Horizontal vector operations: DISABLED\n"); + if (gpu->shared_memory(first_gpu)) + fprintf(screen,"- Shared memory system: Yes\n"); + else + fprintf(screen,"- Shared memory system: No\n"); 
fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); @@ -431,7 +632,8 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, double &gpu_driver_overhead) { UCL_H_Vec *host_data_in=nullptr, *host_data_out=nullptr; - UCL_D_Vec *dev_data_in=nullptr, *dev_data_out=nullptr, *kernel_data=nullptr; + UCL_D_Vec *dev_data_in=nullptr, *dev_data_out=nullptr, + *kernel_data=nullptr; UCL_Timer *timers_in=nullptr, *timers_out=nullptr, *timers_kernel=nullptr; UCL_Timer over_timer(*gpu); @@ -472,7 +674,7 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, gpu_overhead=0.0; gpu_driver_overhead=0.0; - for (int i=0; i<10; i++) { + for (int z=0; z<11; z++) { gpu->sync(); gpu_barrier(); over_timer.start(); @@ -486,9 +688,11 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, timers_in[i].stop(); } + const int numel=1; for (int i=0; i0) { + gpu_overhead+=mpi_time; + gpu_driver_overhead+=mpi_driver_time; + } } gpu_overhead/=10.0; gpu_driver_overhead/=10.0; @@ -567,19 +777,22 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); - double t_time=times[0]+times[1]+times[2]+times[3]+times[4]; + + #ifdef USE_OPENCL + // Workaround for timing issue on Intel OpenCL + if (times[3] > 80e6) times[3]=0.0; + #endif if (replica_me()==0) - if (screen && times[5]>0.0) { + if (screen && times[6]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device() && t_time>0) { + if (time_device() && times[3]>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); if (nbor.gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); @@ -587,13 +800,15 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } - if (nbor.gpu_nbor()==2) - fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size); if (times[5]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); + fprintf(screen,"Vector width: %d.\n", simd_size()); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + if (nbor.gpu_nbor()==2) + fprintf(screen,"CPU Neighbor: %.4f s.\n",times[8]/_replica_size); + fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); @@ -612,24 +827,29 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, const double max_bytes, const double cpu_time, const double idle_time, FILE *screen) { - double single[8], times[8]; + double single[9], times[9]; single[0]=time_out.total_seconds(); single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); single[2]=time_map.total_seconds(); single[3]=time_rho.total_seconds(); 
single[4]=time_interp.total_seconds(); - single[5]=ans.transfer_time()+ans.cast_time(); + single[5]=ans.transfer_time(); single[6]=cpu_time; single[7]=idle_time; + single[8]=ans.cast_time(); - MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); - double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; + #ifdef USE_OPENCL + // Workaround for timing issue on Intel OpenCL + if (times[3] > 80e6) times[3]=0.0; + #endif + if (replica_me()==0) if (screen && times[6]>0.0) { @@ -639,7 +859,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device() && t_time>0) { + if (time_device() && times[3]>0) { fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); @@ -649,12 +869,13 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, (times[0]+times[2]+times[3])/_replica_size); fprintf(screen,"Total interp: %.4f s.\n", (times[1]+times[4])/_replica_size); - fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size); + fprintf(screen,"Force copy: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Total: %.4f s.\n", (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ _replica_size); } fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Data Cast: %.4f s.\n",times[8]/_replica_size); fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); @@ -699,14 +920,15 @@ int DeviceT::compile_kernels() { return flag; dev_program=new UCL_Program(*gpu); - int success=dev_program->load_string(device,compile_string().c_str()); + int success=dev_program->load_string(device,compile_string().c_str(), + nullptr,stderr); if (success!=UCL_SUCCESS) return -6; k_zero.set_function(*dev_program,"kernel_zero"); k_info.set_function(*dev_program,"kernel_info"); _compiled=true; - UCL_Vector gpu_lib_data(15,*gpu,UCL_NOT_PINNED); + UCL_Vector gpu_lib_data(19,*gpu,UCL_NOT_PINNED); k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); @@ -717,39 +939,81 @@ int DeviceT::compile_kernels() { return -4; #endif - _num_mem_threads=gpu_lib_data[1]; - _warp_size=gpu_lib_data[2]; - if (_threads_per_atom<1) - _threads_per_atom=gpu_lib_data[3]; - if (_threads_per_charge<1) - _threads_per_charge=gpu_lib_data[13]; - _pppm_max_spline=gpu_lib_data[4]; - _pppm_block=gpu_lib_data[5]; - if (_block_pair == -1) _block_pair=gpu_lib_data[6]; - _max_shared_types=gpu_lib_data[7]; - _block_cell_2d=gpu_lib_data[8]; - _block_cell_id=gpu_lib_data[9]; - _block_nbor_build=gpu_lib_data[10]; - _block_bio_pair=gpu_lib_data[11]; - _max_bio_shared_types=gpu_lib_data[12]; - _block_ellipse=gpu_lib_data[14]; + _config_id=gpu_lib_data[1]; - if (static_cast(_block_pair)>gpu->group_size()) - _block_pair=gpu->group_size(); - if (static_cast(_block_bio_pair)>gpu->group_size()) - _block_bio_pair=gpu->group_size(); - if (_threads_per_atom>_warp_size) - _threads_per_atom=_warp_size; - if (_warp_size%_threads_per_atom!=0) + if (sizeof(numtyp)==sizeof(float)) + 
_simd_size=std::max(gpu_lib_data[2],gpu->preferred_fp32_width()); + else + _simd_size=std::max(gpu_lib_data[2],gpu->preferred_fp64_width()); + + _num_mem_threads=gpu_lib_data[3]; + _shuffle_avail=gpu_lib_data[4]; + _fast_math=gpu_lib_data[5]; + + if (_threads_per_atom<1) + _threads_per_atom=gpu_lib_data[6]; + if (_threads_per_charge<1) + _threads_per_charge=gpu_lib_data[7]; + if (_threads_per_three<1) + _threads_per_three=gpu_lib_data[8]; + + if (_block_pair == -1) { + _block_pair=gpu_lib_data[9]; + _block_bio_pair=gpu_lib_data[10]; + _block_ellipse=gpu_lib_data[11]; + } else { + _block_bio_pair=_block_pair; + _block_ellipse=_block_pair; + } + _pppm_block=gpu_lib_data[12]; + _block_nbor_build=gpu_lib_data[13]; + _block_cell_2d=gpu_lib_data[14]; + _block_cell_id=gpu_lib_data[15]; + + _max_shared_types=gpu_lib_data[16]; + _max_bio_shared_types=gpu_lib_data[17]; + _pppm_max_spline=gpu_lib_data[18]; + + if (static_cast(_block_pair)>gpu->group_size_dim(0) || + static_cast(_block_bio_pair)>gpu->group_size_dim(0) || + static_cast(_block_ellipse)>gpu->group_size_dim(0) || + static_cast(_pppm_block)>gpu->group_size_dim(0) || + static_cast(_block_nbor_build)>gpu->group_size_dim(0) || + static_cast(_block_cell_2d)>gpu->group_size_dim(0) || + static_cast(_block_cell_2d)>gpu->group_size_dim(1) || + static_cast(_block_cell_id)>gpu->group_size_dim(0) || + static_cast(_max_shared_types*_max_shared_types* + sizeof(numtyp)*17 > gpu->slm_size()) || + static_cast(_max_bio_shared_types*2*sizeof(numtyp) > + gpu->slm_size())) + return -13; + + if (_block_pair % _simd_size != 0 || _block_bio_pair % _simd_size != 0 || + _block_ellipse % _simd_size != 0 || _pppm_block % _simd_size != 0 || + _block_nbor_build % _simd_size != 0 || + _block_pair < _max_shared_types * _max_shared_types || + _block_bio_pair * 2 < _max_bio_shared_types || + _pppm_block < _pppm_max_spline * _pppm_max_spline) + return -11; + + if (_threads_per_atom>_simd_size) + _threads_per_atom=_simd_size; + if (_simd_size%_threads_per_atom!=0) _threads_per_atom=1; if (_threads_per_atom & (_threads_per_atom - 1)) _threads_per_atom=1; - if (_threads_per_charge>_warp_size) - _threads_per_charge=_warp_size; - if (_warp_size%_threads_per_charge!=0) + if (_threads_per_charge>_simd_size) + _threads_per_charge=_simd_size; + if (_simd_size%_threads_per_charge!=0) _threads_per_charge=1; if (_threads_per_charge & (_threads_per_charge - 1)) _threads_per_charge=1; + if (_threads_per_three>_simd_size) + _threads_per_three=_simd_size; + if (_simd_size%_threads_per_three!=0) + _threads_per_three=1; + if (_threads_per_three & (_threads_per_three - 1)) + _threads_per_three=1; return flag; } @@ -765,14 +1029,16 @@ Device global_device; } using namespace LAMMPS_AL; -int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, +int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, + const int first_gpu_id, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, - char *opencl_vendor, const int block_pair) { - return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, + const int t_per_atom, const double user_cell_size, + char *opencl_config, const int ocl_platform, + char *device_type_flags, const int block_pair) { + return global_device.init_device(world,replica,ngpu,first_gpu_id,gpu_mode, particle_split,nthreads,t_per_atom, - cell_size,opencl_vendor,block_pair); + user_cell_size,opencl_config,ocl_platform, + 
device_type_flags,block_pair); } void lmp_clear_device() { @@ -780,8 +1046,16 @@ void lmp_clear_device() { } double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul) { - return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); + double **vatom, double *virial, double &ecoul, + int &error_flag) { + return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul,error_flag); +} + +double lmp_gpu_update_bin_size(const double subx, const double suby, + const double subz, const int nlocal, + const double cut) { + return global_device._neighbor_shared.update_cell_size(subx, suby, + subz, nlocal, cut); } bool lmp_gpu_config(const std::string &category, const std::string &setting) diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu index afc7a0b988..61341964b2 100644 --- a/lib/gpu/lal_device.cu +++ b/lib/gpu/lal_device.cu @@ -26,20 +26,30 @@ __kernel void kernel_zero(__global int *restrict mem, } __kernel void kernel_info(__global int *info) { - info[0]=ARCH; - info[1]=MEM_THREADS; - info[2]=WARP_SIZE; - info[3]=THREADS_PER_ATOM; - info[4]=PPPM_MAX_SPLINE; - info[5]=PPPM_BLOCK_1D; - info[6]=BLOCK_PAIR; - info[7]=MAX_SHARED_TYPES; - info[8]=BLOCK_CELL_2D; - info[9]=BLOCK_CELL_ID; - info[10]=BLOCK_NBOR_BUILD; - info[11]=BLOCK_BIO_PAIR; - info[12]=MAX_BIO_SHARED_TYPES; - info[13]=THREADS_PER_CHARGE; - info[14]=BLOCK_ELLIPSE; -} + #ifdef __CUDA_ARCH__ + info[0]=__CUDA_ARCH__; + #else + info[0]=0; + #endif + info[1]=CONFIG_ID; + info[2]=SIMD_SIZE; + info[3]=MEM_THREADS; + info[4]=SHUFFLE_AVAIL; + info[5]=FAST_MATH; + info[6]=THREADS_PER_ATOM; + info[7]=THREADS_PER_CHARGE; + info[8]=THREADS_PER_THREE; + + info[9]=BLOCK_PAIR; + info[10]=BLOCK_BIO_PAIR; + info[11]=BLOCK_ELLIPSE; + info[12]=PPPM_BLOCK_1D; + info[13]=BLOCK_NBOR_BUILD; + info[14]=BLOCK_CELL_2D; + info[15]=BLOCK_CELL_ID; + + info[16]=MAX_SHARED_TYPES; + info[17]=MAX_BIO_SHARED_TYPES; + info[18]=PPPM_MAX_SPLINE; +} diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 21bd039c42..bd5b81558c 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -39,22 +39,23 @@ class Device { /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes - * the device (>=first_gpu and <=last_gpu) that this proc will be using + * the device (ngpu starting at first_gpu_id) that this proc will be using * Returns: * - 0 if successful * - -2 if GPU not found * - -4 if GPU library not compiled for GPU * - -6 if GPU could not be initialized for use * - -7 if accelerator sharing is not currently allowed on system - * - -11 if vendor_string has the wrong number of parameters **/ - int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, + * - -11 if config_string has the wrong number of parameters **/ + int init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, + const int first_gpu_id, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, - char *vendor_string, const int block_pair); + const int t_per_atom, const double user_cell_size, + char *config_string, const int ocl_platform, + char *device_type_flags, const int block_pair); /// Initialize the device for Atom storage - /** \param charge True if charges need to be stored + /** \param charge True if charges need to be stored * \param rot True if quaternions need to be stored * \param nlocal Total number of local particles to allocate memory for * 
\param nall Total number of local+ghost particles @@ -94,10 +95,11 @@ class Device { * 1 if gpu_nbor is true, and host needs a half nbor list, * 2 if gpu_nbor is true, and host needs a full nbor list * \param max_nbors Initial number of rows in the neighbor matrix - * \param cell_size cutoff+skin + * \param cutoff cutoff+skin * \param pre_cut True if cutoff test will be performed in separate kernel * than the force kernel * \param threads_per_atom value to be used by the neighbor list only + * \param ilist_map true if ilist mapping data structures used (3-body) * * Returns: * - 0 if successful @@ -108,8 +110,9 @@ class Device { int init_nbor(Neighbor *nbor, const int nlocal, const int host_nlocal, const int nall, const int maxspecial, const int gpu_host, - const int max_nbors, const double cell_size, - const bool pre_cut, const int threads_per_atom); + const int max_nbors, const double cutoff, + const bool pre_cut, const int threads_per_atom, + const bool ilist_map = false); /// Output a message for pair_style acceleration with device stats void init_message(FILE *screen, const char *name, @@ -161,13 +164,16 @@ class Device { /// Add "answers" (force,energies,etc.) into LAMMPS structures inline double fix_gpu(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul) { + double **vatom, double *virial, double &ecoul, + int &error_flag) { + error_flag=0; atom.data_unavail(); if (ans_queue.empty()==false) { stop_host_timer(); double evdw=0.0; while (ans_queue.empty()==false) { - evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul); + evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul, + error_flag); ans_queue.pop(); } return evdw; @@ -228,45 +234,49 @@ class Device { /// True if device is being timed inline bool time_device() const { return _time_device; } + /// Accelerator device configuration id + inline int config_id() const { return _config_id; } + /// Number of threads executing concurrently on same multiproc + inline int simd_size() const { return _simd_size; } /// Return the number of threads accessing memory simulatenously inline int num_mem_threads() const { return _num_mem_threads; } + /// 1 if horizontal vector operations enabled, 0 otherwise + inline int shuffle_avail() const { return _shuffle_avail; } + /// For OpenCL, 0 if fast-math options disabled, 1 enabled + inline int fast_math() const { return _fast_math; } + /// Return the number of threads per atom for pair styles inline int threads_per_atom() const { return _threads_per_atom; } /// Return the number of threads per atom for pair styles using charge inline int threads_per_charge() const { return _threads_per_charge; } + /// Return the number of threads per atom for 3-body pair styles + inline int threads_per_three() const { return _threads_per_three; } + /// Return the min of the pair block size or the device max block size inline int pair_block_size() const { return _block_pair; } - /// Return the maximum number of atom types that can be used with shared mem - inline int max_shared_types() const { return _max_shared_types; } - /// Return the maximum order for PPPM splines - inline int pppm_max_spline() const { return _pppm_max_spline; } - /// Return the block size for PPPM kernels - inline int pppm_block() const { return _pppm_block; } - /// Return the block size for neighbor binning - inline int block_cell_2d() const { return _block_cell_2d; } - /// Return the block size for atom mapping for neighbor builds - inline int block_cell_id() const { return 
_block_cell_id; } - /// Return the block size for neighbor build kernel - inline int block_nbor_build() const { return _block_nbor_build; } /// Return the block size for "bio" pair styles inline int block_bio_pair() const { return _block_bio_pair; } /// Return the block size for "ellipse" pair styles inline int block_ellipse() const { return _block_ellipse; } + /// Return the block size for PPPM kernels + inline int pppm_block() const { return _pppm_block; } + /// Return the block size for neighbor build kernel + inline int block_nbor_build() const { return _block_nbor_build; } + /// Return the block size for neighbor binning + inline int block_cell_2d() const { return _block_cell_2d; } + /// Return the block size for atom mapping for neighbor builds + inline int block_cell_id() const { return _block_cell_id; } + + /// Return the maximum number of atom types that can be used with shared mem + inline int max_shared_types() const { return _max_shared_types; } /// Return the maximum number of atom types for shared mem with "bio" styles inline int max_bio_shared_types() const { return _max_bio_shared_types; } + /// Return the maximum order for PPPM splines + inline int pppm_max_spline() const { return _pppm_max_spline; } + /// Architecture gpu code compiled for (returns 0 for OpenCL) inline double ptx_arch() const { return _ptx_arch; } - /// Number of threads executing concurrently on same multiproc - inline int warp_size() const { return _warp_size; } - - // -------------------- SHARED DEVICE ROUTINES -------------------- - // Perform asynchronous zero of integer array - void zero(UCL_D_Vec &mem, const int numel) { - int num_blocks=static_cast(ceil(static_cast(numel)/ - _block_pair)); - k_zero.set_size(num_blocks,_block_pair); - k_zero.run(&mem,&numel); - } + inline void set_simd_size(int simd_sz) { _simd_size = simd_sz; } // -------------------------- DEVICE DATA ------------------------- @@ -304,6 +314,15 @@ class Device { } inline std::string compile_string() { return _ocl_compile_string; } + inline std::string ocl_config_name() { return _ocl_config_name; } + + template + inline std::string toa(const t& in) { + std::ostringstream o; + o.precision(2); + o << in; + return o.str(); + } private: std::queue *> ans_queue; @@ -316,13 +335,13 @@ class Device { double _particle_split; double _cpu_full; double _ptx_arch; - double _cell_size; // -1 if the cutoff is used + double _user_cell_size; // -1 if the cutoff is used - int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge; - int _pppm_max_spline, _pppm_block; - int _block_pair, _block_ellipse, _max_shared_types; - int _block_cell_2d, _block_cell_id, _block_nbor_build; - int _block_bio_pair, _max_bio_shared_types; + int _config_id, _simd_size, _num_mem_threads, _shuffle_avail, _fast_math; + int _threads_per_atom, _threads_per_charge, _threads_per_three; + int _block_pair, _block_bio_pair, _block_ellipse; + int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id; + int _max_shared_types, _max_bio_shared_types, _pppm_max_spline; UCL_Program *dev_program; UCL_Kernel k_zero, k_info; @@ -331,17 +350,8 @@ class Device { int _data_in_estimate, _data_out_estimate; - std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string; - int set_ocl_params(char *); - - template - inline std::string toa(const t& in) { - std::ostringstream o; - o.precision(2); - o << in; - return o.str(); - } - + std::string _ocl_config_name, _ocl_config_string, _ocl_compile_string; + int set_ocl_params(std::string, std::string); }; } diff 
--git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp index b0929e2ffb..ffdeb41ca8 100644 --- a/lib/gpu/lal_dipole_lj.cpp +++ b/lib/gpu/lal_dipole_lj.cpp @@ -125,20 +125,9 @@ double DipoleLJT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DipoleLJT::loop(const bool _eflag, const bool _vflag) { +int DipoleLJT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -165,6 +154,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class DipoleLJ; diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu index a3ed0d8d40..cbe68ff692 100644 --- a/lib/gpu/lal_dipole_lj.cu +++ b/lib/gpu/lal_dipole_lj.cu @@ -31,106 +31,178 @@ _texture_2d( mu_tex,int4); #define mu_tex mu_ #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ +#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[8][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=tor.x; \ - red_acc[4][tid]=tor.y; \ - red_acc[5][tid]=tor.z; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ + tor.x, tor.y, tor.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - tor.x=red_acc[3][tid]; \ - tor.y=red_acc[4][tid]; \ - tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - red_acc[7][tid]=ecoul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<8; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ - ecoul=red_acc[7][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ + 
simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (t_per_atom>1) \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (offset==0 && ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; @@ -324,7 +396,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -335,9 +407,9 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, @@ -361,33 +433,33 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + 
if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; + acctyp4 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; @@ -537,7 +609,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -548,8 +620,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } - diff --git a/lib/gpu/lal_dipole_lj.h b/lib/gpu/lal_dipole_lj.h index bd312324c6..395a7472ba 100644 --- a/lib/gpu/lal_dipole_lj.h +++ b/lib/gpu/lal_dipole_lj.h @@ -77,7 +77,7 @@ class DipoleLJ : public BaseDipole { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp index 0a94969c8b..90c9935913 100644 --- a/lib/gpu/lal_dipole_lj_ext.cpp +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -57,7 +57,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -76,7 +76,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_dipole_lj_sf.cpp b/lib/gpu/lal_dipole_lj_sf.cpp index dcf95bb126..6b40ffaa11 100644 --- a/lib/gpu/lal_dipole_lj_sf.cpp +++ b/lib/gpu/lal_dipole_lj_sf.cpp @@ -125,20 +125,9 @@ double DipoleLJSFT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { +int DipoleLJSFT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - 
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -165,6 +154,7 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class DipoleLJSF; diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu index 8032ae82ed..717d8959ba 100644 --- a/lib/gpu/lal_dipole_lj_sf.cu +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -32,106 +32,178 @@ _texture_2d( mu_tex,int4); #define mu_tex mu_ #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ +#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[8][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=tor.x; \ - red_acc[4][tid]=tor.y; \ - red_acc[5][tid]=tor.z; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ + tor.x, tor.y, tor.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - tor.x=red_acc[3][tid]; \ - tor.y=red_acc[4][tid]; \ - tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - red_acc[7][tid]=ecoul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<8; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ - ecoul=red_acc[7][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for 
(int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (offset==0 && ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); @@ -357,7 +429,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -367,9 +439,9 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, } } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, @@ -394,33 +466,33 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; + acctyp4 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); @@ -600,7 +672,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += 
delz*force.z; @@ -611,8 +683,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } diff --git a/lib/gpu/lal_dipole_lj_sf.h b/lib/gpu/lal_dipole_lj_sf.h index ae73508065..088d8df03e 100644 --- a/lib/gpu/lal_dipole_lj_sf.h +++ b/lib/gpu/lal_dipole_lj_sf.h @@ -77,7 +77,7 @@ class DipoleLJSF : public BaseDipole { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp index 3626e8305e..0879702887 100644 --- a/lib/gpu/lal_dipole_lj_sf_ext.cpp +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -57,7 +57,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, special_lj, inum, nall, 300, + host_lj4, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -76,7 +76,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_dipole_long_lj.cpp b/lib/gpu/lal_dipole_long_lj.cpp index 9648e9b15e..5531fa0dc9 100644 --- a/lib/gpu/lal_dipole_long_lj.cpp +++ b/lib/gpu/lal_dipole_long_lj.cpp @@ -128,20 +128,9 @@ double DipoleLongLJT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) { +int DipoleLongLJT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,8 +138,8 @@ void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -168,6 +157,7 @@ void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class DipoleLongLJ; diff --git a/lib/gpu/lal_dipole_long_lj.cu b/lib/gpu/lal_dipole_long_lj.cu index 3aafba43aa..407b63f93e 100644 --- a/lib/gpu/lal_dipole_long_lj.cu +++ b/lib/gpu/lal_dipole_long_lj.cu @@ -31,106 +31,178 @@ _texture_2d( mu_tex,int4); #define mu_tex mu_ #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) 
\ +#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[8][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=tor.x; \ - red_acc[4][tid]=tor.y; \ - red_acc[5][tid]=tor.z; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ + tor.x, tor.y, tor.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - tor.x=red_acc[3][tid]; \ - tor.y=red_acc[4][tid]; \ - tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - red_acc[7][tid]=ecoul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<8; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ - ecoul=red_acc[7][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, 
t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (t_per_atom>1) \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (offset==0 && ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < cut_coulsq && factor_coul > (numtyp)0.0) { e = qqrd2e*(b0*g0 + b1*g1 + b2*g2); @@ -368,7 +440,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -379,9 +451,9 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, @@ -406,26 +478,27 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; + acctyp4 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); @@ -436,7 +509,6 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, if (ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < cut_coulsq && factor_coul > (numtyp)0.0) { e = qqrd2e*(b0*g0 + b1*g1 + b2*g2); @@ -622,7 +694,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -633,8 +705,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } - diff --git a/lib/gpu/lal_dipole_long_lj.h b/lib/gpu/lal_dipole_long_lj.h index 77e22a10a7..c8f37efd2b 100644 --- a/lib/gpu/lal_dipole_long_lj.h +++ b/lib/gpu/lal_dipole_long_lj.h @@ -77,7 +77,7 @@ class DipoleLongLJ : public BaseDipole { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dipole_long_lj_ext.cpp 
b/lib/gpu/lal_dipole_long_lj_ext.cpp index b2751e8a82..fd61706ba9 100644 --- a/lib/gpu/lal_dipole_long_lj_ext.cpp +++ b/lib/gpu/lal_dipole_long_lj_ext.cpp @@ -58,7 +58,7 @@ int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=DPLJMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=DPLJMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_dpd.cpp b/lib/gpu/lal_dpd.cpp index c5cbc7eb53..f890fb53a3 100644 --- a/lib/gpu/lal_dpd.cpp +++ b/lib/gpu/lal_dpd.cpp @@ -52,15 +52,31 @@ int DPDT::init(const int ntypes, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,_screen,dpd,"k_dpd"); + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,dpd,"k_dpd",onetype); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -117,20 +133,9 @@ double DPDT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DPDT::loop(const bool _eflag, const bool _vflag) { +int DPDT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,8 +143,8 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq, @@ -155,6 +160,7 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu index a29e04fc7f..2794110a92 100644 --- a/lib/gpu/lal_dpd.cu +++ b/lib/gpu/lal_dpd.cu @@ -179,16 +179,19 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, int tid, 
ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { // unshifted eng of conservative term: // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; energy+=factor_dpd*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -267,9 +270,9 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, @@ -289,6 +292,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef ONETYPE __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -296,25 +300,36 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, if (tid tag2) { @@ -359,24 +382,37 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, // drag force = -gamma * wd^2 * (delx dot delv) / r // random force = sigma * wd * rnd * dtinvsqrt; + #ifndef ONETYPE + const numtyp coeffx=coeff[mtype].x; + const numtyp coeffy=coeff[mtype].y; + const numtyp coeffz=coeff[mtype].z; + #endif numtyp force = (numtyp)0.0; - if (!tstat_only) force = coeff[mtype].x*wd; - force -= coeff[mtype].y*wd*wd*dot*rinv; - force += coeff[mtype].z*wd*randnum*dtinvsqrt; + if (!tstat_only) force = coeffx*wd; + force -= coeffy*wd*wd*dot*rinv; + force += coeffz*wd*randnum*dtinvsqrt; + #ifndef ONETYPE force*=factor_dpd*rinv; + #else + force*=rinv; + #endif f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { // unshifted eng of conservative term: // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff - numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; + numtyp e = (numtyp)0.5*coeffx*coeffw * wd*wd; + #ifndef ONETYPE energy+=factor_dpd*e; + #else + energy+=e; + #endif } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -387,8 +423,8 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_dpd.h b/lib/gpu/lal_dpd.h index 3c36c39e05..be93d988a3 100644 --- a/lib/gpu/lal_dpd.h +++ b/lib/gpu/lal_dpd.h @@ -78,7 +78,7 @@ class DPD : public BaseDPD { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dpd_ext.cpp b/lib/gpu/lal_dpd_ext.cpp index d727a87319..7637ff03c0 100644 --- a/lib/gpu/lal_dpd_ext.cpp +++ b/lib/gpu/lal_dpd_ext.cpp @@ -55,7 +55,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, int init_ok=0; if (world_me==0) 
init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, - host_cut, special_lj, false, inum, nall, 300, + host_cut, special_lj, false, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); DPDMF.device->world_barrier(); @@ -73,7 +73,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, } if (gpu_rank==i && world_me!=0) init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, - host_cut, special_lj, false, inum, nall, 300, + host_cut, special_lj, false, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); DPDMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 03479cd16a..cdafe72898 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -52,9 +52,23 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { + int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + for (int i=1; i=0 && host_type2frho[i]<=nfrho-1) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+i; + } + if (onetype<0) onetype=0; + #endif + int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, - gpu_split,_screen,eam,"k_eam"); + gpu_split,_screen,eam,"k_eam",onetype); if (success!=0) return success; @@ -72,6 +86,13 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); fp_tex.bind_float(_fp,1); + + #if defined(LAL_OCL_EV_JIT) + k_energy_fast_noev.set_function(*(this->pair_program_noev),"k_energy_fast"); + #else + k_energy_sel = &k_energy_fast; + #endif + _compiled_energy = true; // Initialize timers for selected GPU @@ -88,7 +109,6 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -260,6 +280,9 @@ void EAMT::clear() { if (_compiled_energy) { k_energy_fast.clear(); k_energy.clear(); + #if defined(LAL_OCL_EV_JIT) + k_energy_fast_noev.clear(); + #endif _compiled_energy=false; } @@ -278,11 +301,18 @@ template void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (this->device->time_device()) { // Put time from the second part to the total time_pair @@ -346,12 +376,20 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, template int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag_in, + const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, 
const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (this->device->time_device()) { // Put time from the second part to the total time_pair @@ -430,9 +468,9 @@ void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, loop2(eflag,vflag); if (ilist == nullptr) - this->ans->copy_answers(eflag,vflag,eatom,vatom); + this->ans->copy_answers(eflag,vflag,eatom,vatom, this->ans->inum()); else - this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist); + this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist, this->ans->inum()); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -442,20 +480,9 @@ void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, // Calculate per-atom energies and forces // --------------------------------------------------------------------------- template -void EAMT::loop(const bool _eflag, const bool _vflag) { +int EAMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -464,13 +491,18 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { - this->k_energy_fast.set_size(GX,BX); - this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1,&frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_fp, &this->ans->engv, &eflag, &ainum, - &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, - &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_energy_sel = &k_energy_fast; + else k_energy_sel = &k_energy_fast_noev; + #endif + + k_energy_sel->set_size(GX,BX); + k_energy_sel->run(&this->atom->x, &type2rhor_z2r, &type2frho, + &rhor_spline2, &frho_spline1,&frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &_fp, &this->ans->engv, &eflag, &ainum, + &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, + &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho, @@ -482,6 +514,7 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { } this->time_pair.stop(); + return ainum; } // --------------------------------------------------------------------------- @@ -510,8 +543,8 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) { this->time_pair2.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, &z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index b22ce7b575..3955f3cc8a 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -36,6 +36,16 @@ _texture( z2r_sp1_tex,int4); _texture( z2r_sp2_tex,int4); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define fp_tex fp_ +#define rhor_sp1_tex rhor_spline1 +#define rhor_sp2_tex rhor_spline2 +#define frho_sp1_tex frho_spline1 +#define frho_sp2_tex frho_spline2 
+#define z2r_sp1_tex z2r_spline1 +#define z2r_sp2_tex z2r_spline2 +#endif + #else #define pos_tex x_ @@ -52,30 +62,33 @@ _texture( z2r_sp2_tex,int4); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MAX(A,B) ((A) > (B) ? (A) : (B)) -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_energy_fp() \ + __local acctyp red_acc[BLOCK_PAIR]; #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \ - eflag,vflag,engv,rdrho,nrho,i,rhomax) \ + eflag,vflag,engv,rdrho,nrho,i,rhomax,tfrho) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=rho; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) \ red_acc[tid] += red_acc[tid+s]; \ } \ rho=red_acc[tid]; \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ @@ -83,15 +96,18 @@ _texture( z2r_sp2_tex,int4); } \ } +#define local_allocate_store_answers_eam() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, elag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ red_acc[3][tid]=energy; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<4; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -101,10 +117,12 @@ _texture( z2r_sp2_tex,int4); f.y=red_acc[1][tid]; \ f.z=red_acc[2][tid]; \ energy=red_acc[3][tid]; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ + simdsync(); \ for (int r=0; r<6; r++) \ red_acc[r][tid]=virial[r]; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -114,13 +132,13 @@ _texture( z2r_sp2_tex,int4); virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ engv[ei]+=energy*(acctyp)0.5; \ ei+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ engv[ei]=virial[i]*(acctyp)0.5; \ ei+=inum; \ @@ -131,53 +149,57 @@ _texture( z2r_sp2_tex,int4); #else +#define local_allocate_store_energy_fp() + #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \ - eflag,vflag,engv,rdrho,nrho,i,rhomax) \ + eflag,vflag,engv,rdrho,nrho,i,rhomax,tfrho) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) \ - rho += shfl_xor(rho, s, t_per_atom); \ + rho += shfl_down(rho, s, t_per_atom); \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ - engv[ii]=energy; \ + engv[ii]=energy; \ } \ } +#define local_allocate_store_answers_eam() + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG 
&& vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ engv[ei]+=energy*(acctyp)0.5; \ ei+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ engv[ei]=virial[i]*(acctyp)0.5; \ ei+=inum; \ @@ -203,21 +225,23 @@ __kernel void k_energy(const __global numtyp4 *restrict x_, const numtyp rdr, const numtyp rdrho, const numtyp rhomax, const int nrho, const int nr, const int t_per_atom) { - int tid, ii, offset; + int tid, ii, offset, i, itype; atom_info(t_per_atom,ii,tid,offset); + int n_stride; + local_allocate_store_energy_fp(); + acctyp rho = (acctyp)0; - acctyp energy = (acctyp)0; + acctyp energy; + if (EVFLAG && eflag) energy=(acctyp)0; if (ii0) { + if (EVFLAG && eflag) { energy += phi; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -431,10 +469,9 @@ __kernel void k_eam(const __global numtyp4 *restrict x_, } } } // for nbor - store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_eam_fast(const __global numtyp4 *x_, @@ -453,40 +490,51 @@ __kernel void k_eam_fast(const __global numtyp4 *x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef ONETYPE __local int2 type2rhor_z2r[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (tid0) { + if (EVFLAG && eflag) { energy += phi; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -547,8 +610,8 @@ __kernel void k_eam_fast(const __global numtyp4 *x_, } } } // for nbor - store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h index fa05075883..3cbaeac0b8 100644 --- a/lib/gpu/lal_eam.h +++ b/lib/gpu/lal_eam.h @@ -90,7 +90,7 @@ class EAM : public BaseAtomic { const bool eatom, const bool vatom); // ------------------------- DEVICE KERNELS ------------------------- - UCL_Kernel k_energy, k_energy_fast; + UCL_Kernel k_energy, k_energy_fast, k_energy_fast_noev, *k_energy_sel; // --------------------------- TEXTURES ----------------------------- UCL_Texture fp_tex; @@ -133,8 +133,8 @@ class EAM : public BaseAtomic { protected: bool _allocated; int _nlocal; - void loop(const bool _eflag, const bool _vflag); - void loop2(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); + void loop2(const bool eflag, const bool vflag); }; } diff --git a/lib/gpu/lal_eam_alloy_ext.cpp b/lib/gpu/lal_eam_alloy_ext.cpp index e5f1010e76..f7c4986e68 100644 --- a/lib/gpu/lal_eam_alloy_ext.cpp +++ b/lib/gpu/lal_eam_alloy_ext.cpp @@ -67,7 +67,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, - nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); 
EAMALMF.device->world_barrier(); @@ -87,7 +87,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, - nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMALMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index 78f2e3c1f8..3010e0ea7f 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -67,7 +67,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, - nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMMF.device->world_barrier(); @@ -87,7 +87,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, - nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); @@ -98,7 +98,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, fprintf(screen,"\n"); if (init_ok==0) - EAMMF.estimate_gpu_overhead(); + EAMMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_eam_fs_ext.cpp b/lib/gpu/lal_eam_fs_ext.cpp index 37208e54f8..205b601562 100644 --- a/lib/gpu/lal_eam_fs_ext.cpp +++ b/lib/gpu/lal_eam_fs_ext.cpp @@ -67,7 +67,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, - nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->world_barrier(); @@ -87,7 +87,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, - nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index e6122c7404..1c549ab6a6 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -32,22 +32,21 @@ _texture_2d( quat_tex,int4); #define quat_tex qif #endif -#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ - i, numj, stride, nbor_end, nbor_begin) \ - i=nbor_mem[ii]; \ - nbor_begin=ii+nbor_stride; \ - numj=nbor_mem[nbor_begin]; \ - nbor_begin+=nbor_stride; \ - nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \ - nbor_begin+=fast_mul(offset,nbor_stride); \ - stride=fast_mul(t_per_atom,nbor_stride); +#define nbor_info_e_ss(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ + i, numj, stride, nbor_end, nbor_begin) \ + i=nbor_mem[ii]; \ + nbor_begin=ii+nbor_stride; \ + numj=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_stride; \ + 
nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \ + nbor_begin+=fast_mul(offset,nbor_stride); \ + stride=fast_mul(t_per_atom,nbor_stride); -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) #define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ - __local acctyp red_acc[7][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ @@ -55,6 +54,7 @@ _texture_2d( quat_tex,int4); red_acc[4][tid]=tor.y; \ red_acc[5][tid]=tor.z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -66,28 +66,39 @@ _texture_2d( quat_tex,int4); tor.x=red_acc[3][tid]; \ tor.y=red_acc[4][tid]; \ tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<7; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + if (EVFLAG && (eflag || vflag)) { \ + if (vflag) { \ + simdsync(); \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + if (eflag) { \ + simdsync(); \ + red_acc[0][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) red_acc[0][tid] += red_acc[0][tid+s]; \ } \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ + energy=red_acc[0][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -100,12 +111,12 @@ _texture_2d( quat_tex,int4); #define acc_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ red_acc[3][tid]=energy; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<4; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -115,10 +126,11 @@ _texture_2d( quat_tex,int4); f.y=red_acc[1][tid]; \ f.z=red_acc[2][tid]; \ energy=red_acc[3][tid]; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int r=0; r<6; r++) \ red_acc[r][tid]=virial[r]; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -128,13 +140,13 @@ _texture_2d( quat_tex,int4); virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ @@ -150,31 +162,31 @@ _texture_2d( quat_tex,int4); #else #define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, 
t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + tor.x += shfl_down(tor.x, s, t_per_atom); \ + tor.y += shfl_down(tor.y, s, t_per_atom); \ + tor.z += shfl_down(tor.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -188,25 +200,25 @@ _texture_2d( quat_tex,int4); eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 5ad935ba9b..9b9d03914c 100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -34,7 +34,8 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, __global int *dev_nbor, const int nbor_pitch, const int start, const int inum, const __global int *dev_ij, - const int form_low, const int form_high) { + const int form_low, const int form_high, + const int t_per_atom) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X+start; @@ -45,12 +46,15 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, int numj=dev_ij[nbor]; nbor+=nbor_pitch; int nbor_end=nbor+fast_mul(numj,nbor_pitch); - int packed=ii+nbor_pitch+nbor_pitch; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul(iw,ntypes); int newj=0; + + __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom; + const int out_stride=nbor_pitch*t_per_atom-t_per_atom; + for ( ; nbor -void GaussT::loop(const bool _eflag, const bool _vflag) { +int GaussT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ 
(BX/this->_threads_per_atom))); @@ -143,19 +132,20 @@ void GaussT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &gauss1, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &gauss1, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Gauss; diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu index 2192fb39ca..2540b8492f 100644 --- a/lib/gpu/lal_gauss.cu +++ b/lib/gpu/lal_gauss.cu @@ -27,7 +27,6 @@ _texture_2d( pos_tex,int4); __kernel void k_gauss(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1, const int lj_types, - const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, @@ -37,23 +36,20 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - __local numtyp sp_lj[4]; - sp_lj[0]=sp_lj_in[0]; - sp_lj[1]=sp_lj_in[1]; - sp_lj[2]=sp_lj_in[2]; - sp_lj[3]=sp_lj_in[3]; + int n_stride; + local_allocate_store_pair(); - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); energy+=e; //factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -101,14 +97,13 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1_in, - const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, @@ -119,26 +114,26 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[4]; - if (tid<4) - sp_lj[tid]=sp_lj_in[tid]; + int n_stride; + local_allocate_store_pair(); + if (tid0) { + if (EVFLAG && eflag) { numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); energy+=e; //factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -186,8 +181,8 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + 
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_gauss.h b/lib/gpu/lal_gauss.h index 1399b82d03..ecb04c49b2 100644 --- a/lib/gpu/lal_gauss.h +++ b/lib/gpu/lal_gauss.h @@ -73,7 +73,7 @@ class Gauss : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index a2804ce3cf..afec2e86f2 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -55,7 +55,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, } if (gpu_rank==i && world_me!=0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_gayberne.cpp b/lib/gpu/lal_gayberne.cpp index f17fc50f5f..2b1a190e5a 100644 --- a/lib/gpu/lal_gayberne.cpp +++ b/lib/gpu/lal_gayberne.cpp @@ -127,7 +127,7 @@ int GayBerneT::init(const int ntypes, const double gamma, host_write[i*4+2]=host_shape[i][2]; } UCL_H_Vec view4; - view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + view4.view(host_write,shape.numel()); ucl_copy(shape,view4,false); well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -136,7 +136,7 @@ int GayBerneT::init(const int ntypes, const double gamma, host_write[i*4+1]=host_well[i][1]; host_write[i*4+2]=host_well[i][2]; } - view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); + view4.view(host_write,well.numel()); ucl_copy(well,view4,false); _allocated=true; @@ -184,19 +184,8 @@ double GayBerneT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void GayBerneT::loop(const bool _eflag, const bool _vflag) { +int GayBerneT::loop(const int eflag, const int vflag) { const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -213,8 +202,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_nbor1.stop(); this->time_ellipsoid.start(); - this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_elps_sel->set_size(GX,BX); + this->k_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, @@ -230,7 +219,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid2.stop(); this->time_lj.start(); this->time_lj.stop(); - return; + return ainum; } // ------------ SPHERE_ELLIPSE --------------- @@ -246,8 +235,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_nbor2.stop(); this->time_ellipsoid2.start(); - this->k_sphere_ellipsoid.set_size(GX,BX); - this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, + 
this->k_sphere_elps_sel->set_size(GX,BX); + this->k_sphere_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, @@ -276,8 +265,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_lj.start(); if (this->_last_ellipseans->inum()) { if (this->_shared_types) { - this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + this->k_lj_sel->set_size(GX,BX); + this->k_lj_sel->run(&this->atom->x, &this->lj1, &this->lj3, &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, @@ -303,8 +292,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); this->time_ellipsoid.start(); - this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_elps_sel->set_size(GX,BX); + this->k_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, @@ -312,6 +301,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } + return ainum; } template class GayBerne; diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index c9d0353ca8..9267dfd85d 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -100,29 +100,27 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=gum[3]; sp_lj[1]=gum[4]; sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp energy=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp4 f, tor; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { + if (EVFLAG && vflag) { r12[0]*=-r; r12[1]*=-r; r12[2]*=-r; @@ -356,8 +354,8 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; } // for nbor - store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, + vflag,ans,engv,inum); } diff --git a/lib/gpu/lal_gayberne.h b/lib/gpu/lal_gayberne.h index 750c739cec..5cdc6bcd67 100644 --- a/lib/gpu/lal_gayberne.h +++ b/lib/gpu/lal_gayberne.h @@ -86,7 +86,7 @@ class GayBerne : public BaseEllipsoid { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index fdf40720aa..4582f0d411 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -17,6 +17,13 @@ #include "lal_ellipsoid_extra.h" #endif +#if (SHUFFLE_AVAIL == 0) +#define local_allocate_store_ellipse_lj local_allocate_store_ellipse +#else +#define local_allocate_store_ellipse_lj() \ + 
__local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE]; +#endif + __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, @@ -38,25 +45,26 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse_lj(); + sp_lj[0]=gum[3]; sp_lj[1]=gum[4]; sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { + if (EVFLAG && vflag) { r12[0]*=-1; r12[1]*=-1; r12[2]*=-1; @@ -239,9 +247,9 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, f.z+=temp1*dchi[2]-temp2*dUr[2]; } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, @@ -261,26 +269,27 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=gum[3]; sp_lj[1]=gum[4]; sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); energy+=factor_lj*(e-lj3[ii].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -327,9 +336,9 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, } } // for nbor - acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, @@ -351,31 +360,32 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + int n_stride; + local_allocate_store_ellipse(); + if (tid<4) sp_lj[tid]=gum[tid+3]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -421,8 +431,8 @@ __kernel void k_gayberne_lj_fast(const 
__global numtyp4 *restrict x_, } } // for nbor - acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp index 5bd015e364..40fefe28b3 100644 --- a/lib/gpu/lal_lj.cpp +++ b/lib/gpu/lal_lj.cpp @@ -51,16 +51,31 @@ int LJT::init(const int ntypes, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj,"k_lj"); + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,lj,"k_lj",onetype); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -130,20 +145,9 @@ double LJT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJT::loop(const bool _eflag, const bool _vflag) { +int LJT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,8 +155,8 @@ void LJT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -165,6 +169,7 @@ void LJT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJ; diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu index 7297a287e6..382cd140d9 100644 --- a/lib/gpu/lal_lj.cu +++ b/lib/gpu/lal_lj.cu @@ -38,16 +38,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -96,9 +99,9 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, } } // for nbor - 
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_fast(const __global numtyp4 *restrict x_, @@ -114,6 +117,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef ONETYPE __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -121,38 +125,58 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } + __syncthreads(); + #else + const numtyp lj1x=lj1_in[ONETYPE].x; + const numtyp lj1y=lj1_in[ONETYPE].y; + const numtyp cutsq=lj1_in[ONETYPE].z; + numtyp lj3x, lj3y, lj3z; + if (EVFLAG && eflag) { + lj3x=lj3_in[ONETYPE].x; + lj3y=lj3_in[ONETYPE].y; + lj3z=lj3_in[ONETYPE].z; + } + #endif + + int n_stride; + local_allocate_store_pair(); - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { - numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + if (EVFLAG && eflag) { + #ifndef ONETYPE + numtyp lj3x=lj3[mtype].x; + numtyp lj3y=lj3[mtype].y; + numtyp lj3z=lj3[mtype].z; + #endif + numtyp e=r6inv*(lj3x*r6inv-lj3y); + #ifndef ONETYPE + energy+=factor_lj*(e-lj3z); + #else + energy+=(e-lj3z); + #endif } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -182,10 +223,9 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } - } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h index c6fec0d159..cdf850efd7 100644 --- a/lib/gpu/lal_lj.h +++ b/lib/gpu/lal_lj.h @@ -76,7 +76,7 @@ class LJ : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp index 6f74cd0f19..df7dc11558 100644 --- a/lib/gpu/lal_lj96.cpp +++ b/lib/gpu/lal_lj96.cpp @@ -113,20 +113,9 @@ double LJ96T::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJ96T::loop(const bool _eflag, const bool _vflag) { +int LJ96T::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, 
&sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -149,6 +138,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJ96; diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu index c602e7555e..d1f7e3791f 100644 --- a/lib/gpu/lal_lj96.cu +++ b/lib/gpu/lal_lj96.cu @@ -39,22 +39,25 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -104,9 +107,9 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, @@ -125,27 +128,30 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -195,8 +201,8 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj96.h b/lib/gpu/lal_lj96.h index eef6863f37..535e32a580 100644 --- a/lib/gpu/lal_lj96.h +++ b/lib/gpu/lal_lj96.h @@ -71,7 +71,7 @@ class LJ96 : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp index f68b35de57..be7ffc5a09 100644 --- a/lib/gpu/lal_lj96_ext.cpp +++ b/lib/gpu/lal_lj96_ext.cpp @@ -55,7 +55,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, 
max_nbors, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->world_barrier(); @@ -73,7 +73,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp index 24b07212ed..31e03a2a82 100644 --- a/lib/gpu/lal_lj_class2_long.cpp +++ b/lib/gpu/lal_lj_class2_long.cpp @@ -123,20 +123,9 @@ double LJClass2LongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { +int LJClass2LongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -144,8 +133,8 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJClass2Long; diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu index 65f0bf993c..5c8a2d46b2 100644 --- a/lib/gpu/lal_lj_class2_long.cu +++ b/lib/gpu/lal_lj_class2_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -131,7 +134,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -142,9 +145,9 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + 
vflag,ans,engv); } __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, @@ -168,28 +171,31 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -253,7 +259,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -264,8 +270,8 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_class2_long.h b/lib/gpu/lal_lj_class2_long.h index eac6451b2e..84e07bf7cd 100644 --- a/lib/gpu/lal_lj_class2_long.h +++ b/lib/gpu/lal_lj_class2_long.h @@ -75,7 +75,7 @@ class LJClass2Long : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp index f669a81189..311b027536 100644 --- a/lib/gpu/lal_lj_class2_long_ext.cpp +++ b/lib/gpu/lal_lj_class2_long_ext.cpp @@ -58,7 +58,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_coul.cpp b/lib/gpu/lal_lj_coul.cpp index 59ce9c5e61..cd8a411a79 100644 --- a/lib/gpu/lal_lj_coul.cpp +++ b/lib/gpu/lal_lj_coul.cpp @@ -125,20 +125,9 @@ double LJCoulT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulT::loop(const bool _eflag, const bool _vflag) { +int LJCoulT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else 
- vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoul; diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu index afbb972942..c728967bc5 100644 --- a/lib/gpu/lal_lj_coul.cu +++ b/lib/gpu/lal_lj_coul.cu @@ -47,6 +47,9 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -133,9 +136,9 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, @@ -158,29 +161,32 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -246,8 +252,8 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + 
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul.h b/lib/gpu/lal_lj_coul.h index 0e11162aa5..eb490d5820 100644 --- a/lib/gpu/lal_lj_coul.h +++ b/lib/gpu/lal_lj_coul.h @@ -77,7 +77,7 @@ class LJCoul : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_debye.cpp b/lib/gpu/lal_lj_coul_debye.cpp index 556a0a5cd3..78ef1bf3f7 100644 --- a/lib/gpu/lal_lj_coul_debye.cpp +++ b/lib/gpu/lal_lj_coul_debye.cpp @@ -127,20 +127,9 @@ double LJCoulDebyeT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { +int LJCoulDebyeT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -148,8 +137,8 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, @@ -163,6 +152,7 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoulDebye; diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu index 053fbeccc8..1804625649 100644 --- a/lib/gpu/lal_lj_coul_debye.cu +++ b/lib/gpu/lal_lj_coul_debye.cu @@ -48,6 +48,9 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -57,18 +60,18 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); @@ -129,7 +132,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -140,9 +143,9 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } 
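/* Illustrative sketch only (plain standard C++, not the GPU library's actual
   kernel code): the guard pattern visible in the surrounding hunks, where the
   energy/virial accumulators are initialized and updated only under
   "if (EVFLAG && eflag)" / "if (EVFLAG && vflag)".  Here a template parameter
   stands in for what is assumed to be a compile-time EVFLAG define in the real
   kernels, so a force-only build compiles the energy/virial code away while an
   EV build still honors the runtime eflag/vflag integers.  All names below
   (accumulate_pair, the placeholder numbers) are hypothetical. */

#include <cstdio>

template <int EVFLAG>
static void accumulate_pair(int eflag, int vflag, double rsq, double fpair,
                            double f[3], double &energy, double virial[6]) {
  const double del[3] = {1.0, 0.0, 0.0};            // placeholder separation vector
  for (int k = 0; k < 3; k++) f[k] += del[k] * fpair;   // force is always accumulated
  if (EVFLAG && eflag)                               // evaluated only in EV builds
    energy += 0.5 * fpair * rsq;
  if (EVFLAG && vflag)
    for (int k = 0; k < 6; k++) virial[k] += fpair * rsq / 6.0;
}

int main() {
  double f[3] = {0.0, 0.0, 0.0}, e = 0.0, v[6] = {0.0};
  accumulate_pair<1>(1, 1, 2.0, 0.5, f, e, v);   // EV build: energy/virial updated
  accumulate_pair<0>(0, 0, 2.0, 0.5, f, e, v);   // force-only build: EV code elided
  std::printf("fx=%g e=%g v0=%g\n", f[0], e, v[0]);
  return 0;
}

/* Usage note: in the kernels above, the same idea also motivates moving the
   final store_answers()/store_answers_q() call outside the "if (ii<inum)"
   branch, so every thread in the work-group reaches the store/reduction path
   regardless of whether it owned an atom. */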
__kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, @@ -166,29 +169,32 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); @@ -249,7 +255,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -260,8 +266,8 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul_debye.h b/lib/gpu/lal_lj_coul_debye.h index 22fcf7234b..19abf32169 100644 --- a/lib/gpu/lal_lj_coul_debye.h +++ b/lib/gpu/lal_lj_coul_debye.h @@ -77,7 +77,7 @@ class LJCoulDebye : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp index 95588eb95a..4f81b01457 100644 --- a/lib/gpu/lal_lj_coul_debye_ext.cpp +++ b/lib/gpu/lal_lj_coul_debye_ext.cpp @@ -58,7 +58,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); @@ -77,7 +77,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp index 060088a7cb..5b7f97e630 100644 --- a/lib/gpu/lal_lj_coul_ext.cpp +++ b/lib/gpu/lal_lj_coul_ext.cpp @@ -57,7 +57,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -76,7 +76,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && 
world_me!=0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp index 66897a4aa7..e6be361abb 100644 --- a/lib/gpu/lal_lj_coul_long.cpp +++ b/lib/gpu/lal_lj_coul_long.cpp @@ -140,20 +140,9 @@ double LJCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { +int LJCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -161,8 +150,8 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -178,6 +167,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoulLong; diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu index ac3479421f..85af3c3433 100644 --- a/lib/gpu/lal_lj_coul_long.cu +++ b/lib/gpu/lal_lj_coul_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -129,7 +132,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -140,9 +143,9 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, @@ -164,28 +167,31 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 
lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -247,7 +253,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -258,8 +264,8 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h index 8f77671dc0..bc4fce40a5 100644 --- a/lib/gpu/lal_lj_coul_long.h +++ b/lib/gpu/lal_lj_coul_long.h @@ -80,7 +80,7 @@ class LJCoulLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp index 33771af53c..6a027bdc7e 100644 --- a/lib/gpu/lal_lj_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_coul_long_ext.cpp @@ -58,7 +58,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_coul_msm.cpp b/lib/gpu/lal_lj_coul_msm.cpp index 9a17d068ec..656736865b 100644 --- a/lib/gpu/lal_lj_coul_msm.cpp +++ b/lib/gpu/lal_lj_coul_msm.cpp @@ -157,20 +157,9 @@ double LJCoulMSMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { +int LJCoulMSMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -178,8 +167,8 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { int 
nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -195,6 +184,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_order, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoulMSM; diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu index a3c36eed85..39fc723736 100644 --- a/lib/gpu/lal_lj_coul_msm.cu +++ b/lib/gpu/lal_lj_coul_msm.cu @@ -28,6 +28,11 @@ _texture( gcons_tex,int2); _texture( dgcons_tex,int2); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define gcons_tex gcons +#define dgcons_tex dgcons +#endif + #else #define pos_tex x_ #define q_tex q_ @@ -100,6 +105,9 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -109,18 +117,18 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(egamma-factor_coul); if (rsq < lj1[mtype].w) { @@ -183,7 +191,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -194,9 +202,9 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, @@ -220,28 +228,31 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(egamma-factor_coul); if (rsq < lj1[mtype].w) { @@ -304,7 +315,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; 
virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -315,8 +326,8 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul_msm.h b/lib/gpu/lal_lj_coul_msm.h index 6369ce8cb5..a929848aaf 100644 --- a/lib/gpu/lal_lj_coul_msm.h +++ b/lib/gpu/lal_lj_coul_msm.h @@ -80,7 +80,7 @@ class LJCoulMSM : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp index d957cbe376..2d9d77fe77 100644 --- a/lib/gpu/lal_lj_coul_msm_ext.cpp +++ b/lib/gpu/lal_lj_coul_msm_ext.cpp @@ -59,7 +59,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (world_me==0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); @@ -79,7 +79,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (gpu_rank==i && world_me!=0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); diff --git a/lib/gpu/lal_lj_cubic.cpp b/lib/gpu/lal_lj_cubic.cpp index f8200ec037..fa5073d409 100644 --- a/lib/gpu/lal_lj_cubic.cpp +++ b/lib/gpu/lal_lj_cubic.cpp @@ -119,20 +119,9 @@ double LJCubicT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCubicT::loop(const bool _eflag, const bool _vflag) { +int LJCubicT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -140,8 +129,8 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -154,6 +143,7 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCubic; diff --git a/lib/gpu/lal_lj_cubic.cu b/lib/gpu/lal_lj_cubic.cu index f93013fe75..a91326d521 100644 --- a/lib/gpu/lal_lj_cubic.cu +++ b/lib/gpu/lal_lj_cubic.cu @@ -46,16 +46,19 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + 
local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -106,7 +109,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -117,9 +120,9 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, @@ -140,27 +143,30 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp2 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -211,7 +217,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -222,8 +228,8 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj_cubic.h b/lib/gpu/lal_lj_cubic.h index 9578ca27e4..a37044b279 100644 --- a/lib/gpu/lal_lj_cubic.h +++ b/lib/gpu/lal_lj_cubic.h @@ -73,7 +73,7 @@ class LJCubic : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_cubic_ext.cpp b/lib/gpu/lal_lj_cubic_ext.cpp index f02ce0f184..2f8ebac37b 100644 --- a/lib/gpu/lal_lj_cubic_ext.cpp +++ b/lib/gpu/lal_lj_cubic_ext.cpp @@ -58,7 +58,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, if (world_me==0) init_ok=LJCubicLMF.init(ntypes, cutsq, cut_inner_sq, cut_inner, sigma, epsilon, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJCubicLMF.device->world_barrier(); @@ -77,7 +77,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, if (gpu_rank==i && world_me!=0) init_ok=LJCubicLMF.init(ntypes, cutsq, cut_inner_sq, cut_inner, sigma, epsilon, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, 
inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJCubicLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_dsf.cpp b/lib/gpu/lal_lj_dsf.cpp index b888f33f00..d41aa13deb 100644 --- a/lib/gpu/lal_lj_dsf.cpp +++ b/lib/gpu/lal_lj_dsf.cpp @@ -125,20 +125,9 @@ double LJDSFT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJDSFT::loop(const bool _eflag, const bool _vflag) { +int LJDSFT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -163,6 +152,7 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJDSF; diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu index c1bb197148..5beedb0bbb 100644 --- a/lib/gpu/lal_lj_dsf.cu +++ b/lib/gpu/lal_lj_dsf.cu @@ -50,6 +50,9 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -59,18 +62,18 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -130,7 +133,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; @@ -140,7 +143,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -151,9 +154,9 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, @@ -176,28 +179,31 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, __local 
numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -257,7 +263,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; @@ -267,7 +273,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -278,8 +284,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_lj_dsf.h b/lib/gpu/lal_lj_dsf.h index b176e087db..b303285e9c 100644 --- a/lib/gpu/lal_lj_dsf.h +++ b/lib/gpu/lal_lj_dsf.h @@ -77,7 +77,7 @@ class LJDSF : public BaseCharge { private: bool _allocated; numtyp _e_shift, _f_shift, _alpha, _cut_coulsq; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp index 6d53896a11..e70059261c 100644 --- a/lib/gpu/lal_lj_dsf_ext.cpp +++ b/lib/gpu/lal_lj_dsf_ext.cpp @@ -59,7 +59,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); @@ -79,7 +79,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp index 1c58cecfae..3d9e526d0c 100644 --- a/lib/gpu/lal_lj_expand.cpp +++ b/lib/gpu/lal_lj_expand.cpp @@ -133,20 +133,9 @@ double LJExpandT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJExpandT::loop(const bool _eflag, const bool _vflag) { +int LJExpandT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int 
BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -154,8 +143,8 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -168,6 +157,7 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJExpand; diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu index 46ed9e2a31..2eff2cd89b 100644 --- a/lib/gpu/lal_lj_expand.cu +++ b/lib/gpu/lal_lj_expand.cu @@ -41,22 +41,25 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -108,9 +111,9 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, @@ -129,27 +132,30 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(numtyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -201,8 +207,8 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h index 2560d166c7..94448a871d 100644 --- 
a/lib/gpu/lal_lj_expand.h +++ b/lib/gpu/lal_lj_expand.h @@ -76,7 +76,7 @@ class LJExpand : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_expand_coul_long.cpp b/lib/gpu/lal_lj_expand_coul_long.cpp index 3e5e00ef6a..41c2ff6229 100644 --- a/lib/gpu/lal_lj_expand_coul_long.cpp +++ b/lib/gpu/lal_lj_expand_coul_long.cpp @@ -140,20 +140,9 @@ double LJExpandCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) { +int LJExpandCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -161,8 +150,8 @@ void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -178,6 +167,7 @@ void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJExpandCoulLong; diff --git a/lib/gpu/lal_lj_expand_coul_long.cu b/lib/gpu/lal_lj_expand_coul_long.cu index 0f0fe4c2fb..abb3d5ca3f 100644 --- a/lib/gpu/lal_lj_expand_coul_long.cu +++ b/lib/gpu/lal_lj_expand_coul_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -133,7 +136,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -144,9 +147,9 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, @@ -168,6 +171,9 @@ __kernel void 
k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -254,7 +260,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -265,8 +271,8 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_expand_coul_long.h b/lib/gpu/lal_lj_expand_coul_long.h index 404a36e5bc..44f7aff3fe 100644 --- a/lib/gpu/lal_lj_expand_coul_long.h +++ b/lib/gpu/lal_lj_expand_coul_long.h @@ -80,7 +80,7 @@ class LJExpandCoulLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_expand_coul_long_ext.cpp b/lib/gpu/lal_lj_expand_coul_long_ext.cpp index 3ff1bef701..e5506dd7aa 100644 --- a/lib/gpu/lal_lj_expand_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_expand_coul_long_ext.cpp @@ -58,7 +58,7 @@ int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJECLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, shift, special_lj, inum, nall, 300, maxspecial, + offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJECLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, shift, special_lj, inum, nall, 300, maxspecial, + offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 603e425d3f..02decf2712 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -56,7 +56,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, shift, special_lj, inum, nall, 300, + host_lj4, offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJEMF.device->world_barrier(); @@ -74,7 +74,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, shift, special_lj, inum, nall, 300, maxspecial, + offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split,screen); LJEMF.device->world_barrier(); diff --git a/lib/gpu/lal_lj_ext.cpp b/lib/gpu/lal_lj_ext.cpp index 124cf46c8c..fa00fc4f64 100644 --- a/lib/gpu/lal_lj_ext.cpp +++ b/lib/gpu/lal_lj_ext.cpp @@ 
-55,7 +55,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_gromacs.cpp b/lib/gpu/lal_lj_gromacs.cpp index 0563151ddd..8a385ece6b 100644 --- a/lib/gpu/lal_lj_gromacs.cpp +++ b/lib/gpu/lal_lj_gromacs.cpp @@ -121,20 +121,9 @@ double LJGROMACST::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJGROMACST::loop(const bool _eflag, const bool _vflag) { +int LJGROMACST::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -142,8 +131,8 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, @@ -159,6 +148,7 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJGROMACS; diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu index 21381bef30..4117cc1440 100644 --- a/lib/gpu/lal_lj_gromacs.cu +++ b/lib/gpu/lal_lj_gromacs.cu @@ -42,21 +42,24 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e += lj3[mtype].w; if (rsq > lj1[mtype].w) { @@ -108,7 +111,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -119,9 +122,9 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void 
k_lj_gromacs_fast(const __global numtyp4 *restrict x_, @@ -142,6 +145,9 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 ljsw[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e += lj3[mtype].w; if (rsq > lj1[mtype].w) { @@ -213,7 +219,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -224,8 +230,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_lj_gromacs.h b/lib/gpu/lal_lj_gromacs.h index 3dec13c6d7..8fedaf07a1 100644 --- a/lib/gpu/lal_lj_gromacs.h +++ b/lib/gpu/lal_lj_gromacs.h @@ -76,7 +76,7 @@ class LJGROMACS : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_gromacs_ext.cpp b/lib/gpu/lal_lj_gromacs_ext.cpp index 99d32ab09a..19d1d12513 100644 --- a/lib/gpu/lal_lj_gromacs_ext.cpp +++ b/lib/gpu/lal_lj_gromacs_ext.cpp @@ -58,7 +58,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); @@ -77,7 +77,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); diff --git a/lib/gpu/lal_lj_sdk.cpp b/lib/gpu/lal_lj_sdk.cpp index c6a282576c..0da094c953 100644 --- a/lib/gpu/lal_lj_sdk.cpp +++ b/lib/gpu/lal_lj_sdk.cpp @@ -113,20 +113,9 @@ double CGCMMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CGCMMT::loop(const bool _eflag, const bool _vflag) { +int CGCMMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, 
&this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -149,6 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CGCMM; diff --git a/lib/gpu/lal_lj_sdk.cu b/lib/gpu/lal_lj_sdk.cu index 249b29a4b2..1bd9a93d5e 100644 --- a/lib/gpu/lal_lj_sdk.cu +++ b/lib/gpu/lal_lj_sdk.cu @@ -39,22 +39,25 @@ __kernel void k_lj_sdk(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && eflag) energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- lj3[mtype].z; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -111,9 +114,9 @@ __kernel void k_lj_sdk(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_, @@ -132,27 +135,30 @@ __kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) + if (EVFLAG && eflag) energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- lj3[mtype].z; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -209,8 +215,7 @@ __kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_lj_sdk.h b/lib/gpu/lal_lj_sdk.h index fc50756a3f..043bafdda8 100644 --- a/lib/gpu/lal_lj_sdk.h +++ b/lib/gpu/lal_lj_sdk.h @@ -71,7 +71,7 @@ class CGCMM : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_sdk_ext.cpp b/lib/gpu/lal_lj_sdk_ext.cpp index de0c5fef4f..4497233861 100644 --- a/lib/gpu/lal_lj_sdk_ext.cpp +++ b/lib/gpu/lal_lj_sdk_ext.cpp @@ -56,7 +56,7 @@ int sdk_gpu_init(const int ntypes, double **cutsq, int **cg_types, int init_ok=0; if (world_me==0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, 
cell_size, gpu_split, screen); CMMMF.device->world_barrier(); @@ -74,7 +74,7 @@ int sdk_gpu_init(const int ntypes, double **cutsq, int **cg_types, } if (gpu_rank==i && world_me!=0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); CMMMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_sdk_long.cpp b/lib/gpu/lal_lj_sdk_long.cpp index 74dbfc40e3..d78e8d84da 100644 --- a/lib/gpu/lal_lj_sdk_long.cpp +++ b/lib/gpu/lal_lj_sdk_long.cpp @@ -124,20 +124,9 @@ double CGCMMLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { +int CGCMMLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -145,8 +134,8 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CGCMMLong; diff --git a/lib/gpu/lal_lj_sdk_long.cu b/lib/gpu/lal_lj_sdk_long.cu index 6dd1829c71..3972ed2076 100644 --- a/lib/gpu/lal_lj_sdk_long.cu +++ b/lib/gpu/lal_lj_sdk_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].y) { @@ -138,7 +141,7 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, lj3[mtype].w; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -149,9 +152,9 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, @@ -173,6 +176,9 
@@ __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].y) { @@ -264,7 +270,7 @@ __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, lj3[mtype].w; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -275,8 +281,7 @@ __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_lj_sdk_long.h b/lib/gpu/lal_lj_sdk_long.h index 608488bd30..102b007b59 100644 --- a/lib/gpu/lal_lj_sdk_long.h +++ b/lib/gpu/lal_lj_sdk_long.h @@ -75,7 +75,7 @@ class CGCMMLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_sdk_long_ext.cpp b/lib/gpu/lal_lj_sdk_long_ext.cpp index f293487282..3170ac8b52 100644 --- a/lib/gpu/lal_lj_sdk_long_ext.cpp +++ b/lib/gpu/lal_lj_sdk_long_ext.cpp @@ -58,7 +58,7 @@ int sdkl_gpu_init(const int ntypes, double **cutsq, int **cg_type, int init_ok=0; if (world_me==0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); @@ -77,7 +77,7 @@ int sdkl_gpu_init(const int ntypes, double **cutsq, int **cg_type, } if (gpu_rank==i && world_me!=0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_tip4p_long.cpp b/lib/gpu/lal_lj_tip4p_long.cpp index 1f3b32248c..66477d1fb4 100644 --- a/lib/gpu/lal_lj_tip4p_long.cpp +++ b/lib/gpu/lal_lj_tip4p_long.cpp @@ -65,6 +65,12 @@ int LJTIP4PLongT::init(const int ntypes, k_pair_distrib.set_function(*this->pair_program,"k_lj_tip4p_long_distrib"); k_pair_reneigh.set_function(*this->pair_program,"k_lj_tip4p_reneigh"); k_pair_newsite.set_function(*this->pair_program,"k_lj_tip4p_newsite"); + #if defined(LAL_OCL_EV_JIT) + k_pair_distrib_noev.set_function(*this->pair_program_noev, + "k_lj_tip4p_long_distrib"); + #else + k_pair_dt_sel = &k_pair_distrib; + #endif TypeH = tH; TypeO = tO; @@ -151,6 +157,9 @@ void LJTIP4PLongT::clear() { k_pair_distrib.clear(); k_pair_reneigh.clear(); k_pair_newsite.clear(); + #if defined(LAL_OCL_EV_JIT) + k_pair_distrib_noev.clear(); + #endif this->clear_atomic(); } @@ -164,19 +173,9 @@ double LJTIP4PLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) { +int LJTIP4PLongT::loop(const int eflag, const int vflag) { // Compute the block 
size and grid size to keep all cores busy
   const int BX=this->block_size();
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;
 
   int ainum=this->ans->inum();
   const int nall = this->atom->nall();
@@ -210,8 +209,8 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
   this->ansO.zero();
   this->device->gpu->sync();
   if(shared_types) {
-    this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
+    this->k_pair_sel->set_size(GX,BX);
+    this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag, &vflag,
                           &ainum, &nbor_pitch, &this->_threads_per_atom,
@@ -228,12 +227,19 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
                      &this->atom->q, &cutsq, &_qqrd2e, &_g_ewald,
                      &cut_coulsq, &cut_coulsqplus, &this->ansO);
   }
+  #if defined(LAL_OCL_EV_JIT)
+  if (eflag || vflag) k_pair_dt_sel = &k_pair_distrib;
+  else k_pair_dt_sel = &k_pair_distrib_noev;
+  #endif
+
   GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
-  this->k_pair_distrib.set_size(GX,BX);
-  this->k_pair_distrib.run(&this->atom->x, &this->ans->force, &this->ans->engv,
-      &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom,
-      &hneight, &m, &TypeO, &TypeH, &alpha,&this->atom->q, &this->ansO);
+  k_pair_dt_sel->set_size(GX,BX);
+  k_pair_dt_sel->run(&this->atom->x, &this->ans->force, &this->ans->engv,
+                     &eflag, &vflag, &ainum, &nbor_pitch,
+                     &this->_threads_per_atom, &hneight, &m, &TypeO, &TypeH,
+                     &alpha,&this->atom->q, &this->ansO);
   this->time_pair.stop();
+  return GX;
 }
 
@@ -269,22 +275,26 @@ void LJTIP4PLongT::copy_relations_data(int n, tagint *tag, int *map_array,
   }
 }
 
-
-
-
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, virials,..
// --------------------------------------------------------------------------- template void LJTIP4PLongT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -315,7 +325,7 @@ void LJTIP4PLongT::compute(const int f_ago, const int inum_full, t_ago = ago; loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,inum); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -325,16 +335,23 @@ void LJTIP4PLongT::compute(const int f_ago, const int inum_full, // --------------------------------------------------------------------------- template int** LJTIP4PLongT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int *map_array, int map_size, int *sametag, int max_same, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int *map_array, int map_size, int *sametag, + int max_same, int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -373,7 +390,7 @@ int** LJTIP4PLongT::compute(const int ago, const int inum_full, t_ago = ago; loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); diff --git a/lib/gpu/lal_lj_tip4p_long.cu b/lib/gpu/lal_lj_tip4p_long.cu index 782ae43662..bd900d9244 100644 --- a/lib/gpu/lal_lj_tip4p_long.cu +++ b/lib/gpu/lal_lj_tip4p_long.cu @@ -129,7 +129,7 @@ __kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_, f.x += fM.x * (acctyp)0.5 * alpha; f.y += fM.y * (acctyp)0.5 * alpha; f.z += fM.z * (acctyp)0.5 * alpha; - if (vflag > 0) { + if (EVFLAG && vflag) { vM = ansO[inum +iO]; engv[inum*engv_iter + i] += vM.x * (acctyp)0.5 * alpha; engv_iter++; engv[inum*engv_iter + i] += vM.y * (acctyp)0.5 * alpha; engv_iter++; @@ -147,13 +147,13 @@ 
__kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_, f.x += fM.x * (acctyp)(1 - alpha); f.y += fM.y * (acctyp)(1 - alpha); f.z += fM.z * (acctyp)(1 - alpha); - if (eflag > 0) { + if (EVFLAG && eflag) { eM = engv[i+inum]; engv[inum+i] = eM*(acctyp)(1 - alpha); if (iH1 < inum) engv[inum+iH1] += eM * (acctyp)0.5 * alpha; if (iH2 < inum) engv[inum+iH2] += eM * (acctyp)0.5 * alpha; } - if (vflag > 0) { + if (EVFLAG && vflag) { vM = ansO[inum + i]; engv[inum*engv_iter + i] += vM.x * (acctyp)(1 - alpha); engv_iter++; engv[inum*engv_iter + i] += vM.y * (acctyp)(1 - alpha); engv_iter++; @@ -276,22 +276,27 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy = (acctyp)0; - acctyp e_coul = (acctyp)0; + int n_stride; + local_allocate_store_charge(); + acctyp4 f, fO; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0; - acctyp virial[6],vO[6]; - for (int i=0; i<6; i++) { - virial[i]=(acctyp)0; - vO[i]=(acctyp)0; + acctyp energy, e_coul, virial[6], vO[6]; + if (EVFLAG) { + energy = (acctyp)0; + e_coul = (acctyp)0; + for (int i=0; i<6; i++) { + virial[i]=(acctyp)0; + vO[i]=(acctyp)0; + } } + int i; if (ii0) { + if (EVFLAG && eflag) { numtyp e = r6inv * (lj3[mtype].x*r6inv-lj3[mtype].y); energy += factor_lj * (e - lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*forcelj; virial[1] += dely*dely*forcelj; virial[2] += delz*delz*forcelj; @@ -396,10 +401,10 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, fO.z += delz * force_coul; fO.w += 0; } - if (eflag>0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { + if (EVFLAG && vflag) { acctyp4 fd; fd.x = delx*force_coul; fd.y = dely*force_coul; @@ -489,10 +494,10 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, f.y += fd.y; f.z += fd.z; - if (eflag>0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul) * (acctyp)0.5 * alpha; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp4 xH1; fetch4(xH1,iH1,pos_tex); numtyp4 xH2; fetch4(xH2,iH2,pos_tex); numtyp4 xO; fetch4(xO,iO,pos_tex); @@ -508,62 +513,64 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, } } // if cut_coulsqplus } // for nbor - if (t_per_atom>1) { -#if (ARCH < 300) - __local acctyp red_acc[6][BLOCK_PAIR]; - red_acc[0][tid]=fO.x; - red_acc[1][tid]=fO.y; - red_acc[2][tid]=fO.z; - red_acc[3][tid]=fO.w; + } // if ii + if (t_per_atom>1) { +#if (SHUFFLE_AVAIL == 0) + red_acc[0][tid]=fO.x; + red_acc[1][tid]=fO.y; + red_acc[2][tid]=fO.z; + red_acc[3][tid]=fO.w; + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + simdsync(); + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + fO.x=red_acc[0][tid]; + fO.y=red_acc[1][tid]; + fO.z=red_acc[2][tid]; + fO.w=red_acc[3][tid]; + if (EVFLAG && vflag) { + simdsync(); + for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + simdsync(); if (offset < s) { - for (int r=0; r<4; r++) + for (int r=0; r<6; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } - fO.x=red_acc[0][tid]; - fO.y=red_acc[1][tid]; - fO.z=red_acc[2][tid]; - fO.w=red_acc[3][tid]; - if (vflag>0) { - for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<6; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; - } 
+ for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; + } #else + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + fO.x += shfl_down(fO.x, s, t_per_atom); + fO.y += shfl_down(fO.y, s, t_per_atom); + fO.z += shfl_down(fO.z, s, t_per_atom); + fO.w += shfl_down(fO.w, s, t_per_atom); + } + if (EVFLAG && vflag) { for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - fO.x += shfl_xor(fO.x, s, t_per_atom); - fO.y += shfl_xor(fO.y, s, t_per_atom); - fO.z += shfl_xor(fO.z, s, t_per_atom); - fO.w += shfl_xor(fO.w, s, t_per_atom); - } - if (vflag>0) { - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - for (int r=0; r<6; r++) - vO[r] += shfl_xor(vO[r], s, t_per_atom); - } + for (int r=0; r<6; r++) + vO[r] += shfl_down(vO[r], s, t_per_atom); } + } #endif + } + if(offset == 0 && ii0) { - ansO[inum + i].x = vO[0]; - ansO[inum + i].y = vO[1]; - ansO[inum + i].z = vO[2]; - ansO[inum*2 + i].x = vO[3]; - ansO[inum*2 + i].y = vO[4]; - ansO[inum*2 + i].z = vO[5]; - } - } - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); - } // if ii + } + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_, @@ -592,28 +599,32 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy = (acctyp)0; - acctyp e_coul = (acctyp)0; acctyp4 f, fO; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0; - acctyp virial[6],vO[6]; - for (int i=0; i<6; i++) { - virial[i]=(acctyp)0; - vO[i]=(acctyp)0; + acctyp energy, e_coul, virial[6], vO[6]; + if (EVFLAG) { + energy = (acctyp)0; + e_coul = (acctyp)0; + for (int i=0; i<6; i++) { + virial[i]=(acctyp)0; + vO[i]=(acctyp)0; + } } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e = r6inv * (lj3[mtype].x*r6inv-lj3[mtype].y); energy += factor_lj * (e - lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*forcelj; virial[1] += dely*dely*forcelj; virial[2] += delz*delz*forcelj; @@ -720,10 +731,10 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_, fO.z += delz * force_coul; fO.w += 0; } - if (eflag>0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { + if (EVFLAG && vflag) { acctyp4 fd; fd.x = delx*force_coul; fd.y = dely*force_coul; @@ -813,10 +824,10 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_, f.y += fd.y; f.z += fd.z; - if (eflag>0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul) * (acctyp)0.5 * alpha; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp4 xH1; fetch4(xH1,iH1,pos_tex); numtyp4 xH2; fetch4(xH2,iH2,pos_tex); numtyp4 xO; fetch4(xO,iO,pos_tex); @@ -833,13 +844,13 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_, } // if cut_coulsqplus } // for nbor if (t_per_atom>1) { -#if (ARCH < 300) - __local acctyp red_acc[6][BLOCK_PAIR]; +#if (SHUFFLE_AVAIL == 0) red_acc[0][tid]=fO.x; red_acc[1][tid]=fO.y; red_acc[2][tid]=fO.z; red_acc[3][tid]=fO.w; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + simdsync(); if (offset < s) { for (int r=0; r<4; r++) red_acc[r][tid] += red_acc[r][tid+s]; @@ -849,9 +860,10 @@ __kernel void 
k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_, fO.y=red_acc[1][tid]; fO.z=red_acc[2][tid]; fO.w=red_acc[3][tid]; - if (vflag>0) { + if (EVFLAG && vflag) { for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + simdsync(); if (offset < s) { for (int r=0; r<6; r++) red_acc[r][tid] += red_acc[r][tid+s]; @@ -861,22 +873,22 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_, } #else for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - fO.x += shfl_xor(fO.x, s, t_per_atom); - fO.y += shfl_xor(fO.y, s, t_per_atom); - fO.z += shfl_xor(fO.z, s, t_per_atom); - fO.w += shfl_xor(fO.w, s, t_per_atom); + fO.x += shfl_down(fO.x, s, t_per_atom); + fO.y += shfl_down(fO.y, s, t_per_atom); + fO.z += shfl_down(fO.z, s, t_per_atom); + fO.w += shfl_down(fO.w, s, t_per_atom); } - if (vflag>0) { + if (EVFLAG && vflag) { for (unsigned int s=t_per_atom/2; s>0; s>>=1) { for (int r=0; r<6; r++) - vO[r] += shfl_xor(vO[r], s, t_per_atom); + vO[r] += shfl_down(vO[r], s, t_per_atom); } } #endif } if(offset == 0) { ansO[i] = fO; - if (vflag>0) { + if (EVFLAG && vflag) { ansO[inum + i].x = vO[0]; ansO[inum + i].y = vO[1]; ansO[inum + i].z = vO[2]; @@ -885,7 +897,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_, ansO[inum*2 + i].z = vO[5]; } } - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_tip4p_long.h b/lib/gpu/lal_lj_tip4p_long.h index 90c342e246..b163a62309 100644 --- a/lib/gpu/lal_lj_tip4p_long.h +++ b/lib/gpu/lal_lj_tip4p_long.h @@ -74,13 +74,13 @@ public: /// Reimplement BaseCharge pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag,int *map_array, int map_size, int *sametag, int max_same, - int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd); + double **host_x, int *host_type, double *sublo, double *subhi, + tagint *tag,int *map_array, int map_size, int *sametag, + int max_same, int **nspecial, tagint **special, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **numj, + const double cpu_time, bool &success, double *charge, + double *boxlo, double *prd); // --------------------------- TYPE DATA -------------------------- @@ -115,11 +115,12 @@ public: UCL_D_Vec atom_sametag; UCL_Kernel k_pair_distrib, k_pair_reneigh, k_pair_newsite; + UCL_Kernel k_pair_distrib_noev, *k_pair_dt_sel; private: bool _allocated; int t_ago; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_tip4p_long_ext.cpp b/lib/gpu/lal_lj_tip4p_long_ext.cpp index d0d6c7a3d2..7395506c2d 100644 --- a/lib/gpu/lal_lj_tip4p_long_ext.cpp +++ b/lib/gpu/lal_lj_tip4p_long_ext.cpp @@ -62,7 +62,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (world_me==0) init_ok=LJTIP4PLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, - tH, tO, alpha, qdist, nall, 300, + tH, tO, alpha, qdist, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, 
host_cut_coulsq, host_cut_coulsqplus, host_special_coul, qqrd2e, g_ewald, map_size, max_same); @@ -83,7 +83,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (gpu_rank==i && world_me!=0) init_ok=LJTIP4PLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, - tH, tO, alpha, qdist, nall, 300, maxspecial, + tH, tO, alpha, qdist, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_cut_coulsqplus, host_special_coul, qqrd2e, @@ -97,7 +97,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fprintf(screen,"\n"); if (init_ok==0) - LJTIP4PLMF.estimate_gpu_overhead(); + LJTIP4PLMF.estimate_gpu_overhead(2); return init_ok; } diff --git a/lib/gpu/lal_mie.cpp b/lib/gpu/lal_mie.cpp index 394d1f8a2f..e370b7bde5 100644 --- a/lib/gpu/lal_mie.cpp +++ b/lib/gpu/lal_mie.cpp @@ -113,20 +113,9 @@ double MieT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void MieT::loop(const bool _eflag, const bool _vflag) { +int MieT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void MieT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &mie1, &mie3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &mie1, &mie3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -147,6 +136,7 @@ void MieT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Mie; diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu index 36ec8a496b..fedfaf157a 100644 --- a/lib/gpu/lal_mie.cu +++ b/lib/gpu/lal_mie.cu @@ -39,22 +39,25 @@ __kernel void k_mie(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=(mie3[mtype].x*rgamR - mie3[mtype].y*rgamA) - mie3[mtype].z; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -105,9 +108,9 @@ __kernel void k_mie(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_mie_fast(const __global numtyp4 *restrict x_, @@ -126,6 +129,9 @@ __kernel void k_mie_fast(const __global 
numtyp4 *restrict x_, __local numtyp4 mie1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 mie3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=(mie3[mtype].x*rgamR - mie3[mtype].y*rgamA) - mie3[mtype].z; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -196,8 +202,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_mie.h b/lib/gpu/lal_mie.h index dfc2ee6e53..9a41596ccb 100644 --- a/lib/gpu/lal_mie.h +++ b/lib/gpu/lal_mie.h @@ -72,7 +72,7 @@ class Mie : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp index f612de4336..5cbb9c29d2 100644 --- a/lib/gpu/lal_mie_ext.cpp +++ b/lib/gpu/lal_mie_ext.cpp @@ -58,7 +58,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, if (world_me==0) init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3, host_mie4, host_gamA, host_gamR, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); MLMF.device->world_barrier(); @@ -77,7 +77,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, if (gpu_rank==i && world_me!=0) init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3, host_mie4, host_gamA, host_gamR, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); MLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp index 09da65d252..4bedc67ed7 100644 --- a/lib/gpu/lal_morse.cpp +++ b/lib/gpu/lal_morse.cpp @@ -112,20 +112,9 @@ double MorseT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void MorseT::loop(const bool _eflag, const bool _vflag) { +int MorseT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -133,8 +122,8 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &mor1, &mor2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -147,6 +136,7 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Morse; diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu index d6bab1e131..b1c8f2673b 100644 --- 
a/lib/gpu/lal_morse.cu +++ b/lib/gpu/lal_morse.cu @@ -41,22 +41,25 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y; energy+=e*factor_lj; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -106,9 +109,9 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_morse_fast(const __global numtyp4 *restrict x_, @@ -127,27 +130,30 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_, __local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) mor2[tid]=mor2_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y; energy+=e*factor_lj; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -197,8 +203,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_morse.h b/lib/gpu/lal_morse.h index bf5f1c0f8f..c5948d8be8 100644 --- a/lib/gpu/lal_morse.h +++ b/lib/gpu/lal_morse.h @@ -71,7 +71,7 @@ class Morse : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp index 3b62d10305..f43676a1b5 100644 --- a/lib/gpu/lal_morse_ext.cpp +++ b/lib/gpu/lal_morse_ext.cpp @@ -56,7 +56,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, int init_ok=0; if (world_me==0) init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); MORMF.device->world_barrier(); @@ -74,7 +74,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, } if (gpu_rank==i && world_me!=0) init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); 
MORMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 6c4890ef47..aabba49575 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -1,6 +1,7 @@ /*************************************************************************** neighbor.cpp ------------------- + Nitin Dhamankar (Intel) W. Michael Brown (ORNL) Peng Wang (Nvidia) @@ -32,22 +33,25 @@ int Neighbor::bytes_per_atom(const int max_nbors) const { } bool Neighbor::init(NeighborShared *shared, const int inum, - const int host_inum, const int max_nbors, - const int maxspecial, UCL_Device &devi, - const int gpu_nbor, const int gpu_host, - const bool pre_cut, const int block_cell_2d, - const int block_cell_id, const int block_nbor_build, - const int threads_per_atom, const int warp_size, - const bool time_device, - const std::string compile_flags) { + const int host_inum, const int max_nbors, + const int maxspecial, UCL_Device &devi, const int gpu_nbor, + const int gpu_host, const bool pre_cut, + const int block_cell_2d, const int block_cell_id, + const int block_nbor_build, const int threads_per_atom, + const int simd_size, const bool time_device, + const std::string compile_flags, const bool ilist_map) { clear(); + _ilist_map = ilist_map; _threads_per_atom=threads_per_atom; _block_cell_2d=block_cell_2d; _block_cell_id=block_cell_id; - _max_block_nbor_build=block_nbor_build; _block_nbor_build=block_nbor_build; - _warp_size=warp_size; + _simd_size=simd_size; + #ifndef LAL_USE_OLD_NEIGHBOR + if (_block_nbor_build < _simd_size) + _block_nbor_build = _simd_size; + #endif _shared=shared; dev=&devi; _gpu_nbor=gpu_nbor; @@ -90,7 +94,13 @@ bool Neighbor::init(NeighborShared *shared, const int inum, _max_atoms=1000; _max_host=static_cast(static_cast(host_inum)*1.10); - _max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom; + + _max_neighbor_factor=1.0e-2*max_nbors*1.1; + if (_gpu_nbor != 1) + _max_nbors=0; + else + _max_nbors=300; + if (_old_max_nbors) _max_nbors=_old_max_nbors; _maxspecial=maxspecial; if (gpu_nbor==0) @@ -103,8 +113,36 @@ bool Neighbor::init(NeighborShared *shared, const int inum, if (!success) return false; - if (_use_packing==false) - _shared->compile_kernels(devi,gpu_nbor,compile_flags); + if (_use_packing==false) { + #ifndef LAL_USE_OLD_NEIGHBOR + _shared->compile_kernels(devi, gpu_nbor, compile_flags+ + " -DMAX_SUBGROUPS_PER_BLOCK="+toa(_block_nbor_build/_simd_size)); + #else + _shared->compile_kernels(devi,gpu_nbor,compile_flags); + #endif + + #ifndef LAL_USE_OLD_NEIGHBOR + if (_gpu_nbor) { + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || \ + defined(CL_VERSION_3_0)) + if (dev->has_subgroup_support()) { + int simd_size_kernel= + _shared->k_build_nbor.max_subgroup_size(_block_nbor_build); + if (_simd_size != simd_size_kernel) { + _simd_size = simd_size_kernel; + if (_block_nbor_build < _simd_size) + _block_nbor_build = _simd_size; + _shared->clear(); + _shared->compile_kernels(devi, gpu_nbor, compile_flags+ + " -DMAX_SUBGROUPS_PER_BLOCK="+toa(_block_nbor_build/_simd_size)); + } + } + #endif + _bin_stencil.get_global(*(_shared->build_program),"bin_stencil"); + } + #endif + } + _max_block_nbor_build=_block_nbor_build; return success; } @@ -113,24 +151,44 @@ void Neighbor::alloc(bool &success) { dev_nbor.clear(); host_acc.clear(); int nt=_max_atoms+_max_host; - if (_use_packing==false || _gpu_nbor>0) - success=success && - (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS); - else + if (_max_nbors) + _max_nbors = 
((_max_nbors-1)/_threads_per_atom+1)*_threads_per_atom; + if (_use_packing==false || _gpu_nbor>0) { + if (_max_nbors) + success=success && + (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS); + } else success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - success=success && (host_acc.alloc(nt*2,*dev, - UCL_READ_WRITE)==UCL_SUCCESS); + if (_gpu_nbor != 2 || _max_host>0) + success=success && (host_acc.alloc(nt*2,*dev, + UCL_READ_WRITE)==UCL_SUCCESS); _c_bytes=dev_nbor.row_bytes(); if (_alloc_packed) { + if (_use_packing==false) { + dev_packed_begin.clear(); + success=success && (dev_packed_begin.alloc(_max_atoms,*dev, + _packed_permissions)==UCL_SUCCESS); + } + dev_packed.clear(); - success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev, - _packed_permissions)==UCL_SUCCESS); - dev_ilist.clear(); - success=success && (dev_ilist.alloc(_max_atoms,*dev, - UCL_READ_WRITE)==UCL_SUCCESS); - _c_bytes+=dev_packed.row_bytes()+dev_ilist.row_bytes(); + if (_max_nbors) + success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev, + _packed_permissions)==UCL_SUCCESS); + if (_ilist_map) { + if (_gpu_nbor) { + if (three_ilist.numel()==0) + success=success && (three_ilist.alloc(16,*dev,UCL_READ_WRITE, + UCL_READ_ONLY)==UCL_SUCCESS); + } else { + three_ilist.clear(); + success=success && (three_ilist.alloc(_max_atoms,*dev,UCL_READ_WRITE, + UCL_READ_ONLY)==UCL_SUCCESS); + } + _c_bytes+=three_ilist.row_bytes(); + } + _c_bytes+=dev_packed.row_bytes()+dev_packed_begin.row_bytes(); } if (_max_host>0) { nbor_host.clear(); @@ -138,8 +196,9 @@ void Neighbor::alloc(bool &success) { host_ilist.clear(); host_jlist.clear(); - success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE, - UCL_READ_WRITE)==UCL_SUCCESS) && success; + if (_max_nbors) + success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE, + UCL_READ_WRITE)==UCL_SUCCESS) && success; success=success && (dev_numj_host.alloc(_max_host,*dev, UCL_READ_WRITE)==UCL_SUCCESS); success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); @@ -157,7 +216,8 @@ void Neighbor::alloc(bool &success) { ptr+=_max_nbors; } _c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes(); - } else { + } else if (dev_nbor.numel()) { + if (!success) return; // Some OpenCL implementations return errors for nullptr pointers as args nbor_host.device.view(dev_nbor); dev_numj_host.view(dev_nbor); @@ -188,6 +248,12 @@ void Neighbor::clear() { if (_ncells>0) { _ncells=0; cell_counts.clear(); +#ifndef LAL_USE_OLD_NEIGHBOR + cell_subgroup_counts.clear(); + subgroup2cell.clear(); + _host_bin_stencil.clear(); + _bin_stencil.clear(); +#endif if (_gpu_nbor==2) delete [] cell_iter; } @@ -195,12 +261,15 @@ void Neighbor::clear() { _allocated=false; _nbor_time_avail=false; + _old_max_nbors=_max_nbors; + _max_nbors=0; host_packed.clear(); host_acc.clear(); - dev_ilist.clear(); + three_ilist.clear(); dev_nbor.clear(); nbor_host.clear(); dev_packed.clear(); + dev_packed_begin.clear(); dev_numj_host.clear(); host_ilist.clear(); host_jlist.clear(); @@ -236,9 +305,9 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, UCL_H_Vec ilist_view; ilist_view.view(ilist,inum,*dev); ucl_copy(dev_nbor,ilist_view,false); - - UCL_D_Vec nbor_offset; - UCL_H_Vec host_offset; + #ifndef GERYON_OCL_FLUSH + dev_nbor.flush(); + #endif int copy_count=0; int ij_count=0; @@ -263,9 +332,12 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, if (ij_count==IJ_SIZE) { dev_nbor.sync(); - 
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE); - nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE); - ucl_copy(nbor_offset,host_offset,true); + _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE); + _nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE); + ucl_copy(_nbor_offset,_host_offset,true); + #ifndef GERYON_OCL_FLUSH + _nbor_offset.flush(); + #endif copy_count++; ij_count=0; dev_count+=IJ_SIZE; @@ -275,21 +347,29 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, } if (ij_count!=0) { dev_nbor.sync(); - host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count); - nbor_offset.view_offset(dev_count,dev_packed,ij_count); - ucl_copy(nbor_offset,host_offset,true); + _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count); + _nbor_offset.view_offset(dev_count,dev_packed,ij_count); + ucl_copy(_nbor_offset,_host_offset,true); + } + _acc_view.view_offset(inum,dev_nbor,inum*2); + if (_use_packing) + ucl_copy(_acc_view,host_acc,inum*2,true); + else { + ucl_copy(_acc_view,host_acc,inum,true); + _host_offset.view_offset(inum,host_acc,inum); + ucl_copy(dev_packed_begin,_host_offset,inum,true); } - UCL_D_Vec acc_view; - acc_view.view_offset(inum,dev_nbor,inum*2); - ucl_copy(acc_view,host_acc,inum*2,true); - UCL_H_Vec host_view; - host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE); - for (int ii=0; ii(ceil(static_cast(inum)*_threads_per_atom/ block_size)); _shared->k_nbor.set_size(GX,block_size); - _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom); + _shared->k_nbor.run(&dev_nbor, &dev_packed, &dev_packed_begin, &inum, + &_threads_per_atom); time_kernel.stop(); } } @@ -315,9 +396,6 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, ilist_view.view(ilist,inum,*dev); ucl_copy(dev_nbor,ilist_view,false); - UCL_D_Vec nbor_offset; - UCL_H_Vec host_offset; - int copy_count=0; int ij_count=0; int acc_count=0; @@ -346,9 +424,9 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, if (ij_count==IJ_SIZE) { dev_nbor.sync(); - host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE); - nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE); - ucl_copy(nbor_offset,host_offset,true); + _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE); + _nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE); + ucl_copy(_nbor_offset,_host_offset,true); copy_count++; ij_count=0; dev_count+=IJ_SIZE; @@ -358,13 +436,18 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, } if (ij_count!=0) { dev_nbor.sync(); - host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count); - nbor_offset.view_offset(dev_count,dev_packed,ij_count); - ucl_copy(nbor_offset,host_offset,true); + _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count); + _nbor_offset.view_offset(dev_count,dev_packed,ij_count); + ucl_copy(_nbor_offset,_host_offset,true); + } + _acc_view.view_offset(inum,dev_nbor,inum*2); + if (_use_packing) + ucl_copy(_acc_view,host_acc,inum*2,true); + else { + ucl_copy(_acc_view,host_acc,inum,true); + _host_offset.view_offset(inum,host_acc,inum); + ucl_copy(dev_packed_begin,_host_offset,inum,true); } - UCL_D_Vec acc_view; - acc_view.view_offset(inum,dev_nbor,inum*2); - ucl_copy(acc_view,host_acc,inum*2,true); time_nbor.stop(); if (_use_packing==false) { @@ -372,20 +455,28 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, int 
GX=static_cast(ceil(static_cast(inum)*_threads_per_atom/ block_size)); _shared->k_nbor.set_size(GX,block_size); - _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom); + _shared->k_nbor.run(&dev_nbor, &dev_packed, &dev_packed_begin, &inum, + &_threads_per_atom); time_kernel.stop(); } } template -void Neighbor::resize_max_neighbors(const int maxn, bool &success) { +void Neighbor::resize_max_neighbors(int maxn, bool &success) { + if (maxn == 0) maxn = 1; if (maxn>_max_nbors) { int mn=static_cast(static_cast(maxn)*1.10); - mn=(mn/_threads_per_atom+1)*_threads_per_atom; - success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS); + mn = ((mn-1)/_threads_per_atom+1)*_threads_per_atom; + dev_nbor.clear(); + success=success && + (dev_nbor.alloc((mn+2)*_max_atoms,*dev)==UCL_SUCCESS); + if (!success) return; _gpu_bytes=dev_nbor.row_bytes(); if (_max_host>0) { - success=success && (nbor_host.resize(mn*_max_host)==UCL_SUCCESS); + nbor_host.clear(); + success=(nbor_host.alloc(mn*_max_host,*dev,UCL_READ_WRITE, + UCL_READ_WRITE)==UCL_SUCCESS) && success; + if (!success) return; int *ptr=nbor_host.host.begin(); for (int i=0; i<_max_host; i++) { host_jlist[i]=ptr; @@ -397,7 +488,9 @@ void Neighbor::resize_max_neighbors(const int maxn, bool &success) { dev_numj_host.view(dev_nbor); } if (_alloc_packed) { - success=success && (dev_packed.resize((mn+2)*_max_atoms)==UCL_SUCCESS); + dev_packed.clear(); + success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev, + _packed_permissions)==UCL_SUCCESS); _gpu_bytes+=dev_packed.row_bytes(); } _max_nbors=mn; @@ -409,32 +502,66 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success, - int &mn) { + int &mn, UCL_Vector &error_flag) { _nbor_time_avail=true; const int nt=inum+host_inum; + const double subx = subhi[0]-sublo[0]; + const double suby = subhi[1]-sublo[1]; + const double subz = subhi[2]-sublo[2]; + // Calculate number of cells and allocate storage for binning as necessary - int ncellx, ncelly, ncellz, ncell_3d; - int ghost_cells=2*_cells_in_cutoff; - ncellx = static_cast(ceil((subhi[0]-sublo[0])/_cell_size))+ghost_cells; - ncelly = static_cast(ceil((subhi[1]-sublo[1])/_cell_size))+ghost_cells; - ncellz = static_cast(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells; - ncell_3d = ncellx * ncelly * ncellz; + int ncellx, ncelly, ncellz; + int cells_in_cutoff=static_cast(ceil(_cutoff/_cell_size)); + int ghost_cells=2*cells_in_cutoff; + ncellx = static_cast(ceil(subx/_cell_size))+ghost_cells; + ncelly = static_cast(ceil(suby/_cell_size))+ghost_cells; + ncellz = static_cast(ceil(subz/_cell_size))+ghost_cells; + + #ifndef LAL_USE_OLD_NEIGHBOR + if (_auto_cell_size && subz>0.0) { + if (_old_ncellx!=ncellx || _old_ncelly!=ncelly || _old_ncellz!=ncellz) { + _cell_size = _shared->best_cell_size(subx, suby, subz, nt, _cutoff); + cells_in_cutoff=static_cast(ceil(_cutoff/_cell_size)); + ghost_cells=2*cells_in_cutoff; + ncellx = static_cast(ceil(subx/_cell_size))+ghost_cells; + ncelly = static_cast(ceil(suby/_cell_size))+ghost_cells; + ncellz = static_cast(ceil(subz/_cell_size))+ghost_cells; + } + } + #endif + + int ncell_3d = ncellx * ncelly * ncellz; if (ncell_3d+1>_ncells) { cell_counts.clear(); +#ifndef LAL_USE_OLD_NEIGHBOR + cell_subgroup_counts.clear(); +#endif if (_gpu_nbor==2) { if (_ncells>0) delete [] cell_iter; cell_iter = new int[ncell_3d+1]; - 
cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY); + success = success && (cell_counts.alloc(ncell_3d+1,*dev, + UCL_READ_WRITE,UCL_READ_ONLY) == UCL_SUCCESS); +#ifndef LAL_USE_OLD_NEIGHBOR + success = success && (cell_subgroup_counts.alloc(ncell_3d+1,*dev, + UCL_READ_WRITE,UCL_READ_ONLY) == UCL_SUCCESS); + if (!success) return; + cell_subgroup_counts.host[0]=0; +#endif } else { cell_counts.device.clear(); - cell_counts.device.alloc(ncell_3d+1,dev_nbor); + success = success && (cell_counts.device.alloc(ncell_3d+1, + *dev) == UCL_SUCCESS); } + if (!success) return; _ncells=ncell_3d+1; _cell_bytes=cell_counts.device.row_bytes(); +#ifndef LAL_USE_OLD_NEIGHBOR + _cell_bytes+=cell_subgroup_counts.row_bytes()+subgroup2cell.row_bytes(); +#endif } const numtyp cutoff_cast=static_cast(_cutoff); @@ -463,7 +590,13 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, } // If binning on CPU, do this now +#ifndef LAL_USE_OLD_NEIGHBOR + int subgroup_count = 0; +#endif if (_gpu_nbor==2) { + #ifndef GERYON_OCL_FLUSH + dev_nbor.flush(); + #endif double stime = MPI_Wtime(); int *cell_id=atom.host_cell_id.begin(); int *particle_id=atom.host_particle_id.begin(); @@ -472,21 +605,21 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, cell_counts.host.zero(); double i_cell_size=1.0/_cell_size; - int offset_hi=_cells_in_cutoff+1; + int offset_hi=cells_in_cutoff+1; for (int i=0; i(px*i_cell_size+1); - ix = std::max(ix,_cells_in_cutoff); + int ix = static_cast(px*i_cell_size+cells_in_cutoff); + ix = std::max(ix,cells_in_cutoff); ix = std::min(ix,ncellx-offset_hi); - int iy = static_cast(py*i_cell_size+1); - iy = std::max(iy,_cells_in_cutoff); + int iy = static_cast(py*i_cell_size+cells_in_cutoff); + iy = std::max(iy,cells_in_cutoff); iy = std::min(iy,ncelly-offset_hi); - int iz = static_cast(pz*i_cell_size+1); - iz = std::max(iz,_cells_in_cutoff); + int iz = static_cast(pz*i_cell_size+cells_in_cutoff); + iz = std::max(iz,cells_in_cutoff); iz = std::min(iz,ncellz-offset_hi); int id = ix+iy*ncellx+iz*ncellx*ncelly; @@ -494,19 +627,40 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, cell_counts[id+1]++; } +#ifndef LAL_USE_OLD_NEIGHBOR + // populate subgroup counts only for the local atoms + for (int i=1; i<_ncells; i++) { + cell_subgroup_counts[i] = ceil(static_cast(cell_counts[i]) / + _simd_size); + subgroup_count += cell_subgroup_counts[i]; + cell_subgroup_counts[i] += cell_subgroup_counts[i-1]; + } + if (subgroup_count > subgroup2cell.numel()) { + subgroup2cell.clear(); + success = success && (subgroup2cell.alloc(1.1*subgroup_count,*dev, + UCL_READ_WRITE,UCL_READ_ONLY) == UCL_SUCCESS); + if (!success) return; + _cell_bytes=cell_counts.device.row_bytes() + + cell_subgroup_counts.row_bytes()+subgroup2cell.row_bytes(); + } + for (int i=1; i<_ncells; i++) + for (int j=cell_subgroup_counts[i-1]; j(px*i_cell_size+1); + int ix = static_cast(px*i_cell_size); ix = std::max(ix,0); ix = std::min(ix,ncellx-1); - int iy = static_cast(py*i_cell_size+1); + int iy = static_cast(py*i_cell_size); iy = std::max(iy,0); iy = std::min(iy,ncelly-1); - int iz = static_cast(pz*i_cell_size+1); + int iz = static_cast(pz*i_cell_size); iz = std::max(iz,0); iz = std::min(iz,ncellz-1); @@ -518,21 +672,54 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, mn=0; for (int i=0; i<_ncells; i++) mn=std::max(mn,cell_counts[i]); - mn*=8; - set_nbor_block_size(mn/2); - + double mind=std::min(subx,suby); + 
mind=std::min(mind,subz) + _cutoff; + double ics; + if (mind >= _cell_size) ics = i_cell_size; + else ics = 1.0 / mind; + double vadjust=_cutoff*ics; + vadjust*=vadjust*vadjust*4.1888; + if (_cutoff < _cell_size) vadjust*=1.46; + mn=std::max(mn,static_cast(ceil(_max_neighbor_factor*vadjust*mn))); + if (mn<33) mn+=3; resize_max_neighbors(mn,success); + set_nbor_block_size(mn/2); if (!success) return; _total_atoms=nt; + // For neighbor builds for host atoms, _max_nbors is used for neighbor + // allocation offsets. + if (_max_host > 0) mn=_max_nbors; + cell_iter[0]=0; for (int i=1; i<_ncells; i++) { cell_counts[i]+=cell_counts[i-1]; cell_iter[i]=cell_counts[i]; } time_hybrid1.start(); - cell_counts.update_device(true); + #ifndef LAL_USE_OLD_NEIGHBOR + if (_old_ncellx!=ncellx || _old_ncelly!=ncelly || _old_ncellz!=ncellz) { + _old_ncellx = ncellx; + _old_ncelly = ncelly; + _old_ncellz = ncellz; + const int bin_stencil_stride = cells_in_cutoff * 2 + 1; + const int bin_stencil_size = bin_stencil_stride * bin_stencil_stride; + if (bin_stencil_size > _host_bin_stencil.numel()) + _host_bin_stencil.alloc(bin_stencil_size,*dev); + for (int s = 0; sk_cell_id.run(&atom.x, &atom.dev_cell_id, &atom.dev_particle_id, &sublo0, &sublo1, &sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz, - &nt, &nall, &_cells_in_cutoff); + &nt, &nall, &cells_in_cutoff); atom.sort_neighbor(nall); @@ -575,22 +762,37 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, /* build the neighbor list */ const int cell_block=_block_nbor_build; +#ifndef LAL_USE_OLD_NEIGHBOR + int nblocks = (subgroup_count-1)/(cell_block/_simd_size)+1; + _shared->k_build_nbor.set_size(nblocks, cell_block); + _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id, + &cell_counts, &dev_nbor, &nbor_host, + &dev_numj_host, &mn, &cutoff_cast, &ncellx, + &ncelly, &ncellz, &inum, &nt, &nall, + &_threads_per_atom, &cells_in_cutoff, + &cell_subgroup_counts, &subgroup2cell, + &subgroup_count, _bin_stencil.begin(), + &error_flag); + error_flag.update_host(); +#else _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)* (ncellz-ghost_cells),cell_block,1); _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id, &cell_counts, &dev_nbor, &nbor_host, - &dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx, + &dev_numj_host, &mn, &cutoff_cast, &ncellx, &ncelly, &ncellz, &inum, &nt, &nall, - &_threads_per_atom, &_cells_in_cutoff); + &_threads_per_atom, &cells_in_cutoff); +#endif /* Get the maximum number of nbors and realloc if necessary */ - UCL_D_Vec numj; - numj.view_offset(inum,dev_nbor,inum); - ucl_copy(host_acc,numj,inum,true); - if (nt>inum) { - UCL_H_Vec host_offset; - host_offset.view_offset(inum,host_acc,nt-inum); - ucl_copy(host_offset,dev_numj_host,nt-inum,true); + UCL_D_Vec _numj_view; + if (_gpu_nbor!=2 || inuminum) { + _host_offset.view_offset(inum,host_acc,nt-inum); + ucl_copy(_host_offset,dev_numj_host,nt-inum,true); + } } if (_gpu_nbor!=2) { @@ -608,7 +810,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (_time_device) time_kernel.add_to_total(); build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, - nspecial, special, success, mn); + nspecial, special, success, mn, error_flag); return; } } @@ -634,5 +836,5 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, template void Neighbor::build_nbor_list (double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, double *subhi, - tagint *, int **, 
tagint **, bool &success, int &mn); - + tagint *, int **, tagint **, bool &success, int &mn, + UCL_Vector &error_flag); diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 996deaff6d..5939567a41 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -1,6 +1,7 @@ /*************************************************************************** neighbor.h ------------------- + Nitin Dhamankar (Intel) W. Michael Brown (ORNL) Peng Wang (Nvidia) @@ -19,14 +20,25 @@ #include "lal_atom.h" #include "lal_neighbor_shared.h" +#include #define IJ_SIZE 131072 +#if !defined(USE_OPENCL) && !defined(USE_HIP) +#ifndef LAL_USE_OLD_NEIGHBOR +// Issue with incorrect results with CUDA 11.2 +#if (CUDA_VERSION > 11019) && (CUDA_VERSION < 11030) +#define LAL_USE_OLD_NEIGHBOR +#endif +#endif +#endif + namespace LAMMPS_AL { class Neighbor { public: - Neighbor() : _allocated(false), _use_packing(false), _ncells(0) {} + Neighbor() : _allocated(false), _use_packing(false), _ncells(0), + _old_max_nbors(0) {} ~Neighbor() { clear(); } /// Determine whether neighbor unpacking should be used @@ -37,7 +49,7 @@ class Neighbor { /// Clear any old data and setup for new LAMMPS run /** \param inum Initial number of particles whose neighbors stored on device * \param host_inum Initial number of particles whose nbors copied to host - * \param max_nbors Initial number of rows in the neighbor matrix + * \param max_nbors Factor (in percentage) applied to density calculated max * \param gpu_nbor 0 if neighboring will be performed on host * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device @@ -48,33 +60,41 @@ class Neighbor { * than the force kernel * \param threads_per_atom Number of threads used per atom for force * calculation - * \param compile_flags Flags for JIT compiling **/ + * \param compile_flags Flags for JIT compiling + * \param ilist_map true if ilist mapping data structures used (3-body) **/ bool init(NeighborShared *shared, const int inum, const int host_inum, const int max_nbors, const int maxspecial, UCL_Device &dev, const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, - const int warp_size, const bool time_device, - const std::string compile_flags); + const int simd_size, const bool time_device, + const std::string compile_flags, const bool ilist_map); - /// Set the size of the cutoff+skin - inline void cell_size(const double size, const double cutoff) { - _cell_size=size; + /// Set the cutoff+skin + inline void set_cutoff(const double cutoff) { _cutoff=cutoff; - if (cutoff>size) - _cells_in_cutoff=static_cast(ceil(cutoff/size)); - else - _cells_in_cutoff=1; + + #ifndef LAL_USE_OLD_NEIGHBOR + _cell_size=_shared->cell_size(); + _auto_cell_size=_shared->auto_cell_size(); + const int cells_in_cutoff=static_cast(ceil(_cutoff/_cell_size)); + if (cells_in_cutoff > 2) _cell_size=_cutoff*0.5; + _old_ncellx = _old_ncelly = _old_ncellz = -1; + #else + _cell_size=cutoff; + _auto_cell_size=false; + #endif } - /// Get the size of the cutoff+skin - inline double cell_size() const { return _cell_size; } + /// Get the cutoff+skin + inline double cutoff() { return _cutoff; } /// Check if there is enough memory for neighbor data and realloc if not /** \param inum Number of particles whose nbors will be stored on device * \param max_nbor Current max number of neighbors for a particle * \param success False if insufficient memory **/ - inline 
void resize(const int inum, const int max_nbor, bool &success) { + inline void resize(const int inum, int max_nbor, bool &success) { + if (max_nbor == 0) max_nbor = 1; if (inum>_max_atoms || max_nbor>_max_nbors) { _max_atoms=static_cast(static_cast(inum)*1.10); if (max_nbor>_max_nbors) @@ -88,8 +108,9 @@ class Neighbor { * \param host_inum Number of particles whose nbors will be copied to host * \param max_nbor Current max number of neighbors for a particle * \param success False if insufficient memory **/ - inline void resize(const int inum, const int host_inum, const int max_nbor, + inline void resize(const int inum, const int host_inum, int max_nbor, bool &success) { + if (max_nbor == 0) max_nbor = 1; if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) { _max_atoms=static_cast(static_cast(inum)*1.10); _max_host=static_cast(static_cast(host_inum)*1.10); @@ -99,15 +120,8 @@ class Neighbor { } } - inline void acc_timers() { + inline void acc_timers(FILE *screen) { if (_nbor_time_avail) { - if (_gpu_nbor==2) { - int mn=0; - for (int i=0; i<_total_atoms; i++) - mn=std::max(mn,host_acc[i]); - if (mn>_max_nbors) - assert(0==1); - } if (_time_device) { time_nbor.add_to_total(); if (_use_packing==false) time_kernel.add_to_total(); @@ -172,9 +186,10 @@ class Neighbor { /// Build nbor list on the device template void build_nbor_list(double **x, const int inum, const int host_inum, - const int nall, Atom &atom, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - bool &success, int &max_nbors); + const int nall, Atom &atom, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, bool &success, + int &max_nbors, UCL_Vector &error_flag); /// Return the number of bytes used on device inline double gpu_bytes() { @@ -193,14 +208,16 @@ class Neighbor { * - 3rd row is starting location in packed nbors * - Remaining rows are the neighbors arranged for coalesced access **/ UCL_D_Vec dev_nbor; + /// Starting location in packed neighbors used only by unpack kernel + UCL_D_Vec dev_packed_begin; /// Packed storage for neighbor lists copied from host UCL_D_Vec dev_packed; /// Host buffer for copying neighbor lists UCL_H_Vec host_packed; /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2) UCL_H_Vec host_acc; - /// Device storage for accessing atom indices from the neighbor list (3-body) - UCL_D_Vec dev_ilist; + /// Storage for accessing atom indices from the neighbor list (3-body) + UCL_Vector three_ilist; // ----------------- Data for GPU Neighbor Calculation --------------- @@ -217,18 +234,36 @@ class Neighbor { UCL_D_Vec dev_special, dev_special_t; /// Host/Device storage for number of particles per cell UCL_Vector cell_counts; + #ifndef LAL_USE_OLD_NEIGHBOR + /// Host/Device storage for number of subgroups per cell + UCL_Vector cell_subgroup_counts; + /// Host/Device storage for subgroup to cell mapping + UCL_Vector subgroup2cell; + #endif int *cell_iter; /// Device timers UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose; + /// Effective SIMD width of neighbor build kernel + inline int simd_size() { return _simd_size; } + + template + inline std::string toa(const t& in) { + std::ostringstream o; + o.precision(2); + o << in; + return o.str(); + } + private: NeighborShared *_shared; UCL_Device *dev; bool _allocated, _use_packing, _nbor_time_avail, _time_device; int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial; - bool _gpu_host, _alloc_packed; - double _cutoff, 
_cell_size, _bin_time; + int _old_max_nbors; + bool _gpu_host, _alloc_packed, _ilist_map, _auto_cell_size; + double _cutoff, _bin_time, _max_neighbor_factor, _cell_size; enum UCL_MEMOPT _packed_permissions; double _gpu_bytes, _c_bytes, _cell_bytes; @@ -236,18 +271,29 @@ class Neighbor { int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build; int _ncells, _threads_per_atom, _total_atoms; - int _cells_in_cutoff; template - inline void resize_max_neighbors(const int maxn, bool &success); + inline void resize_max_neighbors(int maxn, bool &success); - int _warp_size; + // For viewing host arrays for data copy operations + UCL_H_Vec _host_offset; + UCL_D_Vec _nbor_offset, _acc_view, _numj_view; + + #ifndef LAL_USE_OLD_NEIGHBOR + UCL_H_Vec _host_bin_stencil; + UCL_Const _bin_stencil; + int _old_ncellx, _old_ncelly, _old_ncellz; + #endif + + int _simd_size; inline void set_nbor_block_size(const int mn) { - int desired=mn/(2*_warp_size); - desired*=_warp_size; - if (desired<_warp_size) desired=_warp_size; + #ifdef LAL_USE_OLD_NEIGHBOR + int desired=mn/(2*_simd_size); + desired*=_simd_size; + if (desired<_simd_size) desired=_simd_size; else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build; _block_nbor_build=desired; + #endif } }; diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu index f8b32e1746..3dfe23bdc2 100644 --- a/lib/gpu/lal_neighbor_cpu.cu +++ b/lib/gpu/lal_neighbor_cpu.cu @@ -19,6 +19,7 @@ __kernel void kernel_unpack(__global int *dev_nbor, const __global int *dev_ij, + const __global int *dev_ij_begin, const int inum, const int t_per_atom) { int tid=THREAD_ID_X; int offset=tid & (t_per_atom-1); @@ -28,7 +29,7 @@ __kernel void kernel_unpack(__global int *dev_nbor, int nbor=ii+inum; int numj=dev_nbor[nbor]; nbor+=inum; - int list=dev_nbor[nbor]; + int list=dev_ij_begin[ii]; int list_end=list+numj; list+=offset; nbor+=fast_mul(ii,t_per_atom-1)+offset; @@ -40,4 +41,3 @@ __kernel void kernel_unpack(__global int *dev_nbor, } } // if ii } - diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index f1da437c86..2aca505396 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -1,6 +1,7 @@ // ************************************************************************** // neighbor_gpu.cu // ------------------- +// Nitin Dhamankar (Intel) // Peng Wang (Nvidia) // W. 
Michael Brown (ORNL) // @@ -32,7 +33,14 @@ _texture( pos_tex,float4); _texture_2d( pos_tex,int4); #endif -__kernel void calc_cell_id(const numtyp4 *restrict pos, +#ifdef NV_KERNEL +#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 2) +// Issue with incorrect results in CUDA 11.2 +#define LAL_USE_OLD_NEIGHBOR +#endif +#endif + +__kernel void calc_cell_id(const numtyp4 *restrict x_, unsigned *restrict cell_id, int *restrict particle_id, numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, @@ -43,7 +51,7 @@ __kernel void calc_cell_id(const numtyp4 *restrict pos, if (i < nall) { numtyp4 p; - fetch4(p,i,pos_tex); //pos[i]; + fetch4(p,i,pos_tex); //x_[i]; p.x -= boxlo0; p.y -= boxlo1; @@ -138,16 +146,219 @@ __kernel void transpose(__global tagint *restrict out, out[j*rows_in+i] = block[ti][tj]; } +#ifndef LAL_USE_OLD_NEIGHBOR + +#define MAX_STENCIL_SIZE 25 +#if !defined(MAX_SUBGROUPS_PER_BLOCK) +#define MAX_SUBGROUPS_PER_BLOCK 8 +#endif + +#if defined(NV_KERNEL) || defined(USE_HIP) +__device__ __constant__ int bin_stencil[MAX_STENCIL_SIZE]; +#endif + __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, - const __global int *restrict cell_particle_id, - const __global int *restrict cell_counts, - __global int *nbor_list, - __global int *host_nbor_list, - __global int *host_numj, - int neigh_bin_size, numtyp cell_size, - int ncellx, int ncelly, int ncellz, - int inum, int nt, int nall, int t_per_atom, - int cells_in_cutoff) + const __global int *restrict cell_particle_id, + const __global int *restrict cell_counts, + __global int *nbor_list, + __global int *host_nbor_list, + __global int *host_numj, + int neigh_bin_size, numtyp cutoff_neigh, + int ncellx, int ncelly, int ncellz, + int inum, int nt, int nall, int t_per_atom, + int cells_in_cutoff, + const __global int *restrict cell_subgroup_counts, + const __global int *restrict subgroup2cell, + int subgroup_count, +#if defined(NV_KERNEL) || defined(USE_HIP) + int *not_used, __global int *error_flag) +#else + __constant int *bin_stencil, + __global int *error_flag) +#endif +{ + int tid = THREAD_ID_X; + int bsx = BLOCK_SIZE_X; + int simd_size = simd_size(); + int subgroup_id_local = tid / simd_size; + int subgroup_id_global = BLOCK_ID_X * bsx / simd_size + subgroup_id_local; + int lane_id = tid % simd_size; + +#if (SHUFFLE_AVAIL == 0) + __local int cell_list_sh[BLOCK_NBOR_BUILD]; + __local numtyp4 pos_sh[BLOCK_NBOR_BUILD]; + __local int local_cell_counts[BLOCK_NBOR_BUILD]; +#endif + __local int local_begin[(MAX_STENCIL_SIZE+1)*MAX_SUBGROUPS_PER_BLOCK]; + __local int local_counts[(MAX_STENCIL_SIZE+1)*MAX_SUBGROUPS_PER_BLOCK]; + + if (subgroup_id_global < subgroup_count) { + // identify own cell for subgroup (icell) and local atom (i) for the lane + int icell = subgroup2cell[subgroup_id_global]; + int icell_end = cell_counts[icell+1]; + int i = cell_counts[icell] + (subgroup_id_global - + cell_subgroup_counts[icell]) * + simd_size + lane_id; + + // Get count of the number of iterations to finish all cells + const int bin_stencil_stride = cells_in_cutoff * 2 + 1; + const int bin_stencil_size = bin_stencil_stride * bin_stencil_stride; + int offset = 0; + int cell_count = 0, jcellyz, jcell_begin; + const int offset2 = subgroup_id_local * (MAX_STENCIL_SIZE+1); + const int niter = (bin_stencil_size - 1)/simd_size + 1; + int end_idx = simd_size; + for (int ni = 0; ni < niter; ni++) { + if (ni == niter - 1) + end_idx = bin_stencil_size - offset; + if (lane_id < end_idx) { + jcellyz = icell + bin_stencil[lane_id + offset]; + jcell_begin 
= cell_counts[jcellyz - cells_in_cutoff]; + local_begin[lane_id + offset2 + offset] = jcell_begin; + const int local_count = cell_counts[jcellyz + cells_in_cutoff + 1] - + jcell_begin; + cell_count += local_count; + local_counts[lane_id + offset2 + offset] = local_count; + } + offset += simd_size; + } + +#if (SHUFFLE_AVAIL == 0) + local_cell_counts[tid] = cell_count; + offset = subgroup_id_local * simd_size; + for (unsigned int mask=simd_size/2; mask>0; mask>>=1) { + simdsync(); + local_cell_counts[tid] += local_cell_counts[ offset + lane_id^mask ]; + } + simdsync(); + cell_count = local_cell_counts[tid]; +#else + #pragma unroll + for (unsigned int s=simd_size/2; s>0; s>>=1) + cell_count += shfl_xor(cell_count, s, simd_size); +#endif + + int num_iter = cell_count; + int remainder = num_iter % simd_size; + if (remainder == 0) remainder = simd_size; + if (num_iter) num_iter = (num_iter - 1) / simd_size + 1; + + numtyp4 diff; + numtyp r2; + + int pid_i = nall, lpid_j, stride; + numtyp4 atom_i, atom_j; + int cnt = 0; + __global int *neigh_counts, *neigh_list; + + if (i < icell_end) + pid_i = cell_particle_id[i]; + + if (pid_i < nt) { + fetch4(atom_i,pid_i,pos_tex); //pos[i]; + } + + if (pid_i < inum) { + stride=inum; + neigh_counts=nbor_list+stride+pid_i; + neigh_list=neigh_counts+stride+pid_i*(t_per_atom-1); + stride=stride*t_per_atom-t_per_atom; + nbor_list[pid_i]=pid_i; + } else { + stride=0; + neigh_counts=host_numj+pid_i-inum; + neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size; + } + + // loop through neighbors + int bin_shift = 0; + int zy = -1; + int num_atom_cell = 0; + int cell_pos = lane_id; + end_idx = simd_size; + for (int ci = 0; ci < num_iter; ci++) { + cell_pos += simd_size; + while (cell_pos >= num_atom_cell && zy < bin_stencil_size) { + // Shift lane index into atom bins based on remainder from last bin + bin_shift += num_atom_cell % simd_size; + if (bin_shift >= simd_size) bin_shift -= simd_size; + cell_pos = lane_id - bin_shift; + if (cell_pos < 0) cell_pos += simd_size; + // Move to next bin + zy++; + jcell_begin = local_begin[offset2 + zy]; + num_atom_cell = local_counts[offset2 + zy]; + } + + if (zy < bin_stencil_size) { + lpid_j = cell_particle_id[jcell_begin + cell_pos]; + fetch4(atom_j,lpid_j,pos_tex); +#if (SHUFFLE_AVAIL == 0) + cell_list_sh[tid] = lpid_j; + pos_sh[tid].x = atom_j.x; + pos_sh[tid].y = atom_j.y; + pos_sh[tid].z = atom_j.z; + } + simdsync(); +#else + } +#endif + + if (ci == num_iter-1) end_idx = remainder; + + for (int j = 0; j < end_idx; j++) { +#if (SHUFFLE_AVAIL == 0) + int pid_j = cell_list_sh[offset+j]; // gather from shared memory + diff.x = atom_i.x - pos_sh[offset+j].x; + diff.y = atom_i.y - pos_sh[offset+j].y; + diff.z = atom_i.z - pos_sh[offset+j].z; +#else + int pid_j = simd_broadcast_i(lpid_j, j, simd_size); +#ifdef _DOUBLE_DOUBLE + diff.x = atom_i.x - simd_broadcast_d(atom_j.x, j, simd_size); + diff.y = atom_i.y - simd_broadcast_d(atom_j.y, j, simd_size); + diff.z = atom_i.z - simd_broadcast_d(atom_j.z, j, simd_size); +#else + diff.x = atom_i.x - simd_broadcast_f(atom_j.x, j, simd_size); + diff.y = atom_i.y - simd_broadcast_f(atom_j.y, j, simd_size); + diff.z = atom_i.z - simd_broadcast_f(atom_j.z, j, simd_size); +#endif +#endif + + r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; +//USE CUTOFFSQ? 
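// [Editor's sketch, not part of the patch] The "USE CUTOFFSQ?" note above
// asks whether the squared cutoff should be computed once rather than
// re-evaluating cutoff_neigh*cutoff_neigh inside the j-loop. A minimal,
// hypothetical variant of the test below would hoist it before the loops:
//
//   const numtyp cutoff_sq = cutoff_neigh*cutoff_neigh; // once per work-item
//   ...
//   if (r2 < cutoff_sq && pid_j != pid_i && pid_i < nt) {
//
// Any real change of this kind would need benchmarking; the compiler may
// already perform this hoisting on its own.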
+ if (r2 < cutoff_neigh*cutoff_neigh && pid_j != pid_i && pid_i < nt) { + if (cnt < neigh_bin_size) { + cnt++; + *neigh_list = pid_j; + neigh_list++; + if ((cnt & (t_per_atom-1))==0) + neigh_list=neigh_list+stride; + } else + *error_flag=1; + } + } // for j +#if (SHUFFLE_AVAIL == 0) + simdsync(); +#endif + } // for (ci) + if (pid_i < nt) + *neigh_counts = cnt; + } // if (subgroup_id_global < subgroup_count) +} + +#else + +__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, + const __global int *restrict cell_particle_id, + const __global int *restrict cell_counts, + __global int *nbor_list, + __global int *host_nbor_list, + __global int *host_numj, + int neigh_bin_size, numtyp cell_size, + int ncellx, int ncelly, int ncellz, + int inum, int nt, int nall, int t_per_atom, + int cells_in_cutoff) { int tid = THREAD_ID_X; int ix = BLOCK_ID_X + cells_in_cutoff; @@ -232,7 +443,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, diff.z = atom_i.z - pos_sh[j].z; r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; - if (r2 < cell_size*cell_size && pid_j != pid_i) { // && r2 > 1e-5 + if (r2 < cell_size*cell_size && pid_j != pid_i) { cnt++; if (cnt <= neigh_bin_size) { *neigh_list = pid_j; @@ -253,6 +464,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, } // for (i) } +#endif + __kernel void kernel_special(__global int *dev_nbor, __global int *host_nbor_list, const __global int *host_numj, @@ -310,4 +523,3 @@ __kernel void kernel_special(__global int *dev_nbor, } } // if ii } - diff --git a/lib/gpu/lal_neighbor_shared.cpp b/lib/gpu/lal_neighbor_shared.cpp index f1458b35be..e1c3f5ca68 100644 --- a/lib/gpu/lal_neighbor_shared.cpp +++ b/lib/gpu/lal_neighbor_shared.cpp @@ -13,6 +13,7 @@ email : brownw@ornl.gov ***************************************************************************/ +#include #include "lal_precision.h" #include "lal_neighbor_shared.h" @@ -48,6 +49,45 @@ void NeighborShared::clear() { } } +double NeighborShared::best_cell_size(const double subx, const double suby, + const double subz, const int nlocal, + const double cut) { + if (_cached_cell_size && _cut_sort==cut) { + _cached_cell_size=false; + return _cell_size; + } + + const double box_density = static_cast(nlocal) / (subx*suby*subz); + const double density=box_density*cut*cut*cut; + if (density >= 4.0 * _simd_size) return cut*0.5; + else if (density >= 0.5 * _simd_size) return cut; + + const double iters = 60; + const double inc = cut/(iters-1); + const double iss = 1.0 / _simd_size; + double test_size = cut; + double best_iters = 1e200; + double best_size; + for (int i = 0; i < iters; i++) { + const double i_test_size = 1.0/test_size; + const int ncellx = static_cast(ceil(subx*i_test_size)); + const int ncelly = static_cast(ceil(suby*i_test_size)); + const int ncellz = static_cast(ceil(subz*i_test_size)); + const double density = box_density*test_size*test_size*test_size; + const double iters_per_cell = ceil(iss*density); + const double iters = ncellx*ncelly*ncellz*iters_per_cell* + ceil(density*27.0*iss); + if (iters < best_iters) { + best_iters = iters; + best_size = test_size; + } + test_size += inc; + } + const int cells_in_cutoff=static_cast(ceil(cut/best_size)); + if (cells_in_cutoff > 2) best_size=cut*0.5; + return best_size; +} + void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor, const std::string flags) { if (_compiled) @@ -56,11 +96,11 @@ void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor, 
_gpu_nbor=gpu_nbor; if (_gpu_nbor==0) { nbor_program=new UCL_Program(dev); - nbor_program->load_string(neighbor_cpu,flags.c_str()); + nbor_program->load_string(neighbor_cpu,flags.c_str(),nullptr,stderr); k_nbor.set_function(*nbor_program,"kernel_unpack"); } else { build_program=new UCL_Program(dev); - build_program->load_string(neighbor_gpu,flags.c_str()); + build_program->load_string(neighbor_gpu,flags.c_str(),nullptr,stderr); if (_gpu_nbor==1) { k_cell_id.set_function(*build_program,"calc_cell_id"); diff --git a/lib/gpu/lal_neighbor_shared.h b/lib/gpu/lal_neighbor_shared.h index 5cfc4e4767..e574aaeaeb 100644 --- a/lib/gpu/lal_neighbor_shared.h +++ b/lib/gpu/lal_neighbor_shared.h @@ -47,6 +47,44 @@ class NeighborShared { /// Texture for cached position/type access with CUDA UCL_Texture neigh_tex; + /// Use a heuristic to approximate best bin size assuming uniform density + /** This is only called by core LAMMPS for atom sort sizes **/ + inline double update_cell_size(const double subx, const double suby, + const double subz, const int nlocal, + const double cut) { + if (_auto_cell_size==false || subz==0.0) return cut; + else { + _cell_size=best_cell_size(subx, suby, subz, nlocal, cut); + _cached_cell_size=true; + _cut_sort=cut; + return _cell_size; + } + } + + /// Use a heuristic to approximate best bin size assuming uniform density + double best_cell_size(const double subx, const double suby, + const double subz, const int nlocal, + const double cut); + + /// Current cutoff used for cell size determination + inline double neighbor_cutoff() { return _neighbor_cutoff; } + + /// Current neighbor cell size + inline double cell_size() { return _cell_size; } + + /// Return setting for auto cell size + inline bool auto_cell_size() { return _auto_cell_size; } + + inline void setup_auto_cell_size(const bool autosize, const double cut, + const int simd_size) { + _auto_cell_size = autosize; + _cached_cell_size = false; + _neighbor_cutoff = cut; + _cell_size = cut; + _simd_size = simd_size; + if (_simd_size < 2) _auto_cell_size = false; + } + /// Compile kernels for neighbor lists void compile_kernels(UCL_Device &dev, const int gpu_nbor, const std::string flags); @@ -59,6 +97,8 @@ class NeighborShared { private: bool _compiled; int _gpu_nbor; + bool _auto_cell_size, _cached_cell_size; + double _neighbor_cutoff, _cell_size, _simd_size, _cut_sort; }; } diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp index 6b5bf88ea5..6e8fe237a6 100644 --- a/lib/gpu/lal_pppm.cpp +++ b/lib/gpu/lal_pppm.cpp @@ -71,7 +71,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, if (flag!=0) return 0; if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) { - flag=-5; + flag=-15; return 0; } if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) { @@ -133,7 +133,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, UCL_SUCCESS); UCL_H_Vec view; view.view(rho_coeff[0]+n2lo,numel,*ucl_device); - ucl_copy(d_rho_coeff,view,true); + ucl_copy(d_rho_coeff,view,false); _max_bytes+=d_rho_coeff.row_bytes(); // Allocate storage for grid @@ -191,6 +191,7 @@ void PPPMT::clear(const double cpu_time) { d_brick_counts.clear(); error_flag.clear(); d_brick_atoms.clear(); + d_rho_coeff.clear(); acc_timers(); device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp, @@ -261,7 +262,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, double delvolinv = delxinv*delyinv*delzinv; grdtyp f_delvolinv = delvolinv; - 
device->zero(d_brick_counts,d_brick_counts.numel()); + d_brick_counts.zero(); k_particle_map.set_size(GX,BX); k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum, &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, @@ -286,6 +287,10 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, error_flag.update_host(true); time_out.stop(); + #ifndef GERYON_OCL_FLUSH + error_flag.flush(); + #endif + _precompute_done=true; } @@ -351,7 +356,7 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) { &ans->force); time_interp.stop(); - ans->copy_answers(false,false,false,false); + ans->copy_answers(false,false,false,false,0); if (_kspace_split==false) device->add_ans_object(ans); } @@ -374,18 +379,19 @@ void PPPMT::compile_kernels(UCL_Device &dev) { #ifdef USE_OPENCL flags+=std::string(" -Dgrdtyp=")+ucl_template_name()+" -Dgrdtyp4="+ ucl_template_name()+"4"; + if (sizeof(grdtyp)==sizeof(double)) flags+=std::string(" -DGRD_DBL"); #endif if (pppm_program) delete pppm_program; pppm_program=new UCL_Program(dev); #ifdef USE_OPENCL - pppm_program->load_string(pppm,flags.c_str()); + pppm_program->load_string(pppm,flags.c_str(),nullptr,screen); #else if (sizeof(grdtyp)==sizeof(float)) - pppm_program->load_string(pppm_f,flags.c_str()); + pppm_program->load_string(pppm_f,flags.c_str(),nullptr,screen); else - pppm_program->load_string(pppm_d,flags.c_str()); + pppm_program->load_string(pppm_d,flags.c_str(),nullptr,screen); #endif k_particle_map.set_function(*pppm_program,"particle_map"); diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index ee9f1b61d6..e17df5b88c 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -35,11 +35,14 @@ _texture( q_tex,int2); #define pos_tex x_ #define q_tex q_ #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable + +#ifdef GRD_DBL #if defined(cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64 : enable #else #pragma OPENCL EXTENSION cl_khr_fp64 : enable #endif +#endif #endif diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp index b826881392..d548b94be1 100644 --- a/lib/gpu/lal_pppm_ext.cpp +++ b/lib/gpu/lal_pppm_ext.cpp @@ -129,7 +129,8 @@ double pppm_gpu_bytes_f() { void pppm_gpu_forces_f(double **f) { double etmp; PPPMF.atom->data_unavail(); - PPPMF.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp); + int error_flag; + PPPMF.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp,error_flag); } double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, @@ -173,6 +174,7 @@ double pppm_gpu_bytes_d() { void pppm_gpu_forces_d(double **f) { double etmp; PPPMD.atom->data_unavail(); - PPPMD.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp); + int error_flag; + PPPMD.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp,error_flag); } diff --git a/lib/gpu/lal_pre_cuda_hip.h b/lib/gpu/lal_pre_cuda_hip.h new file mode 100644 index 0000000000..d37b4a94c2 --- /dev/null +++ b/lib/gpu/lal_pre_cuda_hip.h @@ -0,0 +1,355 @@ +// ************************************************************************** +// pre_cuda_hip.h +// ------------------- +// W. 
Michael Brown (ORNL) +// Nitin Dhamankar (Intel) +// +// Device-side preprocessor definitions for CUDA and HIP builds +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +//************************************************************************* +// Device Configuration Definitions +// See lal_preprocessor.h for definitions +//*************************************************************************/ + +// ------------------------------------------------------------------------- +// CUDA and HIP DEFINITIONS +// ------------------------------------------------------------------------- + +#if defined(NV_KERNEL) || defined(USE_HIP) + +// ------------------------------------------------------------------------- +// DEVICE CONFIGURATION +// ------------------------------------------------------------------------- + + +#ifdef __HIP_PLATFORM_HCC__ +#define CONFIG_ID 303 +#define SIMD_SIZE 64 +#else +#define CONFIG_ID 103 +#define SIMD_SIZE 32 +#endif + +#define MEM_THREADS SIMD_SIZE +#define SHUFFLE_AVAIL 1 +#define FAST_MATH 1 + +#define THREADS_PER_ATOM 4 +#define THREADS_PER_CHARGE 8 +#define THREADS_PER_THREE 2 + +#define BLOCK_PAIR 256 +#define BLOCK_BIO_PAIR 256 +#define BLOCK_ELLIPSE 128 +#define PPPM_BLOCK_1D 64 +#define BLOCK_NBOR_BUILD 128 +#define BLOCK_CELL_2D 8 +#define BLOCK_CELL_ID 128 + +#define MAX_SHARED_TYPES 11 +#define MAX_BIO_SHARED_TYPES 128 +#define PPPM_MAX_SPLINE 8 + +// ------------------------------------------------------------------------- +// LEGACY DEVICE CONFIGURATION +// ------------------------------------------------------------------------- + +#ifdef __CUDA_ARCH__ + +#if (__CUDA_ARCH__ < 200) + +#undef CONFIG_ID +#define CONFIG_ID 101 +#define MEM_THREADS 16 +#undef THREADS_PER_ATOM +#define THREADS_PER_ATOM 1 +#undef THREADS_PER_CHARGE +#define THREADS_PER_CHARGE 16 +#undef BLOCK_PAIR +#define BLOCK_PAIR 64 +#undef BLOCK_BIO_PAIR +#define BLOCK_BIO_PAIR 64 +#undef BLOCK_NBOR_BUILD +#define BLOCK_NBOR_BUILD 64 +#undef MAX_SHARED_TYPES +#define MAX_SHARED_TYPES 8 +#undef SHUFFLE_AVAIL +#define SHUFFLE_AVAIL 0 + +#elseif (__CUDA_ARCH__ < 300) + +#undef CONFIG_ID +#define CONFIG_ID 102 +#undef BLOCK_PAIR +#define BLOCK_PAIR 128 +#undef BLOCK_BIO_PAIR +#define BLOCK_BIO_PAIR 128 +#undef MAX_SHARED_TYPES +#define MAX_SHARED_TYPES 8 +#undef SHUFFLE_AVAIL +#define SHUFFLE_AVAIL 0 + +#endif + +#endif + +// ------------------------------------------------------------------------- +// KERNEL MACROS +// ------------------------------------------------------------------------- + +#ifdef USE_HIP +#include +#endif + +#define fast_mul(X,Y) (X)*(Y) + +#ifdef __CUDA_ARCH__ +#if (__CUDA_ARCH__ < 200) +#define fast_mul __mul24 +#endif +#endif + +#define EVFLAG 1 +#define NOUNROLL +#define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x) +#define GLOBAL_ID_Y threadIdx.y+fast_mul(blockIdx.y,blockDim.y) +#define GLOBAL_SIZE_X fast_mul(gridDim.x,blockDim.x); +#define GLOBAL_SIZE_Y fast_mul(gridDim.y,blockDim.y); +#define THREAD_ID_X threadIdx.x +#define THREAD_ID_Y threadIdx.y +#define BLOCK_ID_X blockIdx.x +#define BLOCK_ID_Y blockIdx.y +#define BLOCK_SIZE_X blockDim.x +#define BLOCK_SIZE_Y blockDim.y +#define NUM_BLOCKS_X gridDim.x + +#define __kernel extern "C" __global__ 
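// [Editor's illustration, not part of the patch] The macros above (and their
// OpenCL counterparts in lal_preprocessor.h) are what allow one kernel source
// to build as CUDA, HIP, or OpenCL. A minimal, hypothetical kernel written
// against this macro layer, assuming the usual numtyp4 position type from the
// library's precision headers, might look like:
//
//   __kernel void scale_positions(__global numtyp4 *restrict x_,
//                                 const numtyp factor, const int nall) {
//     int i = GLOBAL_ID_X;          // threadIdx/blockIdx math or get_global_id(0)
//     if (i < nall) {
//       x_[i].x *= factor;
//       x_[i].y *= factor;
//       x_[i].z *= factor;
//     }
//   }
//
// Under CUDA/HIP, __kernel expands to extern "C" __global__ and __global to
// nothing; under OpenCL the same names are the native qualifiers.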
+#ifdef __local +#undef __local +#endif +#define __local __shared__ +#define __global +#define restrict __restrict__ +#define atom_add atomicAdd +#define ucl_inline static __inline__ __device__ + +#define simd_size() SIMD_SIZE + +#define simdsync() + +#ifdef NV_KERNEL +#if (__CUDACC_VER_MAJOR__ >= 9) +#undef simdsync +#define simdsync() __syncwarp(0xffffffff) +#endif +#endif + +#ifdef __HIP_PLATFORM_NVCC__ +#undef simdsync() +#define simdsync() __syncwarp(0xffffffff) +#endif + +// ------------------------------------------------------------------------- +// KERNEL MACROS - TEXTURES +// ------------------------------------------------------------------------- + +#ifdef __HIP_PLATFORM_HCC__ +#define _texture(name, type) __device__ type* name +#define _texture_2d(name, type) __device__ type* name +#else +#define _texture(name, type) texture name +#define _texture_2d(name, type) texture name +#endif + +#if (__CUDACC_VER_MAJOR__ < 11) + #ifdef _DOUBLE_DOUBLE + #define fetch4(ans,i,pos_tex) { \ + int4 xy = tex1Dfetch(pos_tex,i*2); \ + int4 zt = tex1Dfetch(pos_tex,i*2+1); \ + ans.x=__hiloint2double(xy.y, xy.x); \ + ans.y=__hiloint2double(xy.w, xy.z); \ + ans.z=__hiloint2double(zt.y, zt.x); \ + ans.w=__hiloint2double(zt.w, zt.z); \ + } + #define fetch(ans,i,q_tex) { \ + int2 qt = tex1Dfetch(q_tex,i); \ + ans=__hiloint2double(qt.y, qt.x); \ + } + #else + #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); + #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); + #endif +#else + #define fetch4(ans,i,x) ans=x[i] + #define fetch(ans,i,q) ans=q[i] + #undef _texture + #undef _texture_2d + #define _texture(name, type) + #define _texture_2d(name, type) + #define pos_tex x_ + #define quat_tex qif + #define q_tex q_ + #define vel_tex v_ + #define mu_tex mu_ +#endif + +#ifdef __HIP_PLATFORM_HCC__ + +#undef fetch4 +#undef fetch + +#ifdef _DOUBLE_DOUBLE +#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i)) +#define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i)) +#else +#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i)) +#define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i)) +#endif + +#endif + +// ------------------------------------------------------------------------- +// KERNEL MACROS - MATH +// ------------------------------------------------------------------------- + +#ifdef CUDA_PRE_THREE +struct __builtin_align__(16) _double4 +{ + double x, y, z, w; +}; +typedef struct _double4 double4; +#endif + +#ifdef _DOUBLE_DOUBLE + +#define ucl_exp exp +#define ucl_powr pow +#define ucl_atan atan +#define ucl_cbrt cbrt +#define ucl_ceil ceil +#define ucl_abs fabs +#define ucl_rsqrt rsqrt +#define ucl_sqrt sqrt +#define ucl_recip(x) ((numtyp)1.0/(x)) + +#else + +#define ucl_atan atanf +#define ucl_cbrt cbrtf +#define ucl_ceil ceilf +#define ucl_abs fabsf +#define ucl_recip(x) ((numtyp)1.0/(x)) +#define ucl_rsqrt rsqrtf +#define ucl_sqrt sqrtf +#define ucl_exp expf +#define ucl_powr powf + +#endif + +// ------------------------------------------------------------------------- +// KERNEL MACROS - SHUFFLE +// ------------------------------------------------------------------------- + +#if SHUFFLE_AVAIL == 1 + +#ifndef USE_HIP +#if (__CUDACC_VER_MAJOR__ < 9) +#define CUDA_PRE_NINE +#endif +#endif + +#if defined(CUDA_PRE_NINE) || defined(__HIP_PLATFORM_HCC__) + + #ifdef _SINGLE_SINGLE + #define shfl_down __shfl_down + #define shfl_xor __shfl_xor + #else + ucl_inline double shfl_down(double var, unsigned int delta, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = 
__double2loint(var); + tmp.x = __shfl_down(tmp.x,delta,width); + tmp.y = __shfl_down(tmp.y,delta,width); + return __hiloint2double(tmp.x,tmp.y); + } + ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_xor(tmp.x,lanemask,width); + tmp.y = __shfl_xor(tmp.y,lanemask,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif + #define simd_broadcast_i __shfl + #define simd_broadcast_f __shfl + #ifdef _DOUBLE_DOUBLE + ucl_inline double simd_broadcast_d(double var, unsigned int src, + int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl(tmp.x,src,width); + tmp.y = __shfl(tmp.y,src,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif + +#else + + #ifdef _SINGLE_SINGLE + ucl_inline float shfl_down(float var, unsigned int delta, int width) { + return __shfl_down_sync(0xffffffff, var, delta, width); + } + ucl_inline float shfl_xor(float var, unsigned int lanemask, int width) { + return __shfl_xor_sync(0xffffffff, var, lanemask, width); + } + #else + ucl_inline double shfl_down(double var, unsigned int delta, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_down_sync(0xffffffff,tmp.x,delta,width); + tmp.y = __shfl_down_sync(0xffffffff,tmp.y,delta,width); + return __hiloint2double(tmp.x,tmp.y); + } + ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,lanemask,width); + tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,lanemask,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif + #define simd_broadcast_i(var, src, width) \ + __shfl_sync(0xffffffff, var, src, width) + #define simd_broadcast_f(var, src, width) \ + __shfl_sync(0xffffffff, var, src, width) + #ifdef _DOUBLE_DOUBLE + ucl_inline double simd_broadcast_d(double var, unsigned int src, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_sync(0xffffffff,tmp.x,src,width); + tmp.y = __shfl_sync(0xffffffff,tmp.y,src,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif +#endif + +#endif + +// ------------------------------------------------------------------------- +// END CUDA / HIP DEFINITIONS +// ------------------------------------------------------------------------- + +#endif diff --git a/lib/gpu/lal_pre_ocl_config.h b/lib/gpu/lal_pre_ocl_config.h new file mode 100644 index 0000000000..15c503c942 --- /dev/null +++ b/lib/gpu/lal_pre_ocl_config.h @@ -0,0 +1,53 @@ +// ************************************************************************** +// pre_ocl_config.h +// ------------------- +// W. 
Michael Brown (ORNL) +// Nitin Dhamankar (Intel) +// +// Device-side preprocessor definitions +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +//************************************************************************* +// Device Configuration Definitions +// See lal_preprocessor.h for definitions +// Configuration order: +// +// {CONFIG_NAME, CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH, +// THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR, +// BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD, +// BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES, +// PPPM_MAX_SPLINE} +// +//*************************************************************************/ + +const int nconfigs=6; +const char * ocl_config_names[] = + { + "generic", + "nvidiagpu", + "amdgpu", + "intelgpu", + "applegpu", + "intelcpu" + }; +const char * ocl_config_strings[] = + { + "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8", + "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8", + "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8", +#ifdef _SINGLE_SINGLE + "INTEL_GPU,500,8,16,1,1,4,8,1,64,64,64,64,64,8,128,8,128,8", + "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8", +#else + "INTEL_GPU,500,8,16,1,1,2,8,1,64,64,64,64,64,8,128,8,128,8", + "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8", +#endif + "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8" + }; diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index 7f82ba18aa..bb2423198f 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -20,6 +20,29 @@ #include #endif +// ---------------------- OPENMP PREPROCESSOR STUFF ------------------ +#if defined(_OPENMP) + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 1 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #if (_OPENMP >= 201307) + #define LAL_USE_OMP_SIMD 1 + #else + #define LAL_USE_OMP_SIMD 0 + #endif + #endif +#else + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 0 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #define LAL_USE_OMP_SIMD 0 + #endif +#endif + struct _lgpu_int2 { int x; int y; }; @@ -75,6 +98,7 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define ACC_PRECISION double #define numtyp2 _lgpu_float2 #define numtyp4 _lgpu_float4 +#define acctyp2 _lgpu_double2 #define acctyp4 _lgpu_double4 #endif @@ -84,6 +108,7 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define ACC_PRECISION double #define numtyp2 _lgpu_double2 #define numtyp4 _lgpu_double4 +#define acctyp2 _lgpu_double2 #define acctyp4 _lgpu_double4 #endif @@ -93,44 +118,16 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define ACC_PRECISION float #define numtyp2 _lgpu_float2 #define numtyp4 _lgpu_float4 +#define acctyp2 _lgpu_float2 #define acctyp4 _lgpu_float4 #endif enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; -// OCL_DEFAULT_VENDOR: preprocessor define for hardware -// specific sizes of OpenCL kernel related constants - -#ifdef FERMI_OCL -#define OCL_DEFAULT_VENDOR "fermi" -#endif - -#ifdef KEPLER_OCL -#define OCL_DEFAULT_VENDOR "kepler" -#endif - -#ifdef CYPRESS_OCL 
-#define OCL_DEFAULT_VENDOR "cypress" -#endif - -#ifdef GENERIC_OCL -#define OCL_DEFAULT_VENDOR "generic" -#endif - -#ifdef INTEL_OCL -#define OCL_DEFAULT_VENDOR "intel" -#endif - -#ifdef PHI_OCL -#define OCL_DEFAULT_VENDOR "phi" -#endif - -#ifndef OCL_DEFAULT_VENDOR -#define OCL_DEFAULT_VENDOR "none" -#endif - -// default to 32-bit smallint and other ints, 64-bit bigint: same as defined in src/lmptype.h -#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && !defined(LAMMPS_SMALLBIG) +// default to 32-bit smallint and other ints, 64-bit bigint: +// same as defined in src/lmptype.h +#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ + !defined(LAMMPS_SMALLBIG) #define LAMMPS_SMALLBIG #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 7c94438272..12cf6345c2 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -1,9 +1,10 @@ // ************************************************************************** -// preprocessor.cu +// preprocessor.h // ------------------- // W. Michael Brown (ORNL) +// Nitin Dhamankar (Intel) // -// Device code for CUDA-specific preprocessor definitions +// Device-side preprocessor definitions // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) @@ -14,566 +15,136 @@ // ***************************************************************************/ //************************************************************************* -// Preprocessor Definitions +// Device Configuration Definitions // -// Note: It is assumed that constants with the same names are defined with -// the same values in all files. +// For OpenCL, the configuration is a string (optionally controlled at +// runtime) where tokens specify the values below in order) // -// ARCH -// Definition: Architecture number for accelerator +// CONFIG_ID: +// Definition: Unique ID for a configuration +// 100-199 for NVIDIA GPUs with CUDA / HIP +// 200-299 for NVIDIA GPUs with OpenCL +// 300-399 for AMD GPUs with HIP +// 400-499 for AMD GPUs with OpenCL +// 500-599 for Intel GPUs with OpenCL +// SIMD_SIZE: +// Definition: For CUDA this is the warp size. +// For AMD this is the wavefront size. +// For OpenCL < 2.1 this is the number of workitems +// guarenteed to have the same instruction pointer +// For OpenCL >= 2.1 this is the smallest expected subgroup +// size. Actually subgroup sizes are determined per kernel. // MEM_THREADS -// Definition: Number of threads with sequential ids accessing memory -// simultaneously on multiprocessor -// WARP_SIZE: -// Definition: Number of threads guaranteed to be on the same instruction +// Definition: Number of elements in main memory transaction. Used in +// PPPM. If unknown, set to SIMD_SIZE. +// SHUFFLE_AVAIL +// Definition: Controls the use of instructions for horizontal vector +// operations. 0 disables and will increase shared memory +// usage. 1 enables for CUDA, HIP, and OpenCL >= 2.1 on +// NVIDIA and Intel devices. +// FAST_MATH +// Definition: 0: do not use -cl-fast-relaxed-math optimization flag or +// native transcendentals for OpenCL (fused multiply-add +// still enabled). For CUDA and HIP, this is controlled by +// the Makefile at compile time. 
1: enable fast math opts +// // THREADS_PER_ATOM -// Definition: Default number of threads assigned per atom for pair styles -// Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE +// Definition: Default number of work items or CUDA threads assigned per +// per atom for pair styles +// Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE // THREADS_PER_CHARGE -// Definition: Default number of threads assigned per atom for pair styles -// with charge -// Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE -// PPPM_MAX_SPLINE -// Definition: Maximum order for splines in PPPM -// PPPM_BLOCK_1D -// Definition: Thread block size for PPPM kernels -// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE -// PPPM_BLOCK_1D%32==0 +// Definition: Default number of work items or CUDA threads assigned per +// per atom for pair styles using charge +// Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE +// THREADS_PER_THREE +// Definition: Default number of work items or CUDA threads assigned per +// per atom for 3-body styles +// Restrictions: Must be power of 2; THREADS_PER_ATOM^2<=SIMD_SIZE +// // BLOCK_PAIR -// Definition: Default thread block size for pair styles -// Restrictions: +// Definition: Default block size for pair styles +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_BIO_PAIR +// Definition: Default block size for CHARMM styles +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_ELLIPSE +// Definition: Default block size for ellipsoidal models and some 3-body +// styles +// Restrictions: Must be integer multiple of SIMD_SIZE +// PPPM_BLOCK_1D +// Definition: Default block size for PPPM kernels +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_NBOR_BUILD +// Definition: Default block size for neighbor list builds +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_CELL_2D +// Definition: Default block size in each dimension for matrix transpose +// BLOCK_CELL_ID +// Definition: Unused in current implementation; Maintained for legacy +// purposes and specialized builds +// // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR -// BLOCK_CELL_2D -// Definition: Default block size in each dimension for cell list builds -// and matrix transpose -// BLOCK_CELL_ID -// Definition: Default block size for binning atoms in cell list builds -// BLOCK_NBOR_BUILD -// Definition: Default block size for neighbor list builds -// BLOCK_BIO_PAIR -// Definition: Default thread block size for "bio" pair styles // MAX_BIO_SHARED_TYPES // Definition: Max # of atom type params can be stored in shared memory -// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 +// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 +// PPPM_MAX_SPLINE +// Definition: Maximum order for splines in PPPM +// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE // //*************************************************************************/ -#define _texture(name, type) texture name -#define _texture_2d(name, type) texture name - // ------------------------------------------------------------------------- -// HIP DEFINITIONS +// CUDA and HIP DEFINITIONS // ------------------------------------------------------------------------- -#ifdef USE_HIP - #include - #ifdef __HIP_PLATFORM_HCC__ - #define mul24(x, y) __mul24(x, y) - #undef _texture - #undef _texture_2d - #define _texture(name, type) __device__ type* name - 
#define _texture_2d(name, type) __device__ type* name - #endif - #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) - #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) - #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); - #define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); - #define THREAD_ID_X threadIdx.x - #define THREAD_ID_Y threadIdx.y - #define BLOCK_ID_X blockIdx.x - #define BLOCK_ID_Y blockIdx.y - #define BLOCK_SIZE_X blockDim.x - #define BLOCK_SIZE_Y blockDim.y - #define __kernel extern "C" __global__ - #ifdef __local - #undef __local - #endif - #define __local __shared__ - #define __global - #define restrict __restrict__ - #define atom_add atomicAdd - #define ucl_inline static __inline__ __device__ - - #define THREADS_PER_ATOM 4 - #define THREADS_PER_CHARGE 8 - #define BLOCK_NBOR_BUILD 128 - #define BLOCK_PAIR 256 - #define BLOCK_BIO_PAIR 256 - #define BLOCK_ELLIPSE 128 - #define MAX_SHARED_TYPES 11 - - #ifdef _SINGLE_SINGLE - ucl_inline double shfl_xor(double var, int laneMask, int width) { - #ifdef __HIP_PLATFORM_HCC__ - return __shfl_xor(var, laneMask, width); - #else - return __shfl_xor_sync(0xffffffff, var, laneMask, width); - #endif - } - #else - ucl_inline double shfl_xor(double var, int laneMask, int width) { - int2 tmp; - tmp.x = __double2hiint(var); - tmp.y = __double2loint(var); - #ifdef __HIP_PLATFORM_HCC__ - tmp.x = __shfl_xor(tmp.x,laneMask,width); - tmp.y = __shfl_xor(tmp.y,laneMask,width); - #else - tmp.x = __shfl_xor_sync(0xffffffff, tmp.x,laneMask,width); - tmp.y = __shfl_xor_sync(0xffffffff, tmp.y,laneMask,width); - #endif - return __hiloint2double(tmp.x,tmp.y); - } - #endif - - #ifdef __HIP_PLATFORM_HCC__ - #define ARCH 600 - #define WARP_SIZE 64 - #endif - - #ifdef __HIP_PLATFORM_NVCC__ - #define ARCH __CUDA_ARCH__ - #define WARP_SIZE 32 - #endif - - #define fast_mul(X,Y) (X)*(Y) - - #define MEM_THREADS WARP_SIZE - #define PPPM_BLOCK_1D 64 - #define BLOCK_CELL_2D 8 - #define BLOCK_CELL_ID 128 - #define MAX_BIO_SHARED_TYPES 128 - - #ifdef __HIP_PLATFORM_NVCC__ - #ifdef _DOUBLE_DOUBLE - #define fetch4(ans,i,pos_tex) { \ - int4 xy = tex1Dfetch(pos_tex,i*2); \ - int4 zt = tex1Dfetch(pos_tex,i*2+1); \ - ans.x=__hiloint2double(xy.y, xy.x); \ - ans.y=__hiloint2double(xy.w, xy.z); \ - ans.z=__hiloint2double(zt.y, zt.x); \ - ans.w=__hiloint2double(zt.w, zt.z); \ - } - #define fetch(ans,i,q_tex) { \ - int2 qt = tex1Dfetch(q_tex,i); \ - ans=__hiloint2double(qt.y, qt.x); \ - } - #else - #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); - #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); - #endif - #else - #ifdef _DOUBLE_DOUBLE - #define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i)) - #define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i)) - #else - #define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i)) - #define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i)) - #endif - #endif - - #ifdef _DOUBLE_DOUBLE - #define ucl_exp exp - #define ucl_powr pow - #define ucl_atan atan - #define ucl_cbrt cbrt - #define ucl_ceil ceil - #define ucl_abs fabs - #define ucl_rsqrt rsqrt - #define ucl_sqrt sqrt - #define ucl_recip(x) ((numtyp)1.0/(x)) - - #else - #define ucl_atan atanf - #define ucl_cbrt cbrtf - #define ucl_ceil ceilf - #define ucl_abs fabsf - #define ucl_recip(x) ((numtyp)1.0/(x)) - #define ucl_rsqrt rsqrtf - #define ucl_sqrt sqrtf - - #ifdef NO_HARDWARE_TRANSCENDENTALS - #define ucl_exp expf - #define ucl_powr powf - #else - #define ucl_exp __expf - #define ucl_powr __powf - #endif - #endif -#endif - -// 
------------------------------------------------------------------------- -// CUDA DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef NV_KERNEL - -#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) -#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) -#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); -#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); -#define THREAD_ID_X threadIdx.x -#define THREAD_ID_Y threadIdx.y -#define BLOCK_ID_X blockIdx.x -#define BLOCK_ID_Y blockIdx.y -#define BLOCK_SIZE_X blockDim.x -#define BLOCK_SIZE_Y blockDim.y -#define __kernel extern "C" __global__ -#define __local __shared__ -#define __global -#define restrict __restrict__ -#define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ - -#ifdef __CUDA_ARCH__ -#define ARCH __CUDA_ARCH__ -#else -#define ARCH 100 -#endif - -#if (ARCH < 200) - -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 16 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_PAIR 64 -#define BLOCK_BIO_PAIR 64 -#define MAX_SHARED_TYPES 8 - -#else - -#if (ARCH < 300) - -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_NBOR_BUILD 128 -#define BLOCK_PAIR 128 -#define BLOCK_BIO_PAIR 128 -#define MAX_SHARED_TYPES 8 - -#else - -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_NBOR_BUILD 128 -#define BLOCK_PAIR 256 -#define BLOCK_BIO_PAIR 256 -#define BLOCK_ELLIPSE 128 -#define MAX_SHARED_TYPES 11 - -#if (__CUDACC_VER_MAJOR__ < 9) - -#ifdef _SINGLE_SINGLE -#define shfl_xor __shfl_xor -#else -ucl_inline double shfl_xor(double var, int laneMask, int width) { - int2 tmp; - tmp.x = __double2hiint(var); - tmp.y = __double2loint(var); - tmp.x = __shfl_xor(tmp.x,laneMask,width); - tmp.y = __shfl_xor(tmp.y,laneMask,width); - return __hiloint2double(tmp.x,tmp.y); -} -#endif - -#else - -#ifdef _SINGLE_SINGLE -ucl_inline double shfl_xor(double var, int laneMask, int width) { - return __shfl_xor_sync(0xffffffff, var, laneMask, width); -} -#else -ucl_inline double shfl_xor(double var, int laneMask, int width) { - int2 tmp; - tmp.x = __double2hiint(var); - tmp.y = __double2loint(var); - tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,laneMask,width); - tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,laneMask,width); - return __hiloint2double(tmp.x,tmp.y); -} -#endif - -#endif - -#endif - -#endif - -#define WARP_SIZE 32 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#ifdef _DOUBLE_DOUBLE -#define fetch4(ans,i,pos_tex) { \ - int4 xy = tex1Dfetch(pos_tex,i*2); \ - int4 zt = tex1Dfetch(pos_tex,i*2+1); \ - ans.x=__hiloint2double(xy.y, xy.x); \ - ans.y=__hiloint2double(xy.w, xy.z); \ - ans.z=__hiloint2double(zt.y, zt.x); \ - ans.w=__hiloint2double(zt.w, zt.z); \ -} -#define fetch(ans,i,q_tex) { \ - int2 qt = tex1Dfetch(q_tex,i); \ - ans=__hiloint2double(qt.y, qt.x); \ -} -#else -#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); -#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); -#endif - -#if (__CUDA_ARCH__ < 200) -#define fast_mul __mul24 -#define MEM_THREADS 16 -#else -#define fast_mul(X,Y) (X)*(Y) -#define MEM_THREADS 32 -#endif - -#ifdef CUDA_PRE_THREE -struct __builtin_align__(16) _double4 -{ - double x, y, z, w; -}; -typedef struct _double4 double4; -#endif - -#ifdef _DOUBLE_DOUBLE - -#define ucl_exp exp -#define ucl_powr pow -#define ucl_atan atan -#define ucl_cbrt cbrt -#define ucl_ceil ceil -#define ucl_abs fabs -#define ucl_rsqrt rsqrt -#define ucl_sqrt sqrt -#define 
ucl_recip(x) ((numtyp)1.0/(x)) - -#else - -#define ucl_atan atanf -#define ucl_cbrt cbrtf -#define ucl_ceil ceilf -#define ucl_abs fabsf -#define ucl_recip(x) ((numtyp)1.0/(x)) -#define ucl_rsqrt rsqrtf -#define ucl_sqrt sqrtf - -#ifdef NO_HARDWARE_TRANSCENDENTALS - -#define ucl_exp expf -#define ucl_powr powf - -#else - -#define ucl_exp __expf -#define ucl_powr __powf - -#endif - -#endif - +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_pre_cuda_hip.h" #endif // ------------------------------------------------------------------------- -// NVIDIA GENERIC OPENCL DEFINITIONS +// OPENCL DEVICE CONFIGURATAIONS // ------------------------------------------------------------------------- -#ifdef NV_GENERIC_OCL +// See lal_pre_ocl_config.h for OpenCL device configurations + +#if !defined(NV_KERNEL) && !defined(USE_HIP) #define USE_OPENCL -#define fast_mul mul24 -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 64 -#define MAX_SHARED_TYPES 8 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_BIO_PAIR 64 - -#define WARP_SIZE 32 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#endif // ------------------------------------------------------------------------- -// NVIDIA FERMI OPENCL DEFINITIONS +// OPENCL KERNEL MACROS // ------------------------------------------------------------------------- -#ifdef FERMI_OCL - -#define USE_OPENCL -#define MEM_THREADS 32 -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_PAIR 128 -#define MAX_SHARED_TYPES 11 -#define BLOCK_NBOR_BUILD 128 -#define BLOCK_BIO_PAIR 128 - -#define WARP_SIZE 32 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#endif - -// ------------------------------------------------------------------------- -// NVIDIA KEPLER OPENCL DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef KEPLER_OCL - -#define USE_OPENCL -#define MEM_THREADS 32 -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_PAIR 256 -#define MAX_SHARED_TYPES 11 -#define BLOCK_NBOR_BUILD 128 -#define BLOCK_BIO_PAIR 256 -#define BLOCK_ELLIPSE 128 - -#define WARP_SIZE 32 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#ifndef NO_OCL_PTX -#define ARCH 300 -#ifdef _SINGLE_SINGLE -inline float shfl_xor(float var, int laneMask, int width) { - float ret; - int c; - c = ((WARP_SIZE-width) << 8) | 0x1f; - asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c)); - return ret; -} +#if (__OPENCL_VERSION__ > 199) +#define NOUNROLL __attribute__((opencl_unroll_hint(1))) #else -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -inline double shfl_xor(double var, int laneMask, int width) { - int c = ((WARP_SIZE-width) << 8) | 0x1f; - int x,y,x2,y2; - double ans; - asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); - asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c)); - asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c)); - asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); - return ans; -} -#endif +#define NOUNROLL #endif -#endif +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define GLOBAL_SIZE_X 
get_global_size(0) +#define THREAD_ID_Y get_local_id(1) +#define BLOCK_ID_Y get_group_id(1) +#define NUM_BLOCKS_X get_num_groups(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define ucl_inline inline // ------------------------------------------------------------------------- -// AMD CYPRESS OPENCL DEFINITIONS +// OPENCL KERNEL MACROS - TEXTURES // ------------------------------------------------------------------------- -#ifdef CYPRESS_OCL - -#define USE_OPENCL -#define MEM_THREADS 32 -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_PAIR 128 -#define MAX_SHARED_TYPES 8 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_BIO_PAIR 64 - -#define WARP_SIZE 64 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#endif +#define fetch4(ans,i,x) ans=x[i] +#define fetch(ans,i,q) ans=q[i] // ------------------------------------------------------------------------- -// INTEL CPU OPENCL DEFINITIONS +// OPENCL KERNEL MACROS - MATH // ------------------------------------------------------------------------- -#ifdef INTEL_OCL - -#define USE_OPENCL -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 1 -#define MAX_SHARED_TYPES 0 -#define BLOCK_NBOR_BUILD 4 -#define BLOCK_BIO_PAIR 2 -#define BLOCK_ELLIPSE 2 - -#define WARP_SIZE 1 -#define PPPM_BLOCK_1D 32 -#define BLOCK_CELL_2D 1 -#define BLOCK_CELL_ID 2 -#define MAX_BIO_SHARED_TYPES 0 - -#endif - -// ------------------------------------------------------------------------- -// INTEL PHI OPENCL DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef PHI_OCL - -#define USE_OPENCL -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 16 -#define MAX_SHARED_TYPES 0 -#define BLOCK_NBOR_BUILD 16 -#define BLOCK_BIO_PAIR 16 -#define BLOCK_ELLIPSE 16 - -#define WARP_SIZE 1 -#define PPPM_BLOCK_1D 32 -#define BLOCK_CELL_2D 4 -#define BLOCK_CELL_ID 16 -#define MAX_BIO_SHARED_TYPES 0 - -#endif - -// ------------------------------------------------------------------------- -// GENERIC OPENCL DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef GENERIC_OCL - -#define USE_OPENCL -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 64 -#define MAX_SHARED_TYPES 8 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_BIO_PAIR 64 - -#define WARP_SIZE 1 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#endif - -// ------------------------------------------------------------------------- -// OPENCL Stuff for All Hardware -// ------------------------------------------------------------------------- -#ifdef USE_OPENCL - #ifndef _SINGLE_SINGLE #ifndef cl_khr_fp64 @@ -589,48 +160,14 @@ inline double shfl_xor(double var, int laneMask, int width) { #endif -#ifndef fast_mul #define fast_mul(X,Y) (X)*(Y) -#endif - -#ifndef ARCH -#define ARCH 0 -#endif - -#ifndef DRIVER -#define DRIVER 0 -#endif - -#define GLOBAL_ID_X get_global_id(0) -#define THREAD_ID_X get_local_id(0) -#define BLOCK_ID_X get_group_id(0) -#define BLOCK_SIZE_X get_local_size(0) -#define GLOBAL_SIZE_X get_global_size(0) -#define THREAD_ID_Y get_local_id(1) -#define BLOCK_ID_Y get_group_id(1) -#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) -#define ucl_inline inline -#define fetch4(ans,i,x) ans=x[i] -#define fetch(ans,i,q) ans=q[i] 
#define ucl_atan atan #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs -#ifdef _DOUBLE_DOUBLE -#define NO_HARDWARE_TRANSCENDENTALS -#endif - -#ifdef NO_HARDWARE_TRANSCENDENTALS - -#define ucl_exp exp -#define ucl_powr powr -#define ucl_rsqrt rsqrt -#define ucl_sqrt sqrt -#define ucl_recip(x) ((numtyp)1.0/(x)) - -#else +#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE) #define ucl_exp native_exp #define ucl_powr native_powr @@ -638,23 +175,128 @@ inline double shfl_xor(double var, int laneMask, int width) { #define ucl_sqrt native_sqrt #define ucl_recip native_recip +#else + +#define ucl_exp exp +#define ucl_powr powr +#define ucl_rsqrt rsqrt +#define ucl_sqrt sqrt +#define ucl_recip(x) ((numtyp)1.0/(x)) + #endif +// ------------------------------------------------------------------------- +// OPENCL KERNEL MACROS - SHUFFLE +// ------------------------------------------------------------------------- + +#if (SHUFFLE_AVAIL == 1) + #ifdef cl_intel_subgroups + #pragma OPENCL EXTENSION cl_intel_subgroups : enable + #define shfl_down(var, delta, width) \ + intel_sub_group_shuffle_down(var, var, delta) + #define shfl_xor(var, lanemask, width) \ + intel_sub_group_shuffle_xor(var, lanemask) + #define simd_broadcast_i(var, src, width) sub_group_broadcast(var, src) + #define simd_broadcast_f(var, src, width) sub_group_broadcast(var, src) + #define simd_broadcast_d(var, src, width) sub_group_broadcast(var, src) + #else + #ifdef _SINGLE_SINGLE + inline float shfl_down(float var, unsigned int delta, int width) { + float ret; + int c; + c = ((SIMD_SIZE-width) << 8) | 0x1f; + asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c), "r"(0xffffffff)); + return ret; + } + inline float shfl_xor(float var, unsigned int lanemask, int width) { + float ret; + int c; + c = ((SIMD_SIZE-width) << 8) | 0x1f; + asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(lanemask), "r"(c), "r"(0xffffffff)); + return ret; + } + #else + inline double shfl_down(double var, unsigned int delta, int width) { + int c = ((SIMD_SIZE-width) << 8) | 0x1f; + int x,y,x2,y2; + double ans; + asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); + asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(delta), "r"(c), "r"(0xffffffff)); + asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(delta), "r"(c), "r"(0xffffffff)); + asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); + return ans; + } + inline double shfl_xor(double var, unsigned int lanemask, int width) { + int c = ((SIMD_SIZE-width) << 8) | 0x1f; + int x,y,x2,y2; + double ans; + asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); + asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(lanemask), "r"(c), "r"(0xffffffff)); + asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(lanemask), "r"(c), "r"(0xffffffff)); + asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); + return ans; + } + #endif + inline int simd_broadcast_i(int var, unsigned int src, int width) { + int ret; + int c; + c = ((SIMD_SIZE-width) << 8) | 0x1f; + asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff)); + return ret; + } + inline float simd_broadcast_f(float var, unsigned int src, int width) { + float ret; + int c; + c = ((SIMD_SIZE-width) << 8) | 0x1f; + asm volatile ("shfl.sync.idx.b32 %0, 
%1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff)); + return ret; + } + #ifdef _DOUBLE_DOUBLE + inline double simd_broadcast_d(double var, unsigned int src, int width) { + int c = ((SIMD_SIZE-width) << 8) | 0x1f; + int x,y,x2,y2; + double ans; + asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); + asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(src), "r"(c), "r"(0xffffffff)); + asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(src), "r"(c), "r"(0xffffffff)); + asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); + return ans; + } + #endif + #endif +#endif + +// ------------------------------------------------------------------------- +// OPENCL KERNEL MACROS - SUBGROUPS +// ------------------------------------------------------------------------- + +#ifdef USE_OPENCL_SUBGROUPS + #ifndef cl_intel_subgroups + #pragma OPENCL EXTENSION cl_khr_subgroups : enable + #endif + #define simdsync() sub_group_barrier(CLK_LOCAL_MEM_FENCE) + #define simd_size() get_max_sub_group_size() +#else + #define simdsync() + #define simd_size() SIMD_SIZE +#endif + +// ------------------------------------------------------------------------- +// END OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + #endif // ------------------------------------------------------------------------- // ARCHITECTURE INDEPENDENT DEFINITIONS // ------------------------------------------------------------------------- -#ifndef PPPM_MAX_SPLINE -#define PPPM_MAX_SPLINE 8 -#endif - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 #define numtyp4 double4 #define acctyp double +#define acctyp2 double2 #define acctyp4 double4 #endif @@ -663,6 +305,7 @@ inline double shfl_xor(double var, int laneMask, int width) { #define numtyp2 float2 #define numtyp4 float4 #define acctyp double +#define acctyp2 double2 #define acctyp4 double4 #endif @@ -671,6 +314,7 @@ inline double shfl_xor(double var, int laneMask, int width) { #define numtyp2 float2 #define numtyp4 float4 #define acctyp float +#define acctyp2 float2 #define acctyp4 float4 #endif @@ -686,11 +330,9 @@ inline double shfl_xor(double var, int laneMask, int width) { #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; -#ifndef BLOCK_ELLIPSE -#define BLOCK_ELLIPSE BLOCK_PAIR -#endif - -// default to 32-bit smallint and other ints, 64-bit bigint: same as defined in src/lmptype.h -#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && !defined(LAMMPS_SMALLBIG) +// default to 32-bit smallint and other ints, 64-bit bigint: +// same as defined in src/lmptype.h +#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ + !defined(LAMMPS_SMALLBIG) #define LAMMPS_SMALLBIG #endif diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index 81dc3b13a4..aabfb9d39f 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -116,7 +116,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, host_write[i*4+2]=host_shape[i][2]; } UCL_H_Vec view4; - view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + view4.view(host_write,shape.numel()); ucl_copy(shape,view4,false); well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -125,7 +125,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, host_write[i*4+1]=host_well[i][1]; host_write[i*4+2]=host_well[i][2]; } - 
view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); + view4.view(host_write,well.numel()); ucl_copy(well,view4,false); _allocated=true; @@ -172,18 +172,8 @@ double RESquaredT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void RESquaredT::loop(const bool _eflag, const bool _vflag) { +int RESquaredT::loop(const int eflag, const int vflag) { const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; int GX=0, NGX; int stride=this->nbor->nbor_pitch(); @@ -201,8 +191,8 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_nbor1.stop(); this->time_ellipsoid.start(); - this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_elps_sel->set_size(GX,BX); + this->k_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, @@ -218,8 +208,8 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_nbor2.stop(); this->time_ellipsoid2.start(); - this->k_ellipsoid_sphere.set_size(GX,BX); - this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, + this->k_elps_sphere_sel->set_size(GX,BX); + this->k_elps_sphere_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, @@ -233,7 +223,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_nbor3.zero(); this->time_ellipsoid3.zero(); this->time_lj.zero(); - return; + return ainum; } // ------------ SPHERE_ELLIPSE --------------- @@ -249,8 +239,8 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_nbor3.stop(); this->time_ellipsoid3.start(); - this->k_sphere_ellipsoid.set_size(GX,BX); - this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_sphere_elps_sel->set_size(GX,BX); + this->k_sphere_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, @@ -277,8 +267,8 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_lj.start(); if (this->_last_ellipseans->inum()) { if (this->_shared_types) { - this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + this->k_lj_sel->set_size(GX,BX); + this->k_lj_sel->run(&this->atom->x, &this->lj1, &this->lj3, &this->special_lj, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, @@ -303,8 +293,8 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); this->time_ellipsoid.start(); - this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_elps_sel->set_size(GX,BX); + this->k_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, &this->ans->force, @@ -312,6 +302,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } + return ainum; } template class RESquared; diff --git a/lib/gpu/lal_re_squared.cu 
b/lib/gpu/lal_re_squared.cu index 8852a46913..c69a338749 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -51,33 +51,30 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - __local numtyp b_alpha, cr60; - b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; + const numtyp cr60=ucl_cbrt((numtyp)60.0); - acctyp energy=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp4 f, tor; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && vflag) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; @@ -452,8 +449,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, + vflag,ans,engv,inum); } - diff --git a/lib/gpu/lal_re_squared.h b/lib/gpu/lal_re_squared.h index 9e4f4af67a..1b0a837764 100644 --- a/lib/gpu/lal_re_squared.h +++ b/lib/gpu/lal_re_squared.h @@ -82,7 +82,7 @@ class RESquared : public BaseEllipsoid { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index 112a4db8d9..ca1b08facd 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -17,12 +17,18 @@ #include "lal_ellipsoid_extra.h" #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) +#define local_allocate_store_ellipse_lj local_allocate_store_ellipse +#else +#define local_allocate_store_ellipse_lj() \ + __local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE]; +#endif + +#if (SHUFFLE_AVAIL == 0) #define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ - __local acctyp red_acc[7][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ @@ -30,6 +36,7 @@ red_acc[4][tid]=tor.y; \ red_acc[5][tid]=tor.z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -41,28 +48,39 @@ tor.x=red_acc[3][tid]; \ tor.y=red_acc[4][tid]; \ tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<7; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + if (EVFLAG && (eflag || vflag)) { \ + if (vflag) { \ + simdsync(); \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for 
(unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + if (eflag) { \ + simdsync(); \ + red_acc[0][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) red_acc[0][tid] += red_acc[0][tid+s]; \ } \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ + energy=red_acc[0][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -82,32 +100,32 @@ #else -#define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ +#define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + tor.x += shfl_down(tor.x, s, t_per_atom); \ + tor.y += shfl_down(tor.y, s, t_per_atom); \ + tor.z += shfl_down(tor.z, s, t_per_atom); \ + energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -147,35 +165,34 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; - b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); - solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); - solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); + const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; + const numtyp cr60=ucl_cbrt((numtyp)60.0); + const numtyp solv_f_a = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); + const numtyp solv_f_r = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); - acctyp energy=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp4 f, tor; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, virial[6]; + if 
(EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && vflag) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; @@ -378,9 +395,9 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset, + eflag,vflag,ans,engv,inum); } __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, @@ -403,31 +420,33 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse_lj(); + sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; - b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); - solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); - solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); + const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; + const numtyp cr60=ucl_cbrt((numtyp)60.0); + const numtyp solv_f_a = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); + const numtyp solv_f_r = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && vflag) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; @@ -579,9 +598,9 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, } } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, @@ -601,26 +620,27 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=gum[0]; sp_lj[1]=gum[1]; sp_lj[2]=gum[2]; sp_lj[3]=gum[3]; - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); energy+=factor_lj*(e-lj3[ii].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -666,9 +686,9 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, } } } // for nbor - 
acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, @@ -690,31 +710,32 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + int n_stride; + local_allocate_store_ellipse(); + if (tid<4) sp_lj[tid]=gum[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -760,8 +781,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_soft.cpp b/lib/gpu/lal_soft.cpp index 8e944fa0a5..e77be5a011 100644 --- a/lib/gpu/lal_soft.cpp +++ b/lib/gpu/lal_soft.cpp @@ -121,20 +121,9 @@ double SoftT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void SoftT::loop(const bool _eflag, const bool _vflag) { +int SoftT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -142,8 +131,8 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -155,6 +144,7 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Soft; diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu index 5df34e7b1d..74ac0e0c97 100644 --- a/lib/gpu/lal_soft.cu +++ b/lib/gpu/lal_soft.cu @@ -40,22 +40,25 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if 
(EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -106,9 +109,9 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_soft_fast(const __global numtyp4 *restrict x_, @@ -125,25 +128,28 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -194,8 +200,8 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_soft.h b/lib/gpu/lal_soft.h index b33314ee03..fd86f62927 100644 --- a/lib/gpu/lal_soft.h +++ b/lib/gpu/lal_soft.h @@ -73,7 +73,7 @@ class Soft : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 7c0cbe7973..a32a5e5a00 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -55,7 +55,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, int init_ok=0; if (world_me==0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, - special_lj, inum, nall, 300, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, } if (gpu_rank==i && world_me!=0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 5c7bd45c76..eb42c710cc 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -43,114 +43,83 @@ int SWT::bytes_per_atom(const int max_nbors) const { } template -int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors, - const double cell_size, const double gpu_split, FILE *_screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* epsilon, const double* sigma, - const double* lambda, const double* gamma, - const double* costheta, const double* biga, - const double* bigb, const double* powerp, - const double* powerq, const double* cut, const double* cutsq) -{ +int SWT::init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *_screen, double **ncutsq, + double **ncut, double **sigma, double **powerp, double **powerq, + double 
**sigma_gamma, double **c1, double **c2, double **c3, + double **c4, double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param) { + _lj_types=ntypes; + + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=1; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,sw,"k_sw","k_sw_three_center", - "k_sw_three_end","k_sw_short_nbor"); + "k_sw_three_end","k_sw_short_nbor",onetype, + onetype3,spq); if (success!=0) return success; - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; + UCL_H_Vec host_write(ntypes*ntypes*ntypes*4,*(this->ucl_device), + UCL_WRITE_ONLY); + host_write.zero(); - _nparams = nparams; - _nelements = nelements; - - UCL_H_Vec dview(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - - for (int i=0; i 0.0 && ncutsq[i][j]>=ccutsq) + ncutsq[i][j]=ccutsq*0.98; } // pack coefficients into arrays - sw1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - - for (int i=0; i(epsilon[i]); - dview[i].y=static_cast(sigma[i]); - dview[i].z=static_cast(lambda[i]); - dview[i].w=static_cast(gamma[i]); - } - - ucl_copy(sw1,dview,false); - sw1_tex.get_texture(*(this->pair_program),"sw1_tex"); - sw1_tex.bind_float(sw1,4); - - sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - - for (int i=0; i(biga[i]); - dview[i].y=static_cast(bigb[i]); - dview[i].z=static_cast(powerp[i]); - dview[i].w=static_cast(powerq[i]); - } - - ucl_copy(sw2,dview,false); - sw2_tex.get_texture(*(this->pair_program),"sw2_tex"); - sw2_tex.bind_float(sw2,4); - - sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - - for (int i=0; i=sw_cut*sw_cut) - sw_cutsq=sw_cut*sw_cut-1e-4; - dview[i].x=static_cast(sw_cut); - dview[i].y=static_cast(sw_cutsq); - dview[i].z=static_cast(costheta[i]); - dview[i].w=(numtyp)0; - } - - ucl_copy(sw3,dview,false); - sw3_tex.get_texture(*(this->pair_program),"sw3_tex"); - sw3_tex.bind_float(sw3,4); - - UCL_H_Vec dview_elem2param(nelements*nelements*nelements, - *(this->ucl_device), UCL_WRITE_ONLY); - - elem2param.alloc(nelements*nelements*nelements,*(this->ucl_device), - UCL_READ_ONLY); - - for (int i = 0; i < nelements; i++) - for (int j = 0; j < nelements; j++) - for (int k = 0; k < nelements; k++) { - int idx = i*nelements*nelements+j*nelements+k; - dview_elem2param[idx] = host_elem2param[i][j][k]; - } - - ucl_copy(elem2param,dview_elem2param,false); - - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); - for (int i = 0; i < ntypes; i++) - dview_map[i] = host_map[i]; - - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); - ucl_copy(map,dview_map,false); + cutsq.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,ntypes,cutsq,host_write,ncutsq); + sw_pre.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,ntypes,sw_pre,host_write,ncut,sigma, + powerp,powerq); + c_14.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,ntypes,c_14,host_write,c1,c2,c3,c4); + c_56.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,ntypes,c_56,host_write,c5,c6); + 
cut_sigma_gamma.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,ntypes,cut_sigma_gamma,host_write,ncut, + sigma_gamma); + sw_pre3.alloc(ntypes*ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,sw_pre3,host_write,lambda_epsilon,costheta); _allocated=true; - this->_max_bytes=sw1.row_bytes()+sw2.row_bytes()+sw3.row_bytes()+ - map.row_bytes()+elem2param.row_bytes(); + this->_max_bytes=cutsq.row_bytes()+sw_pre.row_bytes()+c_14.row_bytes()+ + c_56.row_bytes()+cut_sigma_gamma.row_bytes()+sw_pre3.row_bytes(); return 0; } @@ -160,11 +129,12 @@ void SWT::clear() { return; _allocated=false; - sw1.clear(); - sw2.clear(); - sw3.clear(); - map.clear(); - elem2param.clear(); + cutsq.clear(); + sw_pre.clear(); + c_14.clear(); + c_56.clear(); + cut_sigma_gamma.clear(); + sw_pre3.clear(); this->clear_atomic(); } @@ -179,58 +149,33 @@ double SWT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; +int SWT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); + this->time_pair.start(); + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_short_nbor.run(&this->atom->x, &cutsq, &_lj_types, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - this->time_pair.start(); - - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom); - BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &evatom); + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &cut_sigma_gamma, &sw_pre3, + &_lj_types, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &evatom); Answer *end_ans; #ifdef THREE_CONCURRENT @@ -240,25 +185,32 
@@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); - + this->k_three_end_vatom.run(&this->atom->x, &cut_sigma_gamma, + &sw_pre3, &_lj_types, &this->nbor->dev_nbor, + &this->nbor->three_ilist, &end_ans->force, + &end_ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch,&this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); - + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &cut_sigma_gamma, &sw_pre3, + &_lj_types, &this->nbor->dev_nbor, + &this->nbor->three_ilist, &end_ans->force, + &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &sw_pre, &c_14, &c_56, + &_lj_types, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class SW; diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 2b38bd02dc..621ba87208 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -39,88 +39,161 @@ _texture( sw3_tex,int4); //#define THREE_CONCURRENT -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_ELLIPSE]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) 
{ \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii0) - energy+=(pre_sw_c5*rp - pre_sw_c6*rq) * expsrainv; - - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } #define threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z, eflag, energy) \ @@ -334,7 +389,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \ numtyp frad1 = facrad*gsrainvsq1; \ numtyp frad2 = facrad*gsrainvsq2; \ - numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \ + numtyp facang = (numtyp)2.0 * sw_lambda_epsilon_ijk * facexp*delcs; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -349,9 +404,9 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, fky = delr2y*(frad2+csfac2)-delr1y*facang12; \ fkz = delr2z*(frad2+csfac2)-delr1z*facang12; \ \ - if (eflag>0) \ + if (EVFLAG && eflag) \ energy+=facrad; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ virial[0] += delr1x*fjx + delr2x*fkx; \ virial[1] += delr1y*fjy + delr2y*fky; \ 
virial[2] += delr1z*fjz + delr2z*fkz; \ @@ -384,7 +439,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, \ numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \ numtyp frad1 = facrad*gsrainvsq1; \ - numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \ + numtyp facang = (numtyp)2.0 * sw_lambda_epsilon_ijk * facexp*delcs; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -394,67 +449,68 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, fjz = delr1z*(frad1+csfac1)-delr2z*facang12; \ } +#ifdef ONETYPE +#define sw_cut_ij sw_cut +#define sw_cut_ik sw_cut +#define sw_sigma_gamma_ij sw_sigma_gamma +#define sw_sigma_gamma_ik sw_sigma_gamma +#endif + __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict sw1, - const __global numtyp4 *restrict sw2, - const __global numtyp4 *restrict sw3, - const __global int *restrict map, - const __global int *restrict elem2param, - const int nelements, + const __global numtyp2 *restrict cut_sig_gamma, + const __global numtyp2 *restrict sw_pre3, + const int ntypes, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; - numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; + int n_stride; + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); + local_allocate_store_three(); int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - acctyp energy=(acctyp)0; + #ifdef ONETYPE + const numtyp sw_cut=cut_sig_gamma[ONETYPE].x; + const numtyp sw_sigma_gamma=cut_sig_gamma[ONETYPE].y; + const numtyp sw_lambda_epsilon_ijk=sw_pre3[ONETYPE3].x; + const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y; + #endif + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii sw3_ijparam.y) continue; + int nbor_k; + nbor_k = nbor_j-offset_j+offset_k; + if (nbor_k<=nbor_j) nbor_k += n_stride; - numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_ijparam.x; - - int nbor_k,k_end; - if (dev_packed==dev_nbor) { - nbor_k=nborj_start-offset_j+offset_k; - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } else { - nbor_k = nbor_j-offset_j+offset_k; - if (nbor_k<=nbor_j) nbor_k += n_stride; - k_end = nbor_end; - } - - for ( ; nbor_k sw3_ijparam.y) continue; - - numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_ijparam.x; - - int nbor_k,numk; - if (dev_nbor==dev_packed) { - if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); - k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); - nbor_k+=offset_k; - } else { - nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch; - nbor_k=dev_nbor[nbor_k]; - k_end=nbor_k+numk; - nbor_k+=offset_k; 
- } - - // recalculate numk and k_end for the use of short neighbor list - if (dev_packed==dev_nbor) { - numk = dev_short_nbor[nbor_k]; - nbor_k += n_stride; - k_end = nbor_k+fast_mul(numk,n_stride); - } + int nbor_k; + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; + const int numk=dev_nbor[nbor_k]; + nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); + k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk&(t_per_atom-1)); + nbor_k+=offset_k; for ( ; nbor_k sw3_ijparam.y) continue; - - numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_ijparam.x; - - int nbor_k,numk; - if (dev_nbor==dev_packed) { - if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); - k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); - nbor_k+=offset_k; - } else { - nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch; - nbor_k=dev_nbor[nbor_k]; - k_end=nbor_k+numk; - nbor_k+=offset_k; - } - - // recalculate numk and k_end for the use of short neighbor list - if (dev_packed==dev_nbor) { - numk = dev_short_nbor[nbor_k]; - nbor_k += n_stride; - k_end = nbor_k+fast_mul(numk,n_stride); - } + int nbor_k; + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; + const int numk=dev_nbor[nbor_k]; + nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); + k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk&(t_per_atom-1)); + nbor_k+=offset_k; for ( ; nbor_k { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, - const double cell_size, const double gpu_split, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* epsilon, const double* sigma, - const double* lambda, const double* gamma, - const double* costheta, const double* biga, - const double* bigb, const double* powerp, - const double* powerq, const double* cut, const double* cutsq); + int init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *screen, double **ncutsq, + double **ncut, double **sigma, double **powerp, double **powerq, + double **sigma_gamma, double **c1, double **c2, double **c3, + double **c4, double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ @@ -64,22 +63,21 @@ class SW : public BaseThree { /// Number of atom types int _lj_types; - /// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma - UCL_D_Vec sw1; - /// sw2.x = biga, sw2.y = bigb, sw2.z = powerp, sw2.w = powerq - UCL_D_Vec sw2; - /// sw3.x = cut, sw3.y = cutsq, sw3.z = costheta - UCL_D_Vec sw3; - - UCL_D_Vec elem2param; - UCL_D_Vec map; - int _nparams,_nelements; - - UCL_Texture sw1_tex, sw2_tex, sw3_tex; + UCL_D_Vec cutsq; + /// sw_pre.x = cut, sw_pre.y = sigma, sw_pre.z = powerp, sw_pre.w = powerq + UCL_D_Vec sw_pre; + /// c_14.x = c1, c_14.y = c2, c_14.z = c3, c_14.w = c4 + UCL_D_Vec c_14; + /// c_56.x = c5, c_56.y = c6 + UCL_D_Vec c_56; + /// cut_sigma_gamma.x = cut, cut_sigma_gamma.y = sigma_gamma + UCL_D_Vec cut_sigma_gamma; + /// sw_pre3.x = lambda_epsilon, 
sw_pre3.y = costheta + UCL_D_Vec sw_pre3; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index 1935ed615b..5158f135a3 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -27,15 +27,13 @@ static SW SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* sw_epsilon, const double* sw_sigma, - const double* sw_lambda, const double* sw_gamma, - const double* sw_costheta, const double* sw_biga, - const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, - const double* sw_cutsq) { +int sw_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const double cell_size, int &gpu_mode, + FILE *screen, double **ncutsq, double **ncut, double **sigma, + double **powerp, double **powerq, double **sigma_gamma, + double **c1, double **c2, double **c3, double **c4, + double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); double gpu_split=SWMF.device->particle_split(); @@ -62,10 +60,10 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ int init_ok=0; if (world_me==0) - init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, - host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); + init_ok=SWMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, + screen, ncutsq, ncut, sigma, powerp, powerq, + sigma_gamma, c1, c2, c3, c4, c5, c6, lambda_epsilon, + costheta, map, e2param); SWMF.device->world_barrier(); if (message) @@ -81,11 +79,10 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, - host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, - sw_cutsq); + init_ok=SWMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, + screen, ncutsq, ncut, sigma, powerp, powerq, + sigma_gamma, c1, c2, c3, c4, c5, c6, lambda_epsilon, + costheta, map, e2param); SWMF.device->gpu_barrier(); if (message) @@ -127,5 +124,3 @@ void sw_gpu_compute(const int ago, const int nlocal, const int nall, double sw_gpu_bytes() { return SWMF.host_memory_usage(); } - - diff --git a/lib/gpu/lal_table.cpp b/lib/gpu/lal_table.cpp index d07b2716e4..0c336c6990 100644 --- a/lib/gpu/lal_table.cpp +++ b/lib/gpu/lal_table.cpp @@ -69,6 +69,20 @@ int TableT::init(const int ntypes, k_pair_spline_fast.set_function(*(this->pair_program),"k_table_spline_fast"); k_pair_bitmap.set_function(*(this->pair_program),"k_table_bitmap"); k_pair_bitmap_fast.set_function(*(this->pair_program),"k_table_bitmap_fast"); + + #if defined(LAL_OCL_EV_JIT) + 
k_pair_linear_noev.set_function(*(this->pair_program_noev), + "k_table_linear_fast"); + k_pair_spline_noev.set_function(*(this->pair_program_noev), + "k_table_spline_fast"); + k_pair_bitmap_noev.set_function(*(this->pair_program_noev), + "k_table_bitmap_fast"); + #else + k_pair_linear_sel = &k_pair_linear_fast; + k_pair_spline_sel = &k_pair_spline_fast; + k_pair_bitmap_sel = &k_pair_bitmap_fast; + #endif + _compiled_styles = true; // If atom type constants fit in shared memory use fast kernel @@ -228,6 +242,11 @@ void TableT::clear() { k_pair_spline.clear(); k_pair_bitmap_fast.clear(); k_pair_bitmap.clear(); + #if defined(LAL_OCL_EV_JIT) + k_pair_linear_noev.clear(); + k_pair_spline_noev.clear(); + k_pair_bitmap_noev.clear(); + #endif _compiled_styles=false; } @@ -243,19 +262,22 @@ double TableT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TableT::loop(const bool _eflag, const bool _vflag) { +int TableT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - if (_vflag) - vflag=1; - else - vflag=0; + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) { + k_pair_linear_sel = &k_pair_linear_fast; + k_pair_spline_sel = &k_pair_spline_fast; + k_pair_bitmap_sel = &k_pair_bitmap_fast; + } else { + k_pair_linear_sel = &k_pair_linear_noev; + k_pair_spline_sel = &k_pair_spline_noev; + k_pair_bitmap_sel = &k_pair_bitmap_noev; + } + #endif + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -265,37 +287,37 @@ void TableT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { if (_tabstyle == LOOKUP) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { - this->k_pair_linear_fast.set_size(GX,BX); - this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, - &coeff3, &coeff4, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom, &_tablength); + k_pair_linear_sel->set_size(GX,BX); + k_pair_linear_sel->run(&this->atom->x, &tabindex, &coeff2, + &coeff3, &coeff4, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { - this->k_pair_spline_fast.set_size(GX,BX); - this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, - &coeff3, &coeff4, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom, &_tablength); + k_pair_spline_sel->set_size(GX,BX); + k_pair_spline_sel->run(&this->atom->x, &tabindex, &coeff2, + &coeff3, &coeff4, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + 
&this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { - this->k_pair_bitmap_fast.set_size(GX,BX); - this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits, - &nmask, &coeff2, &coeff3, &coeff4, &cutsq, - &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), &this->ans->force, - &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, - &this->_threads_per_atom, &_tablength); + k_pair_bitmap_sel->set_size(GX,BX); + k_pair_bitmap_sel->run(&this->atom->x, &tabindex, &nshiftbits, + &nmask, &coeff2, &coeff3, &coeff4, &cutsq, + &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } } else { if (_tabstyle == LOOKUP) { @@ -334,6 +356,7 @@ void TableT::loop(const bool _eflag, const bool _vflag) { } } this->time_pair.stop(); + return GX; } template class Table; diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index 0cf0de2af0..eb29218712 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -58,24 +58,27 @@ __kernel void k_table(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -129,9 +132,9 @@ __kernel void k_table(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_fast(const __global numtyp4 *restrict x_, @@ -153,18 +156,22 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -228,9 +234,9 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } /// ---------------- LINEAR ------------------------------------------------- @@ -254,24 +260,27 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - 
virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -329,9 +338,9 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, @@ -353,18 +362,22 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -432,9 +444,9 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } /// ---------------- SPLINE ------------------------------------------------- @@ -458,24 +470,27 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) { e = a * coeff3[idx].y + b * coeff3[idx+1].y + @@ -529,7 +544,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -540,9 +555,9 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_spline_fast(const __global numtyp4 *x_, @@ -564,19 +579,22 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) { e = a * coeff3[idx].y + b * coeff3[idx+1].y + @@ -639,7 +656,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { 
virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -650,9 +667,9 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } /// ---------------- BITMAP ------------------------------------------------- @@ -678,24 +695,27 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -756,9 +776,9 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, @@ -782,18 +802,22 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -864,7 +887,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_table.h b/lib/gpu/lal_table.h index 38ae012bee..b67a369dad 100644 --- a/lib/gpu/lal_table.h +++ b/lib/gpu/lal_table.h @@ -56,9 +56,10 @@ class Table : public BaseAtomic { double host_memory_usage() const; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Kernel k_pair_linear, k_pair_linear_fast; - UCL_Kernel k_pair_spline, k_pair_spline_fast; - UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast; + UCL_Kernel k_pair_linear, k_pair_linear_fast, k_pair_linear_noev; + UCL_Kernel k_pair_spline, k_pair_spline_fast, k_pair_spline_noev; + UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast, k_pair_bitmap_noev; + UCL_Kernel *k_pair_linear_sel, *k_pair_spline_sel, *k_pair_bitmap_sel; // --------------------------- TYPE DATA -------------------------- @@ -90,7 +91,7 @@ class Table : public BaseAtomic { private: bool _allocated, _compiled_styles; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp index f067881b88..6237c4d7cd 
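        // EVFLAG is a compile-time constant of the kernel build; the noev
        // program variant compiles this virial block (and the energy branch
        // above it) away completely, so the accumulation below is only paid
        // for when energies or virials were requested for this step.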
100644 --- a/lib/gpu/lal_table_ext.cpp +++ b/lib/gpu/lal_table_ext.cpp @@ -55,7 +55,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, int init_ok=0; if (world_me==0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->world_barrier(); @@ -73,7 +73,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, } if (gpu_rank==i && world_me!=0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index 63691a2047..e0e87d9148 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -39,7 +39,7 @@ TersoffT::~Tersoff() { template int TersoffT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return this->bytes_per_atom_atomic(max_nbors)+max_nbors*sizeof(acctyp)*4; } template @@ -52,34 +52,82 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int const double* c, const double* d, const double* h, const double* gamma, const double* beta, const double* powern, const double* host_cutsq) { + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=0; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + if (onetype>=0) spq=powermint[onetype3]; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff,"k_tersoff_repulsive", "k_tersoff_three_center", "k_tersoff_three_end", - "k_tersoff_short_nbor"); + "k_tersoff_short_nbor",onetype,onetype3,spq,1); if (success!=0) return success; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); + if (this->nbor->max_nbors()) { + _zetaij.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); + _zetaij_eng.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); + } k_zeta.set_function(*(this->pair_program),"k_tersoff_zeta"); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.set_function(*(this->pair_program_noev),"k_tersoff_zeta"); + #else + k_zeta_selt = &k_zeta; + #endif - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; - + _ntypes=ntypes; _nparams = nparams; _nelements = nelements; + UCL_H_Vec host_write(ntypes*ntypes,*(this->ucl_device), + UCL_READ_WRITE); + host_write.zero(); + cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int ii=1; iihost_write[ii*ntypes+jj]) + host_write[ii*ntypes+jj]=host_cutsq[ijkparam]; + } + } + } + ucl_copy(cutsq_pair,host_write,ntypes*ntypes); + + // -------------------------------------------------------------------- UCL_H_Vec dview(nparams,*(this->ucl_device), UCL_WRITE_ONLY); @@ -90,32 +138,29 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int dview[i].w=(numtyp)0; } + // pack coefficients into arrays // pack coefficients into arrays 
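  // Packing layout used by the loops below (one numtyp4 per parameter set):
  //   ts1 = {lam3, powermint, bigr, bigd}   ts2 = {biga, lam1, bigr, bigd}
  //   ts3 = {c1, c2, c3, c4}                ts4 = {c*c, d*d, h, gamma}
  //   ts5 = {beta, powern, lam2, bigb}
  // c and d are stored squared so ters_gijk()/ters_gijk_d() can use them
  // directly without re-squaring on the device.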
ts1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i(lam1[i]); - dview[i].y=static_cast(lam2[i]); - dview[i].z=static_cast(lam3[i]); - dview[i].w=static_cast(powermint[i]); + dview[i].x=static_cast(lam3[i]); + dview[i].y=static_cast(powermint[i]); + dview[i].z=static_cast(bigr[i]); + dview[i].w=static_cast(bigd[i]); } ucl_copy(ts1,dview,false); - ts1_tex.get_texture(*(this->pair_program),"ts1_tex"); - ts1_tex.bind_float(ts1,4); ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i(biga[i]); - dview[i].y=static_cast(bigb[i]); + dview[i].y=static_cast(lam1[i]); dview[i].z=static_cast(bigr[i]); dview[i].w=static_cast(bigd[i]); } ucl_copy(ts2,dview,false); - ts2_tex.get_texture(*(this->pair_program),"ts2_tex"); - ts2_tex.bind_float(ts2,4); ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -127,46 +172,28 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int } ucl_copy(ts3,dview,false); - ts3_tex.get_texture(*(this->pair_program),"ts3_tex"); - ts3_tex.bind_float(ts3,4); ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i(c[i]); - dview[i].y=static_cast(d[i]); + dview[i].x=static_cast(c[i]*c[i]); + dview[i].y=static_cast(d[i]*d[i]); dview[i].z=static_cast(h[i]); dview[i].w=static_cast(gamma[i]); } ucl_copy(ts4,dview,false); - ts4_tex.get_texture(*(this->pair_program),"ts4_tex"); - ts4_tex.bind_float(ts4,4); ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i(beta[i]); dview[i].y=static_cast(powern[i]); - dview[i].z=(numtyp)0; - dview[i].w=(numtyp)0; + dview[i].z=static_cast(lam2[i]); + dview[i].w=static_cast(bigb[i]); } ucl_copy(ts5,dview,false); - ts5_tex.get_texture(*(this->pair_program),"ts5_tex"); - ts5_tex.bind_float(ts5,4); - - UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - double cutsqmax = 0.0; - for (int i=0; i(host_cutsq[i]); - if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; - } - cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - ucl_copy(cutsq,cutsq_view,false); - - _cutshortsq = static_cast(cutsqmax); UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -183,17 +210,17 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int ucl_copy(elem2param,dview_elem2param,false); - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); + UCL_H_Vec dview_map(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) dview_map[i] = host_map[i]; - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); + map.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(map,dview_map,false); _allocated=true; this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+ - ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+ - map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); + ts4.row_bytes()+ts5.row_bytes()+map.row_bytes()+ + elem2param.row_bytes()+_zetaij.row_bytes()+_zetaij_eng.row_bytes(); return 0; } @@ -208,12 +235,16 @@ void TersoffT::clear() { ts3.clear(); ts4.clear(); ts5.clear(); - cutsq.clear(); + cutsq_pair.clear(); map.clear(); elem2param.clear(); _zetaij.clear(); + _zetaij_eng.clear(); k_zeta.clear(); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.clear(); + #endif this->clear_atomic(); } @@ -229,75 +260,60 @@ double TersoffT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TersoffT::loop(const bool _eflag, const bool 
_vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - // build the short neighbor list - int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); +int TersoffT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // re-allocate zetaij if necessary int nall = this->_nall; - if (nall*this->_max_nbors > _zetaij.cols()) { + if (nall*this->nbor->max_nbors() > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(this->_max_nbors*_nmax); + _zetaij.clear(); + _zetaij_eng.clear(); + success = success && (_zetaij.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + success = success && (_zetaij_eng.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + if (!success) return 0; } - nbor_pitch=this->nbor->nbor_pitch(); + // build the short neighbor list + int ainum=this->_ainum; + this->time_pair.start(); + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); + + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_zeta_selt = &k_zeta; + else k_zeta_selt = &k_zeta_noev; + #endif + GX=static_cast(ceil(static_cast(this->_ainum)/ (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + k_zeta_selt->set_size(GX,BX); + k_zeta_selt->run(&this->atom->x, &ts1, &ts3, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + &_zetaij_eng, &this->nbor->dev_nbor, &eflag, &this->_ainum, + &nbor_pitch, &this->_threads_per_atom); ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - this->time_pair.start(); - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, - &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom); - BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &evatom); + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &ts1, &ts4, &map, + &elem2param, &_nelements, 
&_nparams, &_zetaij, + &_zetaij_eng, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &evatom); Answer *end_ans; #ifdef THREE_CONCURRENT @@ -307,24 +323,34 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts4, &map, &elem2param, + &_nelements, &_nparams, &_zetaij, &_zetaij_eng, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &ts1, &ts4, &map, &elem2param, + &_nelements, &_nparams, &_zetaij, &_zetaij_eng, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &ts2, &map, &elem2param, &_nelements, + &_nparams, &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class Tersoff; diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index b08fddfd6e..03ce68be77 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -18,99 +18,28 @@ #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); -_texture( ts1_tex,float4); -_texture( ts2_tex,float4); -_texture( ts3_tex,float4); -_texture( ts4_tex,float4); -_texture( ts5_tex,float4); #else _texture_2d( pos_tex,int4); -_texture( ts1_tex,int4); -_texture( ts2_tex,int4); -_texture( ts3_tex,int4); -_texture( ts4_tex,int4); -_texture( ts5_tex,int4); #endif #else #define pos_tex x_ -#define ts1_tex ts1 -#define ts2_tex ts2 -#define ts3_tex ts3 -#define ts4_tex ts4 -#define ts5_tex ts5 #endif //#define THREE_CONCURRENT #define TWOTHIRD (numtyp)0.66666666666666666667 -#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ - i, nbor_j, offset_j, idx) \ - if (nbor_mem==packed_mem) { \ - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ - idx = jj*n_stride + i*t_per_atom + offset_j; \ - } else { \ - idx = nbor_j; \ - } +#if (SHUFFLE_AVAIL == 0) -#if (ARCH < 300) - -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int 
s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ - acctyp4 old=ans[ii]; \ - old.x+=f.x; \ - old.y+=f.y; \ - old.z+=f.z; \ - ans[ii]=old; \ - } +#define local_allocate_acc_zeta() \ + __local acctyp red_acc[BLOCK_PAIR]; #define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ red_acc[tid] += red_acc[tid+s]; \ } \ @@ -118,36 +47,168 @@ _texture( ts5_tex,int4); z=red_acc[tid]; \ } -#else - #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_down(z, s, t_per_atom); \ + } \ + } + +#if (EVFLAG == 1) + +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + 
if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - z += shfl_xor(z, s, t_per_atom); \ - } \ - } +#endif +#endif +#ifdef LAL_SIMD_IP_SYNC +#define t_per_atom t_per_atom_in +#else +#define t_per_atom 1 #endif __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, - const __global int * dev_nbor, + const __global numtyp *restrict cutsq_pair, + const int ntypes, __global int * dev_nbor, const __global int * dev_packed, - __global int * dev_short_nbor, - const numtyp _cutshortsq, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); + const int t_per_atom_in) { + const int ii=GLOBAL_ID_X; + + #ifdef ONETYPE + const numtyp cutsq=cutsq_pair[ONETYPE]; + #endif if (ii cutsq[ijkparam]) continue; - - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); - numtyp ijkparam_lam3 = ts1_ijkparam.z; - numtyp ijkparam_powermint = ts1_ijkparam.w; - numtyp4 ts2_ijkparam = ts2[ijkparam]; //fetch4(ts2_ijkparam,ijkparam,ts2_tex); - numtyp ijkparam_bigr = ts2_ijkparam.z; - numtyp ijkparam_bigd = ts2_ijkparam.w; - numtyp4 ts4_ijkparam = ts4[ijkparam]; //fetch4(ts4_ijkparam,ijkparam,ts4_tex); - numtyp ijkparam_c = ts4_ijkparam.x; - numtyp ijkparam_d = ts4_ijkparam.y; - numtyp ijkparam_h = ts4_ijkparam.z; - numtyp ijkparam_gamma = ts4_ijkparam.w; - z += zeta(ijkparam_powermint, ijkparam_lam3, ijkparam_bigr, ijkparam_bigd, - ijkparam_c, ijkparam_d, ijkparam_h, ijkparam_gamma, - rsq1, rsq2, delr1, delr2); + #ifndef ONETYPE + const numtyp4 ts1_ijkparam = ts1[ijkparam]; + const numtyp ijkparam_lam3 = ts1_ijkparam.x; + const int ijkparam_powermint = ts1_ijkparam.y; + const numtyp ijkparam_bigr = ts1_ijkparam.z; + const numtyp ijkparam_bigd = ts1_ijkparam.w; + const numtyp4 ts4_ijkparam = ts4[ijkparam]; + const numtyp ijkparam_c = ts4_ijkparam.x; + const numtyp ijkparam_d = ts4_ijkparam.y; + const numtyp ijkparam_h = ts4_ijkparam.z; + const numtyp ijkparam_gamma = ts4_ijkparam.w; + #endif + z += zeta(ijkparam_powermint, ijkparam_lam3, ijkparam_bigr, + ijkparam_bigd, ijkparam_c, ijkparam_d, ijkparam_h, + ijkparam_gamma, r1, rsq2, delr1, delr2); } - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; acc_zeta(z, tid, t_per_atom, offset_k); - numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); - numtyp ijparam_lam2 = ts1_ijparam.y; - numtyp4 ts2_ijparam = ts2[ijparam]; //fetch4(ts2_ijparam,ijparam,ts2_tex); - numtyp ijparam_bigb = ts2_ijparam.y; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; - numtyp4 ts3_ijparam = ts3[ijparam]; //fetch4(ts3_ijparam,ijparam,ts3_tex); - numtyp ijparam_c1 = ts3_ijparam.x; - numtyp ijparam_c2 = ts3_ijparam.y; - numtyp ijparam_c3 = ts3_ijparam.z; - numtyp ijparam_c4 = ts3_ijparam.w; - numtyp4 ts5_ijparam = ts5[ijparam]; //fetch4(ts5_ijparam,ijparam,ts5_tex); - numtyp ijparam_beta = ts5_ijparam.x; - numtyp ijparam_powern = ts5_ijparam.y; + #ifndef 
ONETYPE + const numtyp ijparam_bigr = ts1[ijparam].z; + const numtyp ijparam_bigd = ts1[ijparam].w; + const numtyp4 ts3_ijparam = ts3[ijparam]; + const numtyp ijparam_c1 = ts3_ijparam.x; + const numtyp ijparam_c2 = ts3_ijparam.y; + const numtyp ijparam_c3 = ts3_ijparam.z; + const numtyp ijparam_c4 = ts3_ijparam.w; + const numtyp4 ts5_ijparam = ts5[ijparam]; + const numtyp ijparam_beta = ts5_ijparam.x; + const numtyp ijparam_powern = ts5_ijparam.y; + const numtyp ijparam_lam2 = ts5_ijparam.z; + const numtyp ijparam_bigb = ts5_ijparam.w; + #else + const numtyp ijparam_bigr = ijkparam_bigr; + const numtyp ijparam_bigd = ijkparam_bigd; + #endif if (offset_k == 0) { numtyp fpfeng[4]; force_zeta(ijparam_bigb, ijparam_bigr, ijparam_bigd, ijparam_lam2, - ijparam_beta, ijparam_powern, ijparam_c1, ijparam_c2, ijparam_c3, - ijparam_c4, rsq1, z, eflag, fpfeng); - acctyp4 zij; + ijparam_beta, ijparam_powern, ijparam_c1, ijparam_c2, + ijparam_c3, ijparam_c4, r1, z, eflag, fpfeng); + acctyp2 zij; zij.x = fpfeng[0]; zij.y = fpfeng[1]; - zij.z = fpfeng[2]; - zij.w = z; - zetaij[idx] = zij; + zetaij[nbor_j-2*nbor_pitch] = zij; + if (EVFLAG && eflag) zetaij_eng[nbor_j-2*nbor_pitch] = fpfeng[2]; } - } // for nbor } // if ii } __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; + const int t_per_atom_in, + const int ev_stride) { + int tid, ii, offset, n_stride; atom_info(t_per_atom,ii,tid,offset); - __local numtyp4 ts1[SHARED_SIZE]; + local_allocate_store_pair(); + + #ifndef ONETYPE __local numtyp4 ts2[SHARED_SIZE]; if (tid= cutsq[ijparam]) continue; + #ifndef ONETYPE + numtyp4 ts2_ijparam = ts2[ijparam]; + const numtyp ijparam_biga = ts2_ijparam.x; + const numtyp ijparam_lam1 = ts2_ijparam.y; + const numtyp ijparam_bigr = ts2_ijparam.z; + const numtyp ijparam_bigd = ts2_ijparam.w; + #endif numtyp feng[2]; - numtyp ijparam_lam1 = ts1[ijparam].x; - numtyp4 ts2_ijparam = ts2[ijparam]; - numtyp ijparam_biga = ts2_ijparam.x; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga, rsq, eflag, feng); @@ -469,9 +538,9 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) + if (EVFLAG && eflag) energy+=feng[1]; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -480,86 +549,85 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, - const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp 
*restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - const __global acctyp4 *restrict zetaij, + const __global acctyp2 *restrict zetaij, + const __global acctyp *restrict zetaij_e, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; + const int t_per_atom_in, + const int evatom) { + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 + local_allocate_store_three(); + + #ifndef ONETYPE __local numtyp4 ts1[SHARED_SIZE]; - __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; if (tid= cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); // look up for zeta_ij - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp2 zeta_ij = zetaij[nbor_j-2*nbor_pitch]; numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - if (eflag>0) { - energy+=zeta_ij.z*tpainv; + if (EVFLAG && eflag) { + energy+=zetaij_e[nbor_j-2*nbor_pitch]*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += delr1[0]*delr1[0]*mforce; virial[1] += delr1[1]*delr1[1]*mforce; @@ -597,48 +661,45 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, } int nbor_k = nborj_start-offset_j+offset_k; - int k_end = nbor_end; - if (dev_packed==dev_nbor) { - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } - - for ( ; nbor_k cutsq[ijkparam]) continue; + #ifndef ONETYPE + const numtyp4 ts1_ijkparam = ts1[ijkparam]; + const numtyp lam3 = ts1_ijkparam.x; + const int powermint = ts1_ijkparam.y; + const numtyp bigr = ts1_ijkparam.z; + const numtyp bigd = ts1_ijkparam.w; + const numtyp4 ts4_ijkparam = ts4[ijkparam]; + const numtyp c = ts4_ijkparam.x; + const numtyp d = ts4_ijkparam.y; + const numtyp h = ts4_ijkparam.z; + const numtyp gamma = ts4_ijkparam.w; + #endif numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); numtyp fi[3], fj[3], fk[3]; - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); - lam3 = ts1_ijkparam.z; - powermint = ts1_ijkparam.w; - numtyp4 ts2_ijkparam = ts2[ijkparam]; //fetch4(ts2_ijkparam,ijkparam,ts2_tex); - bigr = ts2_ijkparam.z; - bigd = ts2_ijkparam.w; - numtyp4 ts4_ijkparam = ts4[ijkparam]; //fetch4(ts4_ijkparam,ijkparam,ts4_tex); - c = ts4_ijkparam.x; - d = ts4_ijkparam.y; - h = ts4_ijkparam.z; - gamma = ts4_ijkparam.w; - if (vflag>0) - attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); + if (EVFLAG && vflag) + attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, prefactor, + r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); else attractive_fi(bigr, bigd, powermint, lam3, c, d, h, gamma, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi); @@ -646,7 +707,7 @@ __kernel void 
k_tersoff_three_center(const __global numtyp4 *restrict x_, f.y += fi[1]; f.z += fi[2]; - if (vflag>0) { + if (EVFLAG && vflag) { acctyp v[6]; numtyp pre = (numtyp)2.0; if (evatom==1) pre = TWOTHIRD; @@ -662,87 +723,90 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, } } // nbor_k } // for nbor_j - - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, - offset,eflag,vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,tpa_sq, + offset,eflag,vflag,ans,engv); } __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, - const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - const __global acctyp4 *restrict zetaij, + const __global acctyp2 *restrict zetaij, + const __global acctyp *restrict zetaij_e, const __global int * dev_nbor, - const __global int * dev_packed, const __global int * dev_ilist, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom, const int gpu_nbor) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; + const int t_per_atom_in, + const int gpu_nbor) { + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); + local_allocate_store_three(); + + #ifndef ONETYPE __local numtyp4 ts1[SHARED_SIZE]; - __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; if (tid0) { - energy+=zeta_ji.z*tpainv; + if (EVFLAG && eflag) { + energy+=zetaij_e[ijnum-2*nbor_pitch]*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -823,62 +877,62 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k0) { - energy+=zeta_ji.z*tpainv; + if (EVFLAG && eflag) { + energy+=zetaij_e[ijnum-2*nbor_pitch]*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -1052,41 +1099,44 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); - numtyp fi[3], fj[3], fk[3]; - numtyp4 ts1_param, ts2_param, ts4_param; - ts1_param = ts1[jikparam]; //fetch4(ts1_jikparam,jikparam,ts1_tex); - lam3 = ts1_param.z; - powermint = ts1_param.w; - ts2_param = ts2[jikparam]; //fetch4(ts2_jikparam,jikparam,ts2_tex); - bigr = ts2_param.z; - bigd = ts2_param.w; - ts4_param = ts4[jikparam]; //fetch4(ts4_jikparam,jikparam,ts4_tex); - c = ts4_param.x; - d = ts4_param.y; - h = ts4_param.z; - gamma = ts4_param.w; attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor_ji, r1, r1inv, r2, r2inv, mdelr1, delr2, fi, fj, fk); + prefactor_ji, r1, r1inv, r2, r2inv, mdelr1, delr2, fi, fj, + fk); f.x += fj[0]; f.y += fj[1]; f.z += fj[2]; @@ -1098,26 +1148,25 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] 
+= TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor - int idx = nbor_k; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); - numtyp prefactor_jk = zeta_jk.y; + numtyp prefactor_jk = zetaij[nbor_k-2*nbor_pitch].y; - int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; - ts1_param = ts1[jkiparam]; //fetch4(ts1_jkiparam,jkiparam,ts1_tex); - lam3 = ts1_param.z; - powermint = ts1_param.w; - ts2_param = ts2[jkiparam]; //fetch4(ts2_jkiparam,jkiparam,ts2_tex); - bigr = ts2_param.z; - bigd = ts2_param.w; + #ifndef ONETYPE + int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+ + itype]; + ts1_param = ts1[jkiparam]; + lam3 = ts1_param.x; + powermint = ts1_param.y; + bigr = ts1_param.z; + bigd = ts1_param.w; ts4_param = ts4[jkiparam]; //fetch4(ts4_jkiparam,jkiparam,ts4_tex); c = ts4_param.x; d = ts4_param.y; h = ts4_param.z; gamma = ts4_param.w; attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor_jk, r2, r2inv, r1, r1inv, delr2, mdelr1, fi, fj, fk); + prefactor_jk, r2, r2inv, r1, r1inv, delr2, mdelr1, fi, fj, + fk); + #endif f.x += fk[0]; f.y += fk[1]; f.z += fk[2]; @@ -1130,14 +1179,13 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); } } // for nbor - - #ifdef THREE_CONCURRENT - store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #else - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #endif } // if ii + #ifdef THREE_CONCURRENT + store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv); + #else + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv,NUM_BLOCKS_X); + #endif } diff --git a/lib/gpu/lal_tersoff.h b/lib/gpu/lal_tersoff.h index 51e64c987b..8f99569162 100644 --- a/lib/gpu/lal_tersoff.h +++ b/lib/gpu/lal_tersoff.h @@ -59,41 +59,36 @@ class Tersoff : public BaseThree { // --------------------------- TYPE DATA -------------------------- - /// If atom type constants fit in shared memory, use fast kernels - bool shared_types; - /// Number of atom types - int _lj_types; + int _ntypes; - /// ts1.x = lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint + /// ts1.x = lam3, ts1.y = powermint, ts1.z = c3, ts1.w = c4 UCL_D_Vec ts1; - /// ts2.x = biga, ts2.y = bigb, ts2.z = bigr, ts2.w = bigd + /// ts2.x = biga, ts2.y = lam1, ts2.z = bigr, ts2.w = bigd UCL_D_Vec ts2; /// ts3.x = c1, ts3.y = c2, ts3.z = c3, ts3.w = c4 UCL_D_Vec ts3; - /// ts4.x = c, ts4.y = d, ts4.z = h, ts4.w = gamma + /// ts4.x = c*c, ts4.y = d*d, ts4.z = h, ts4.w = gamma UCL_D_Vec ts4; - /// ts5.x = beta, ts5.y = powern + /// ts5.x = beta, ts5.y = powern, ts5.z = lam2, ts5.w = bigb UCL_D_Vec ts5; - UCL_D_Vec cutsq; + UCL_D_Vec cutsq_pair; UCL_D_Vec elem2param; UCL_D_Vec map; int _nparams,_nelements; /// Per-atom arrays: - /// zetaij.x = force, zetaij.y = prefactor, zetaij.z = evdwl, - /// zetaij.w = zetaij - UCL_D_Vec _zetaij; + /// zetaij.x = force, zetaij.y = prefactor + UCL_D_Vec _zetaij; + UCL_D_Vec _zetaij_eng; - UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - numtyp _cutshortsq; + UCL_Kernel k_zeta, k_zeta_noev, *k_zeta_selt; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; } diff 
--git a/lib/gpu/lal_tersoff_ext.cpp b/lib/gpu/lal_tersoff_ext.cpp index 749842864f..ac700d014a 100644 --- a/lib/gpu/lal_tersoff_ext.cpp +++ b/lib/gpu/lal_tersoff_ext.cpp @@ -63,7 +63,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int int init_ok=0; if (world_me==0) - init_ok=TSMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -84,7 +84,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=TSMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -99,7 +99,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int fprintf(screen,"\n"); if (init_ok==0) - TSMF.estimate_gpu_overhead(); + TSMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_tersoff_extra.h b/lib/gpu/lal_tersoff_extra.h index 7ee29751b7..da2568aa1b 100644 --- a/lib/gpu/lal_tersoff_extra.h +++ b/lib/gpu/lal_tersoff_extra.h @@ -55,11 +55,9 @@ ucl_inline numtyp ters_gijk(const numtyp costheta, const numtyp param_h, const numtyp param_gamma) { - const numtyp ters_c = param_c * param_c; - const numtyp ters_d = param_d * param_d; const numtyp hcth = param_h - costheta; - return param_gamma*((numtyp)1.0 + ters_c*ucl_recip(ters_d) - - ters_c *ucl_recip(ters_d + hcth*hcth)); + return param_gamma*((numtyp)1.0 + param_c*ucl_recip(param_d) - + param_c *ucl_recip(param_d + hcth*hcth)); } /* ---------------------------------------------------------------------- */ @@ -68,19 +66,20 @@ ucl_inline numtyp ters_gijk_d(const numtyp costheta, const numtyp param_c, const numtyp param_d, const numtyp param_h, - const numtyp param_gamma) + const numtyp param_gamma, + numtyp *ans_d) { - const numtyp ters_c = param_c * param_c; - const numtyp ters_d = param_d * param_d; const numtyp hcth = param_h - costheta; - const numtyp numerator = (numtyp)-2.0 * ters_c * hcth; - const numtyp denominator = ucl_recip(ters_d + hcth*hcth); - return param_gamma*numerator*denominator*denominator; + const numtyp idhh=ucl_recip(param_d + hcth*hcth); + const numtyp numerator = (numtyp)-2.0 * param_c * hcth; + *ans_d=param_gamma*numerator*idhh*idhh; + return param_gamma*((numtyp)1.0+param_c*ucl_recip(param_d)-param_c*idhh); } /* ---------------------------------------------------------------------- */ -ucl_inline void costheta_d(const numtyp rij_hat[3], +ucl_inline void costheta_d(const numtyp cos_theta, + const numtyp rij_hat[3], const numtyp rij, const numtyp rik_hat[3], const numtyp rik, @@ -89,9 +88,6 @@ ucl_inline void costheta_d(const numtyp rij_hat[3], numtyp *drk) { // first element is derivative wrt Ri, second wrt Rj, third wrt Rk - - numtyp cos_theta = vec3_dot(rij_hat,rik_hat); - vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj); vec3_scale(ucl_recip(rij),drj,drj); vec3_scaleadd(-cos_theta,rik_hat,rij_hat,drk); @@ -107,7 +103,9 @@ ucl_inline numtyp ters_fc(const numtyp r, const numtyp param_bigd) { if (r < param_bigr-param_bigd) return (numtyp)1.0; + #ifndef ONETYPE if (r > param_bigr+param_bigd) return (numtyp)0.0; + #endif return (numtyp)0.5*((numtyp)1.0 - 
sin(MY_PI2*(r - param_bigr)/param_bigd)); } @@ -115,24 +113,23 @@ ucl_inline numtyp ters_fc(const numtyp r, ucl_inline numtyp ters_fc_d(const numtyp r, const numtyp param_bigr, - const numtyp param_bigd) + const numtyp param_bigd, + numtyp *ans_d) { - if (r < param_bigr-param_bigd) return (numtyp)0.0; - if (r > param_bigr+param_bigd) return (numtyp)0.0; - return -(MY_PI4/param_bigd) * cos(MY_PI2*(r - param_bigr)/param_bigd); -} - -/* ---------------------------------------------------------------------- */ - -ucl_inline numtyp ters_fa(const numtyp r, - const numtyp param_bigb, - const numtyp param_bigr, - const numtyp param_bigd, - const numtyp param_lam2) -{ - if (r > param_bigr + param_bigd) return (numtyp)0.0; - return -param_bigb * ucl_exp(-param_lam2 * r) * - ters_fc(r,param_bigr,param_bigd); + if (r < param_bigr-param_bigd) { + *ans_d=(numtyp)0.0; + return (numtyp)1.0; + } + #ifndef ONETYPE + if (r > param_bigr+param_bigd) { + *ans_d=(numtyp)0.0; + return (numtyp)0.0; + } + #endif + const numtyp ibigd = ucl_recip(param_bigd); + const numtyp angle = MY_PI2*(r - param_bigr)*ibigd; + *ans_d=-(MY_PI4*ibigd) * cos(angle); + return (numtyp)0.5*((numtyp)1.0 - sin(angle)); } /* ---------------------------------------------------------------------- */ @@ -141,33 +138,17 @@ ucl_inline numtyp ters_fa_d(const numtyp r, const numtyp param_bigb, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_lam2) + const numtyp param_lam2, + numtyp *ans_d) { + #ifndef ONETYPE if (r > param_bigr + param_bigd) return (numtyp)0.0; - return param_bigb * ucl_exp(-param_lam2 * r) * (param_lam2 * - ters_fc(r,param_bigr,param_bigd) - ters_fc_d(r,param_bigr,param_bigd)); -} - -/* ---------------------------------------------------------------------- */ - -ucl_inline numtyp ters_bij(const numtyp zeta, - const numtyp param_beta, - const numtyp param_powern, - const numtyp param_c1, - const numtyp param_c2, - const numtyp param_c3, - const numtyp param_c4) -{ - numtyp tmp = param_beta * zeta; - if (tmp > param_c1) return ucl_rsqrt(tmp); - if (tmp > param_c2) - return ((numtyp)1.0 - ucl_powr(tmp,-param_powern) / - ((numtyp)2.0*param_powern))*ucl_rsqrt(tmp); - if (tmp < param_c4) return (numtyp)1.0; - if (tmp < param_c3) - return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern); - return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern), - (numtyp)-1.0/((numtyp)2.0*param_powern)); + #endif + numtyp dfc; + const numtyp fc=ters_fc_d(r,param_bigr,param_bigd,&dfc); + const numtyp blr = param_bigb * ucl_exp(-param_lam2 * r); + *ans_d = blr * (param_lam2 * fc - dfc); + return -blr * fc; } /* ---------------------------------------------------------------------- */ @@ -178,24 +159,35 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta, const numtyp param_c1, const numtyp param_c2, const numtyp param_c3, - const numtyp param_c4) + const numtyp param_c4, + numtyp *ans_d) { - numtyp tmp = param_beta * zeta; - if (tmp > param_c1) - return param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5); - if (tmp > param_c2) - return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * - // error in negligible 2nd term fixed 9/30/2015 - // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * - ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) * - ucl_powr(tmp,-param_powern))); - if (tmp < param_c4) return (numtyp)0.0; - if (tmp < param_c3) - return (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0); - - numtyp tmp_n = ucl_powr(tmp,param_powern); - return (numtyp)-0.5 * 
ucl_powr((numtyp)1.0+tmp_n, (numtyp) - - (numtyp)1.0-((numtyp)1.0 / ((numtyp)2.0 * param_powern)))*tmp_n / zeta; + const numtyp tmp = param_beta * zeta; + if (tmp > param_c1) { + *ans_d = param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5); + return ucl_rsqrt(tmp); + } + if (tmp > param_c2) { + const numtyp ptmp = ucl_powr(tmp,-param_powern); + const numtyp i2n = ucl_recip((numtyp)2.0 * param_powern); + *ans_d = param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * + ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 * i2n) * + ptmp)); + return ((numtyp)1.0 - ptmp * i2n)*ucl_rsqrt(tmp); + } + if (tmp < param_c4) { + *ans_d = (numtyp)0.0; + return (numtyp)1.0; + } + if (tmp < param_c3) { + *ans_d = (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0); + return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern); + } + const numtyp tmp_n = (numtyp)1.0+ucl_powr(tmp,param_powern); + const numtyp i2n = -ucl_recip((numtyp)2.0*param_powern); + *ans_d = (numtyp)-0.5*ucl_powr(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/ + zeta; + return ucl_powr(tmp_n, i2n); } /* ---------------------------------------------------------------------- */ @@ -207,7 +199,7 @@ ucl_inline void ters_zetaterm_d(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -220,25 +212,23 @@ ucl_inline void ters_zetaterm_d(const numtyp prefactor, numtyp gijk,gijk_d,ex_delr,ex_delr_d,fc,dfc,cos_theta,tmp; numtyp dcosdri[3],dcosdrj[3],dcosdrk[3]; - fc = ters_fc(rik,param_bigr,param_bigd); - dfc = ters_fc_d(rik,param_bigr,param_bigd); + fc = ters_fc_d(rik,param_bigr,param_bigd,&dfc); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if (param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Ri // dri = -dfc*gijk*ex_delr*rik_hat; @@ -277,7 +267,7 @@ ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -288,25 +278,23 @@ ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, numtyp gijk,gijk_d,ex_delr,ex_delr_d,fc,dfc,cos_theta,tmp; numtyp dcosdri[3],dcosdrj[3],dcosdrk[3]; - fc = ters_fc(rik,param_bigr,param_bigd); - dfc = ters_fc_d(rik,param_bigr,param_bigd); + fc = ters_fc_d(rik,param_bigr,param_bigd,&dfc); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if 
(param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Ri // dri = -dfc*gijk*ex_delr*rik_hat; @@ -327,7 +315,7 @@ ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -341,21 +329,20 @@ ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, fc = ters_fc(rik,param_bigr,param_bigd); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if (param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Rj // drj = fc*gijk_d*ex_delr*dcosdrj; @@ -373,7 +360,7 @@ ucl_inline void ters_zetaterm_d_fk(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -384,25 +371,23 @@ ucl_inline void ters_zetaterm_d_fk(const numtyp prefactor, numtyp gijk,gijk_d,ex_delr,ex_delr_d,fc,dfc,cos_theta,tmp; numtyp dcosdri[3],dcosdrj[3],dcosdrk[3]; - fc = ters_fc(rik,param_bigr,param_bigd); - dfc = ters_fc_d(rik,param_bigr,param_bigd); + fc = ters_fc_d(rik,param_bigr,param_bigd,&dfc); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if (param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Rk // drk = dfc*gijk*ex_delr*rik_hat; @@ -427,18 +412,17 @@ ucl_inline void repulsive(const numtyp param_bigr, { numtyp r,tmp_fc,tmp_fc_d,tmp_exp; r = ucl_sqrt(rsq); - tmp_fc = ters_fc(r,param_bigr,param_bigd); - tmp_fc_d = ters_fc_d(r,param_bigr,param_bigd); - tmp_exp = 
ucl_exp(-param_lam1 * r); + tmp_fc = ters_fc_d(r,param_bigr,param_bigd,&tmp_fc_d); + tmp_exp = param_biga * ucl_exp(-param_lam1 * r); // fforce - ans[0] = -param_biga*tmp_exp*(tmp_fc_d - tmp_fc*param_lam1)*ucl_recip(r); + ans[0] = -tmp_exp*(tmp_fc_d - tmp_fc*param_lam1)*ucl_recip(r); // eng - if (eflag) ans[1] = tmp_fc * param_biga * tmp_exp; + if (EVFLAG && eflag) ans[1] = tmp_fc * tmp_exp; } /* ---------------------------------------------------------------------- */ -ucl_inline numtyp zeta(const numtyp param_powermint, +ucl_inline numtyp zeta(const int param_powermint, const numtyp param_lam3, const numtyp param_bigr, const numtyp param_bigd, @@ -446,20 +430,19 @@ ucl_inline numtyp zeta(const numtyp param_powermint, const numtyp param_d, const numtyp param_h, const numtyp param_gamma, - const numtyp rsqij, + const numtyp rij, const numtyp rsqik, const numtyp4 delrij, const numtyp4 delrik) { - numtyp rij,rik,costheta,arg,ex_delr; + numtyp rik,costheta,arg,ex_delr; - rij = ucl_sqrt(rsqij); rik = ucl_sqrt(rsqik); costheta = (delrij.x*delrik.x + delrij.y*delrik.y + delrij.z*delrik.z) / (rij*rik); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) arg = t*t*t; + if (param_powermint == 3) arg = t*t*t; else arg = t; if (arg > (numtyp)69.0776) ex_delr = (numtyp)1.e30; @@ -482,22 +465,19 @@ ucl_inline void force_zeta(const numtyp param_bigb, const numtyp param_c2, const numtyp param_c3, const numtyp param_c4, - const numtyp rsq, + const numtyp r, const numtyp zeta_ij, const int eflag, numtyp fpfeng[4]) { - numtyp r,fa,fa_d,bij; + numtyp fa,fa_d,bij,bij_d; - r = ucl_sqrt(rsq); - fa = ters_fa(r,param_bigb,param_bigr,param_bigd,param_lam2); - fa_d = ters_fa_d(r,param_bigb,param_bigr,param_bigd,param_lam2); - bij = ters_bij(zeta_ij,param_beta,param_powern, - param_c1,param_c2, param_c3, param_c4); - fpfeng[0] = (numtyp)0.5*bij*fa_d * ucl_recip(r); // fforce - fpfeng[1] = (numtyp)-0.5*fa * ters_bij_d(zeta_ij,param_beta, param_powern, - param_c1,param_c2, param_c3, param_c4); // prefactor - if (eflag) fpfeng[2] = (numtyp)0.5*bij*fa; // eng + fa = ters_fa_d(r,param_bigb,param_bigr,param_bigd,param_lam2,&fa_d); + bij = ters_bij_d(zeta_ij,param_beta,param_powern, + param_c1,param_c2, param_c3, param_c4, &bij_d); + fpfeng[0] = (numtyp)0.5*bij*fa_d*ucl_recip(r); // fforce + fpfeng[1] = (numtyp)-0.5*fa*bij_d; // prefactor + if (EVFLAG && eflag) fpfeng[2] = (numtyp)0.5*bij*fa; // eng } /* ---------------------------------------------------------------------- @@ -508,7 +488,7 @@ ucl_inline void force_zeta(const numtyp param_bigb, ucl_inline void attractive(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -535,7 +515,7 @@ ucl_inline void attractive(const numtyp param_bigr, ucl_inline void attractive_fi(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -560,7 +540,7 @@ ucl_inline void attractive_fi(const numtyp param_bigr, ucl_inline void attractive_fj(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -585,7 +565,7 @@ ucl_inline void attractive_fj(const numtyp param_bigr, ucl_inline void attractive_fk(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int 
param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -610,5 +590,3 @@ ucl_inline void attractive_fk(const numtyp param_bigr, #endif - - diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index 2b56991cc6..b7b0fff1b9 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -39,7 +39,7 @@ TersoffMT::~TersoffMod() { template int TersoffMT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return this->bytes_per_atom_atomic(max_nbors)+max_nbors*sizeof(acctyp)*4; } template @@ -52,34 +52,78 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in const double* c5, const double* h, const double* beta, const double* powern, const double* powern_del, const double* ca1, const double* host_cutsq) { + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=1; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_mod,"k_tersoff_mod_repulsive", - "k_tersoff_mod_three_center", "k_tersoff_mod_three_end", - "k_tersoff_mod_short_nbor"); + "k_tersoff_mod_three_center", + "k_tersoff_mod_three_end", + "k_tersoff_mod_short_nbor",onetype,onetype3,0,1); if (success!=0) return success; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); + if (this->nbor->max_nbors()) + _zetaij.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); k_zeta.set_function(*(this->pair_program),"k_tersoff_mod_zeta"); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.set_function(*(this->pair_program_noev),"k_tersoff_mod_zeta"); + #else + k_zeta_selt = &k_zeta; + #endif - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; - + _ntypes=ntypes; _nparams = nparams; _nelements = nelements; + UCL_H_Vec host_write(ntypes*ntypes,*(this->ucl_device), + UCL_READ_WRITE); + host_write.zero(); + cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int ii=1; iihost_write[ii*ntypes+jj]) + host_write[ii*ntypes+jj]=host_cutsq[ijkparam]; + } + } + } + ucl_copy(cutsq_pair,host_write,ntypes*ntypes); + UCL_H_Vec dview(nparams,*(this->ucl_device), UCL_WRITE_ONLY); @@ -101,8 +145,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts1,dview,false); - ts1_tex.get_texture(*(this->pair_program),"ts1_tex"); - ts1_tex.bind_float(ts1,4); ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -114,8 +156,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts2,dview,false); - ts2_tex.get_texture(*(this->pair_program),"ts2_tex"); - ts2_tex.bind_float(ts2,4); ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -127,8 +167,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts3,dview,false); - ts3_tex.get_texture(*(this->pair_program),"ts3_tex"); - ts3_tex.bind_float(ts3,4); ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -140,8 +178,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts4,dview,false); - 
ts4_tex.get_texture(*(this->pair_program),"ts4_tex"); - ts4_tex.bind_float(ts4,4); ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -153,20 +189,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts5,dview,false); - ts5_tex.get_texture(*(this->pair_program),"ts5_tex"); - ts5_tex.bind_float(ts5,4); - - UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - double cutsqmax = 0.0; - for (int i=0; i(host_cutsq[i]); - if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; - } - cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - ucl_copy(cutsq,cutsq_view,false); - - _cutshortsq = static_cast(cutsqmax); UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -183,17 +205,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in ucl_copy(elem2param,dview_elem2param,false); - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); + UCL_H_Vec dview_map(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) dview_map[i] = host_map[i]; - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); + map.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(map,dview_map,false); _allocated=true; this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+ - ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+ - map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); + ts4.row_bytes()+map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); return 0; } @@ -208,12 +229,15 @@ void TersoffMT::clear() { ts3.clear(); ts4.clear(); ts5.clear(); - cutsq.clear(); + cutsq_pair.clear(); map.clear(); elem2param.clear(); _zetaij.clear(); k_zeta.clear(); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.clear(); + #endif this->clear_atomic(); } @@ -229,74 +253,54 @@ double TersoffMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - // build the short neighbor list - int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); +int TersoffMT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // re-allocate zetaij if necessary int nall = this->_nall; - if (nall*this->_max_nbors > _zetaij.cols()) { + if (nall*this->nbor->max_nbors() > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(this->_max_nbors*_nmax); + _zetaij.clear(); + success = success && (_zetaij.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + if (!success) return 0; } - nbor_pitch=this->nbor->nbor_pitch(); + // build the short neighbor list + int ainum=this->_ainum; + this->time_pair.start(); + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + 
this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); + + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_zeta_selt = &k_zeta; + else k_zeta_selt = &k_zeta_noev; + #endif + GX=static_cast(ceil(static_cast(this->_ainum)/ (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + k_zeta_selt->set_size(GX,BX); + k_zeta_selt->run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); - - ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - this->time_pair.start(); - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, - &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &this->nbor->dev_nbor,&eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + ainum=this->ans->inum(); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -307,24 +311,34 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, + this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + 
(BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &ts1, &ts2, &map, &elem2param, + &_nelements, &_nparams, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class TersoffMod; diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index 0f45653264..44b04c6933 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -18,99 +18,28 @@ #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); -_texture( ts1_tex,float4); -_texture( ts2_tex,float4); -_texture( ts3_tex,float4); -_texture( ts4_tex,float4); -_texture( ts5_tex,float4); #else _texture_2d( pos_tex,int4); -_texture( ts1_tex,int4); -_texture( ts2_tex,int4); -_texture( ts3_tex,int4); -_texture( ts4_tex,int4); -_texture( ts5_tex,int4); #endif #else #define pos_tex x_ -#define ts1_tex ts1 -#define ts2_tex ts2 -#define ts3_tex ts3 -#define ts4_tex ts4 -#define ts5_tex ts5 #endif //#define THREE_CONCURRENT #define TWOTHIRD (numtyp)0.66666666666666666667 -#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ - i, nbor_j, offset_j, idx) \ - if (nbor_mem==packed_mem) { \ - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ - idx = jj*n_stride + i*t_per_atom + offset_j; \ - } else { \ - idx = nbor_j; \ - } +#if (SHUFFLE_AVAIL == 0) -#if (ARCH < 300) - -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ - acctyp4 old=ans[ii]; \ - old.x+=f.x; \ - old.y+=f.y; \ - old.z+=f.z; \ - ans[ii]=old; \ - } +#define local_allocate_acc_zeta() \ + __local acctyp red_acc[BLOCK_PAIR]; #define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ red_acc[tid] += red_acc[tid+s]; \ } \ @@ -118,36 +47,168 @@ _texture( ts5_tex,int4); z=red_acc[tid]; \ } -#else - #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, 
t_per_atom); \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_down(z, s, t_per_atom); \ + } \ + } + +#if (EVFLAG == 1) + +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - z += shfl_xor(z, s, t_per_atom); \ - } \ - } - +#endif #endif __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, - const __global int * dev_nbor, + const __global numtyp *restrict cutsq_pair, + const int ntypes, __global int * dev_nbor, const __global int * dev_packed, - __global int * dev_short_nbor, - const numtyp _cutshortsq, const int inum, const int nbor_pitch, const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); + const int ii=GLOBAL_ID_X; + + #ifdef ONETYPE + const numtyp cutsq=cutsq_pair[ONETYPE]; + #endif if (ii cutsq[ijkparam]) continue; - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); numtyp ijkparam_lam3 = ts1_ijkparam.z; numtyp ijkparam_powermint = ts1_ijkparam.w; @@ -348,9 +390,6 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, ijkparam_c5, rsq1, rsq2, delr1, delr2); } - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); @@ -376,7 +415,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict 
x_, zij.y = fpfeng[1]; zij.z = fpfeng[2]; zij.w = z; - zetaij[idx] = zij; + zetaij[nbor_j-2*nbor_pitch] = zij; } } // for nbor @@ -386,22 +425,20 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; + const int t_per_atom, const int ev_stride) { + int tid, ii, offset, n_stride; atom_info(t_per_atom,ii,tid,offset); + local_allocate_store_pair(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; if (tid= cutsq[ijparam]) continue; - numtyp feng[2]; numtyp ijparam_lam1 = ts1[ijparam].x; numtyp4 ts2_ijparam = ts2[ijparam]; @@ -470,9 +497,9 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) + if (EVFLAG && eflag) energy+=feng[1]; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -481,11 +508,9 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, @@ -493,26 +518,24 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, const __global numtyp4 *restrict ts5_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -524,46 +547,37 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, ts5[tid]=ts5_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } numtyp tpainv = ucl_recip((numtyp)t_per_atom); __syncthreads(); if (ii= 
cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); // look up for zeta_ij - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp4 zeta_ij = zetaij[nbor_j-2*nbor_pitch]; numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - if (eflag>0) { + if (EVFLAG && eflag) { energy+=zeta_ij.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += delr1[0]*delr1[0]*mforce; virial[1] += delr1[1]*delr1[1]*mforce; @@ -601,14 +611,8 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, } int nbor_k = nborj_start-offset_j+offset_k; - int k_end = nbor_end; - if (dev_packed==dev_nbor) { - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } - - for ( ; nbor_k cutsq[ijkparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -643,7 +646,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, numtyp4 ts5_ijkparam = ts5[ijkparam]; //fetch4(ts5_ijkparam,ijkparam,ts5_tex); c5 = ts5_ijkparam.x; h = ts5_ijkparam.y; - if (vflag>0) + if (EVFLAG && vflag) attractive(bigr, bigd, powermint, lam3, h, c1, c2, c3, c4, c5, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); else @@ -653,7 +656,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, f.y += fi[1]; f.z += fi[2]; - if (vflag>0) { + if (EVFLAG && vflag) { acctyp v[6]; numtyp pre = (numtyp)2.0; if (evatom==1) pre = TWOTHIRD; @@ -669,10 +672,9 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, } } // nbor_k } // for nbor_j - - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, - offset,eflag,vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,tpa_sq, + offset,eflag,vflag,ans,engv); } __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, @@ -680,27 +682,25 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, const __global numtyp4 *restrict ts5_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, - const __global int * dev_packed, const __global int * dev_ilist, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int gpu_nbor) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -712,23 +712,25 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, ts5[tid]=ts5_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, 
virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } - __local int ijnum_shared[BLOCK_PAIR]; + #ifdef LAL_SIMD_IP_SYNC + __local int localk[BLOCK_PAIR]; + #endif __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -833,7 +816,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -1071,7 +1031,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -1120,10 +1078,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor - int idx = nbor_k; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + acctyp4 zeta_jk = zetaij[nbor_k-2*nbor_pitch]; numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -1155,14 +1110,13 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); } } // for nbor - - #ifdef THREE_CONCURRENT - store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #else - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #endif } // if ii + #ifdef THREE_CONCURRENT + store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv); + #else + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv,NUM_BLOCKS_X); + #endif } diff --git a/lib/gpu/lal_tersoff_mod.h b/lib/gpu/lal_tersoff_mod.h index 29a561c71d..0baa1307cb 100644 --- a/lib/gpu/lal_tersoff_mod.h +++ b/lib/gpu/lal_tersoff_mod.h @@ -63,7 +63,7 @@ class TersoffMod : public BaseThree { bool shared_types; /// Number of atom types - int _lj_types; + int _ntypes; /// ts1.x = lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint UCL_D_Vec ts1; @@ -76,7 +76,7 @@ class TersoffMod : public BaseThree { /// ts5.x = c5, ts5.y = h UCL_D_Vec ts5; - UCL_D_Vec cutsq; + UCL_D_Vec cutsq_pair; UCL_D_Vec elem2param; UCL_D_Vec map; @@ -87,13 +87,11 @@ class TersoffMod : public BaseThree { /// zetaij.w = zetaij UCL_D_Vec _zetaij; - UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - numtyp _cutshortsq; + UCL_Kernel k_zeta, k_zeta_noev, *k_zeta_selt; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; } diff --git a/lib/gpu/lal_tersoff_mod_ext.cpp b/lib/gpu/lal_tersoff_mod_ext.cpp index cce9df8713..cac284fb70 100644 --- a/lib/gpu/lal_tersoff_mod_ext.cpp +++ b/lib/gpu/lal_tersoff_mod_ext.cpp @@ -63,7 +63,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) 
- init_ok=TSMMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, ts_c1, ts_c2, @@ -84,7 +84,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=TSMMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, ts_c1, ts_c2, @@ -99,7 +99,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, fprintf(screen,"\n"); if (init_ok==0) - TSMMF.estimate_gpu_overhead(); + TSMMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 7d254d568d..4456712b0a 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -39,7 +39,7 @@ TersoffZT::~TersoffZBL() { template int TersoffZT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return this->bytes_per_atom_atomic(max_nbors)+max_nbors*sizeof(acctyp)*4; } template @@ -59,34 +59,78 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, const double global_a_0, const double global_epsilon_0, const double* host_cutsq) { + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=1; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_zbl,"k_tersoff_zbl_repulsive", - "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end", - "k_tersoff_zbl_short_nbor"); + "k_tersoff_zbl_three_center", + "k_tersoff_zbl_three_end", + "k_tersoff_zbl_short_nbor",onetype,onetype3,0,1); if (success!=0) return success; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); + if (this->nbor->max_nbors()) + _zetaij.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); k_zeta.set_function(*(this->pair_program),"k_tersoff_zbl_zeta"); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.set_function(*(this->pair_program_noev),"k_tersoff_zbl_zeta"); + #else + k_zeta_selt = &k_zeta; + #endif - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; - + _ntypes = ntypes; _nparams = nparams; _nelements = nelements; + UCL_H_Vec host_write(ntypes*ntypes,*(this->ucl_device), + UCL_READ_WRITE); + host_write.zero(); + cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int ii=1; iihost_write[ii*ntypes+jj]) + host_write[ii*ntypes+jj]=host_cutsq[ijkparam]; + } + } + } + ucl_copy(cutsq_pair,host_write,ntypes*ntypes); + UCL_H_Vec dview(nparams,*(this->ucl_device), UCL_WRITE_ONLY); @@ -108,8 +152,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts1,dview,false); - ts1_tex.get_texture(*(this->pair_program),"ts1_tex"); - ts1_tex.bind_float(ts1,4); ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ 
-121,8 +163,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts2,dview,false); - ts2_tex.get_texture(*(this->pair_program),"ts2_tex"); - ts2_tex.bind_float(ts2,4); ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -134,8 +174,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts3,dview,false); - ts3_tex.get_texture(*(this->pair_program),"ts3_tex"); - ts3_tex.bind_float(ts3,4); ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -147,8 +185,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts4,dview,false); - ts4_tex.get_texture(*(this->pair_program),"ts4_tex"); - ts4_tex.bind_float(ts4,4); ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -160,8 +196,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts5,dview,false); - ts5_tex.get_texture(*(this->pair_program),"ts5_tex"); - ts5_tex.bind_float(ts5,4); ts6.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -173,20 +207,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts6,dview,false); - ts6_tex.get_texture(*(this->pair_program),"ts6_tex"); - ts6_tex.bind_float(ts6,4); - - UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - double cutsqmax = 0.0; - for (int i=0; i(host_cutsq[i]); - if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; - } - cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - ucl_copy(cutsq,cutsq_view,false); - - _cutshortsq = static_cast(cutsqmax); UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -203,11 +223,11 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, ucl_copy(elem2param,dview_elem2param,false); - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); + UCL_H_Vec dview_map(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) dview_map[i] = host_map[i]; - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); + map.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(map,dview_map,false); _global_e = global_e; @@ -216,8 +236,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, _allocated=true; this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+ - ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+ - map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); + ts4.row_bytes()+ts5.row_bytes()+map.row_bytes()+elem2param.row_bytes()+ + _zetaij.row_bytes(); return 0; } @@ -233,12 +253,15 @@ void TersoffZT::clear() { ts4.clear(); ts5.clear(); ts6.clear(); - cutsq.clear(); + cutsq_pair.clear(); map.clear(); elem2param.clear(); _zetaij.clear(); k_zeta.clear(); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.clear(); + #endif this->clear_atomic(); } @@ -254,75 +277,54 @@ double TersoffZT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - // build the short neighbor list - int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - 
this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); +int TersoffZT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // re-allocate zetaij if necessary int nall = this->_nall; - if (nall*this->_max_nbors > _zetaij.cols()) { + if (nall*this->nbor->max_nbors() > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(this->_max_nbors*_nmax); + _zetaij.clear(); + success = success && (_zetaij.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + if (!success) return 0; } - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->_ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); - - ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - + // build the short neighbor list + int ainum=this->_ainum; this->time_pair.start(); - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &ts6, - &_global_e, &_global_a_0, &_global_epsilon_0, &cutsq, - &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); + + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_zeta_selt = &k_zeta; + else k_zeta_selt = &k_zeta_noev; + #endif + + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + k_zeta_selt->set_size(GX,BX); + k_zeta_selt->run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + ainum=this->ans->inum(); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &map, + &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -333,24 +335,35 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, + 
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &map, + &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &ts1, &ts2, &ts6, &_global_e, &_global_a_0, + &_global_epsilon_0, &map, &elem2param, &_nelements, + &_nparams, &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class TersoffZBL; diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index f631cab91f..fce1ccc406 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -48,72 +48,16 @@ _texture( ts6_tex,int4); #define TWOTHIRD (numtyp)0.66666666666666666667 -#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ - i, nbor_j, offset_j, idx) \ - if (nbor_mem==packed_mem) { \ - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ - idx = jj*n_stride + i*t_per_atom + offset_j; \ - } else { \ - idx = nbor_j; \ - } +#if (SHUFFLE_AVAIL == 0) -#if (ARCH < 300) - -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ - acctyp4 old=ans[ii]; \ - old.x+=f.x; \ - old.y+=f.y; \ - old.z+=f.z; \ - ans[ii]=old; \ - } +#define local_allocate_acc_zeta() \ + __local acctyp red_acc[BLOCK_PAIR]; #define 
acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ red_acc[tid] += red_acc[tid+s]; \ } \ @@ -121,36 +65,168 @@ _texture( ts6_tex,int4); z=red_acc[tid]; \ } -#else - #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_down(z, s, t_per_atom); \ + } \ + } + +#if (EVFLAG == 1) + +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - z += shfl_xor(z, s, t_per_atom); \ - } \ - } - +#endif #endif __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, - const __global int * dev_nbor, + const __global numtyp *restrict cutsq_pair, + const int ntypes, __global int * dev_nbor, const __global int * dev_packed, - __global int * dev_short_nbor, - const numtyp _cutshortsq, const int inum, const int nbor_pitch, const int 
t_per_atom) { - __local int n_stride; - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); + const int ii=GLOBAL_ID_X; + + #ifdef ONETYPE + const numtyp cutsq=cutsq_pair[ONETYPE]; + #endif if (ii cutsq[ijkparam]) continue; - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); numtyp ijkparam_lam3 = ts1_ijkparam.z; numtyp ijkparam_powermint = ts1_ijkparam.w; @@ -351,9 +408,6 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, rsq1, rsq2, delr1, delr2); } - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); @@ -384,7 +438,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, zij.y = fpfeng[1]; zij.z = fpfeng[2]; zij.w = z; - zetaij[idx] = zij; + zetaij[nbor_j-2*nbor_pitch] = zij; } } // for nbor @@ -397,22 +451,20 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts6_in, const numtyp global_e, const numtyp global_a_0, const numtyp global_epsilon_0, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; + const int t_per_atom, const int ev_stride) { + int tid, ii, offset, n_stride; atom_info(t_per_atom,ii,tid,offset); + local_allocate_store_pair(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts6[SHARED_SIZE]; @@ -422,36 +474,28 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, ts6[tid]=ts6_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii= cutsq[ijparam]) continue; - numtyp feng[2]; numtyp ijparam_lam1 = ts1[ijparam].x; numtyp4 ts2_ijparam = ts2[ijparam]; @@ -489,9 +531,9 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) + if (EVFLAG && eflag) energy+=feng[1]; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -500,37 +542,33 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, - 
const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -540,46 +578,37 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, ts4[tid]=ts4_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } numtyp tpainv = ucl_recip((numtyp)t_per_atom); __syncthreads(); if (ii= cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); // look up for zeta_ij - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp4 zeta_ij = zetaij[nbor_j-2*nbor_pitch]; numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - if (eflag>0) { + if (EVFLAG && eflag) { energy+=zeta_ij.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += delr1[0]*delr1[0]*mforce; virial[1] += delr1[1]*delr1[1]*mforce; @@ -617,14 +642,8 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, } int nbor_k = nborj_start-offset_j+offset_k; - int k_end = nbor_end; - if (dev_packed==dev_nbor) { - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } - - for ( ; nbor_k cutsq[ijkparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -656,7 +674,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, d = ts4_ijkparam.y; h = ts4_ijkparam.z; gamma = ts4_ijkparam.w; - if (vflag>0) + if (EVFLAG && vflag) attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); else @@ -666,7 +684,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, f.y += fi[1]; f.z += fi[2]; - if (vflag>0) { + if (EVFLAG && vflag) { acctyp v[6]; numtyp pre = (numtyp)2.0; if (evatom==1) pre = TWOTHIRD; @@ -682,37 +700,34 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, } } // nbor_k } // for nbor_j - - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, - offset,eflag,vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,tpa_sq, + offset,eflag,vflag,ans,engv); } __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * 
dev_nbor, - const __global int * dev_packed, const __global int * dev_ilist, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int gpu_nbor) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -722,23 +737,25 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, ts4[tid]=ts4_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } - __local int ijnum_shared[BLOCK_PAIR]; + #ifdef LAL_SIMD_IP_SYNC + __local int localk[BLOCK_PAIR]; + #endif __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -843,7 +841,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -1072,7 +1047,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -1118,10 +1092,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor - int idx = nbor_k; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + acctyp4 zeta_jk = zetaij[nbor_k-2*nbor_pitch]; numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -1150,14 +1121,13 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); } } // for nbor - - #ifdef THREE_CONCURRENT - store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #else - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #endif } // if ii + #ifdef THREE_CONCURRENT + store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv); + #else + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv,NUM_BLOCKS_X); + #endif } diff --git a/lib/gpu/lal_tersoff_zbl.h b/lib/gpu/lal_tersoff_zbl.h index eb03e9fb02..b82b391765 100644 --- a/lib/gpu/lal_tersoff_zbl.h +++ b/lib/gpu/lal_tersoff_zbl.h @@ -65,7 +65,7 @@ class TersoffZBL : public BaseThree { bool shared_types; /// Number of atom types - int _lj_types; + int _ntypes; /// ts1.x = 
lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint UCL_D_Vec ts1; @@ -80,7 +80,7 @@ class TersoffZBL : public BaseThree { /// ts6.x = Z_i, ts6.y = Z_j, ts6.z = ZBLcut, ts6.w = ZBLexpscale UCL_D_Vec ts6; - UCL_D_Vec cutsq; + UCL_D_Vec cutsq_pair; UCL_D_Vec elem2param; UCL_D_Vec map; @@ -91,15 +91,13 @@ class TersoffZBL : public BaseThree { /// zetaij.w = zetaij UCL_D_Vec _zetaij; - UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex; + UCL_Kernel k_zeta, k_zeta_noev, *k_zeta_selt; numtyp _global_e,_global_a_0,_global_epsilon_0; - numtyp _cutshortsq; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; } diff --git a/lib/gpu/lal_tersoff_zbl_ext.cpp b/lib/gpu/lal_tersoff_zbl_ext.cpp index d1a9e090b6..518b535627 100644 --- a/lib/gpu/lal_tersoff_zbl_ext.cpp +++ b/lib/gpu/lal_tersoff_zbl_ext.cpp @@ -70,7 +70,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=TSZMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSZMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -93,7 +93,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=TSZMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSZMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -110,7 +110,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall, fprintf(screen,"\n"); if (init_ok==0) - TSZMF.estimate_gpu_overhead(); + TSZMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_ufm.cpp b/lib/gpu/lal_ufm.cpp index a86d07f340..f6a48d4470 100644 --- a/lib/gpu/lal_ufm.cpp +++ b/lib/gpu/lal_ufm.cpp @@ -131,20 +131,9 @@ double UFMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void UFMT::loop(const bool _eflag, const bool _vflag) { +int UFMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -152,8 +141,8 @@ void UFMT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &uf1, &uf3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &uf1, &uf3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -166,6 +155,7 @@ void UFMT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class UFM; diff --git a/lib/gpu/lal_ufm.cu b/lib/gpu/lal_ufm.cu index 03d1e85bdf..9d6c7b978a 100644 --- a/lib/gpu/lal_ufm.cu +++ b/lib/gpu/lal_ufm.cu 
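The loop() refactor above recurs for each pair style in this series: eflag and vflag now arrive as ints instead of being rebuilt from bools, the fast kernel is launched through the k_pair_sel pointer, and the method returns the launch grid size GX, which the compute() wrappers later pass to copy_answers() as red_blocks. The grid-sizing arithmetic itself is unchanged; as a self-contained sketch with made-up example values (an illustration, not code from the patch):

  #include <cmath>

  // GX = ceil(inum / (BX / t_per_atom)), as in UFMT::loop() above.
  // With BX = 256 threads per block and t_per_atom = 4, each block
  // covers 256/4 = 64 atoms, so inum = 10000 local atoms gives
  // GX = ceil(10000 / 64) = 157 thread blocks.
  static int grid_size(int inum, int BX, int t_per_atom) {
    return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                      (BX / t_per_atom)));
  }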
@@ -40,16 +40,19 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { energy += - factor_lj * uf3[mtype].x*log(1.0 - expuf) - uf3[mtype].z; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -95,9 +98,9 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_ufm_fast(const __global numtyp4 *restrict x_, @@ -116,26 +119,29 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_, __local numtyp4 uf1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 uf3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) uf3[tid]=uf3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { energy += - factor_lj * uf3[mtype].x * log(1.0 - expuf) - uf3[mtype].z; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -181,8 +187,8 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_ufm.h b/lib/gpu/lal_ufm.h index 14b96bcc86..390af831ba 100644 --- a/lib/gpu/lal_ufm.h +++ b/lib/gpu/lal_ufm.h @@ -77,7 +77,7 @@ class UFM : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_ufm_ext.cpp b/lib/gpu/lal_ufm_ext.cpp index 12809a28fb..432cbb2e63 100644 --- a/lib/gpu/lal_ufm_ext.cpp +++ b/lib/gpu/lal_ufm_ext.cpp @@ -57,7 +57,7 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1, int init_ok=0; if (world_me==0) init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); UFMLMF.device->world_barrier(); @@ -75,7 +75,7 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1, } if (gpu_rank==i && world_me!=0) init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); UFMLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index 4af8a0f71c..c343de3f55 100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ 
-50,7 +50,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i const double* gamma, const double* eta, const double* lam1inv, const double* lam4inv, const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, + const double* dvrc, const double* big6w, const double* heta, const double* bigh, const double* bigw, const double* c0, const double* costheta, const double* bigb, @@ -138,8 +138,6 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i dview[i].w=static_cast(r0[i]); } - _cutshortsq = static_cast(r0sqmax); - ucl_copy(param4,dview,false); param4_tex.get_texture(*(this->pair_program),"param4_tex"); param4_tex.bind_float(param4,4); @@ -212,60 +210,33 @@ double VashishtaT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; +int VashishtaT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); - - // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 - // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); this->time_pair.start(); - // note that k_pair does not run with the short neighbor list - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom); + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, ¶m4, &map, &elem2param, + &_nelements, &_nparams, &this->nbor->dev_nbor, + &this->nbor->dev_packed, &ainum, &nbor_pitch, + &this->_threads_per_atom); + ainum=this->ans->inum(); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, + ¶m5, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); + Answer *end_ans; #ifdef THREE_CONCURRENT end_ans=this->ans2; @@ -274,23 +245,34 @@ void VashishtaT::loop(const bool 
_eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, + ¶m4, ¶m5, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, + ¶m5, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + // note that k_pair does not run with the short neighbor list + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, + &map, &elem2param, &_nelements, &this->nbor->dev_packed, + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &GX); + this->time_pair.stop(); + return GX; } template class Vashishta; diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index da15aaf09a..6c9ba14b4a 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -32,6 +32,14 @@ _texture( param4_tex,int4); _texture( param5_tex,int4); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define param1_tex param1 +#define param2_tex param2 +#define param3_tex param3 +#define param4_tex param4 +#define param5_tex param5 +#endif + #else #define pos_tex x_ #define param1_tex param1 @@ -41,92 +49,167 @@ _texture( param5_tex,int4); #define param5_tex param5 #endif + + #define THIRD (numtyp)0.66666666666666666667 //#define THREE_CONCURRENT -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_ELLIPSE]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ 
- if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii0) + if (EVFLAG && eflag) energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0); - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -293,11 +381,10 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, } } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + const int tid=THREAD_ID_X; + store_answers_p(f,energy,virial,ii,inum,tid,1,0,eflag,vflag,ans,engv, + ev_stride); } #define threebody(delr1x, delr1y, delr1z, eflag, energy) \ @@ -344,9 +431,9 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, fky = delr2y*(frad2+csfac2)-delr1y*facang12; \ fkz = delr2z*(frad2+csfac2)-delr1z*facang12; \ \ - if (eflag>0) \ + if (EVFLAG && eflag) \ energy+=facrad; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ virial[0] += delr1x*fjx + delr2x*fkx; \ virial[1] += delr1y*fjy + delr2y*fky; \ virial[2] += delr1z*fjz + delr2z*fkz; \ @@ -402,54 +489,45 @@ __kernel void 
k_vashishta_three_center(const __global numtyp4 *restrict x_, const __global int *restrict elem2param, const int nelements, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + int n_stride; + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp param_gamma_ij, param_r0sq_ij, param_r0_ij, param_gamma_ik, param_r0sq_ik, param_r0_ik; numtyp param_costheta_ijk, param_bigc_ijk, param_bigb_ijk, param_big2b_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - acctyp energy=(acctyp)0; + local_allocate_store_three(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii { int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *screen, int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* cutsq, const double* r0, + const double* cutsq, const double* r0, const double* gamma, const double* eta, const double* lam1inv, const double* lam4inv, const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, + const double* dvrc, const double* big6w, const double* heta, const double* bigh, const double* bigw, const double* c0, const double* costheta, const double* bigb, @@ -82,13 +82,12 @@ class Vashishta : public BaseThree { UCL_D_Vec elem2param; UCL_D_Vec map; int _nparams,_nelements; - numtyp _cutshortsq; UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; diff --git a/lib/gpu/lal_vashishta_ext.cpp b/lib/gpu/lal_vashishta_ext.cpp index 56dfd8a0ff..ecbdefed19 100644 --- a/lib/gpu/lal_vashishta_ext.cpp +++ b/lib/gpu/lal_vashishta_ext.cpp @@ -32,7 +32,7 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i const double* gamma, const double* eta, const double* lam1inv, const double* lam4inv, const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, + const double* dvrc, const double* big6w, const double* heta, const double* bigh, const double* bigw, const double* c0, const double* costheta, const double* bigb, @@ -63,10 +63,10 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i int init_ok=0; if (world_me==0) - init_ok=VashishtaMF.init(ntypes, inum, nall, 500, cell_size, gpu_split, screen, + init_ok=VashishtaMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - cutsq, r0, gamma, eta, lam1inv, - lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, + cutsq, r0, gamma, eta, lam1inv, + lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, c0, costheta, bigb, big2b, bigc); VashishtaMF.device->world_barrier(); @@ -83,10 +83,10 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=VashishtaMF.init(ntypes, inum, nall, 500, 
cell_size, gpu_split, screen, + init_ok=VashishtaMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - cutsq, r0, gamma, eta, lam1inv, - lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, + cutsq, r0, gamma, eta, lam1inv, + lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, c0, costheta, bigb, big2b, bigc); VashishtaMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp index 453139e537..707f60f071 100644 --- a/lib/gpu/lal_yukawa.cpp +++ b/lib/gpu/lal_yukawa.cpp @@ -109,20 +109,9 @@ double YukawaT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void YukawaT::loop(const bool _eflag, const bool _vflag) { +int YukawaT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -130,8 +119,8 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff, &_kappa, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -144,6 +133,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Yukawa; diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu index 62bc013dc6..6ebd2dc06d 100644 --- a/lib/gpu/lal_yukawa.cu +++ b/lib/gpu/lal_yukawa.cu @@ -38,22 +38,25 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x*screening*rinv; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -104,9 +107,9 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, @@ -124,25 +127,28 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x*screening*rinv; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG 
&& vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -193,8 +199,8 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_yukawa.h b/lib/gpu/lal_yukawa.h index 7d638d760e..51871a9728 100644 --- a/lib/gpu/lal_yukawa.h +++ b/lib/gpu/lal_yukawa.h @@ -72,7 +72,7 @@ class Yukawa : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp index 46d4d64328..a447bb3889 100644 --- a/lib/gpu/lal_yukawa_colloid.cpp +++ b/lib/gpu/lal_yukawa_colloid.cpp @@ -133,10 +133,25 @@ double YukawaColloidT::host_memory_usage() const { template void YukawaColloidT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, - int *numj, int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *rad) { + int *numj, int **firstneigh, const bool eflag_in, + const bool vflag_in, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success, + double *rad) { this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); // ------------------- Resize rad array -------------------------- @@ -177,8 +192,8 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, this->atom->add_x_data(host_x,host_type); this->add_rad_data(); - this->loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=this->loop(eflag,vflag); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -187,14 +202,28 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, // Reneighbor on GPU and then compute per-atom densities // --------------------------------------------------------------------------- template -int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, +int** YukawaColloidT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, + tagint **special, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *rad) { this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); // ------------------- Resize rad array ---------------------------- @@ -240,8 +269,8 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall 
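The two compute() wrappers above fold the eflag_in/vflag_in/eatom/vatom booleans into the integer eflag and vflag values handed to set_kernel() and loop(). A minimal stand-alone sketch of that encoding follows; the helper name is made up for the example, and the numeric meanings in the comments are inferred from the surrounding code rather than stated in the patch:

  // Mirrors the flag handling in YukawaColloidT::compute() above.
  // Inferred meaning of the values: 0 = no accumulation, 1 = global
  // energy/virial only, 2 = per-atom arrays requested.  Defining
  // LAL_NO_BLOCK_REDUCE promotes 1 to 2, which by its name bypasses
  // the block-level reduction path.
  static void encode_ev_flags(bool eflag_in, bool vflag_in,
                              bool eatom, bool vatom,
                              int &eflag, int &vflag) {
    if (eatom) eflag = 2;
    else if (eflag_in) eflag = 1;
    else eflag = 0;
    if (vatom) vflag = 2;
    else if (vflag_in) vflag = 1;
    else vflag = 0;
  #ifdef LAL_NO_BLOCK_REDUCE
    if (eflag) eflag = 2;
    if (vflag) vflag = 2;
  #endif
  }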
*ilist=this->nbor->host_ilist.begin(); *jnum=this->nbor->host_acc.begin(); - this->loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=this->loop(eflag,vflag); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -252,20 +281,9 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall // Calculate per-atom energies and forces // --------------------------------------------------------------------------- template -void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { +int YukawaColloidT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -273,8 +291,8 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &c_rad, &coeff, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &c_rad, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); @@ -286,6 +304,7 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); } this->time_pair.stop(); + return GX; } template class YukawaColloid; diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu index 30b458fec7..847ffa6d80 100644 --- a/lib/gpu/lal_yukawa_colloid.cu +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -24,6 +24,10 @@ _texture_2d( pos_tex,int4); _texture( rad_tex,int2); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define rad_tex rad_ +#endif + #else #define pos_tex x_ #define rad_tex rad_ @@ -45,22 +49,25 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x/kappa * screening; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -113,9 +120,9 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, @@ -134,25 +141,28 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) 
sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x/kappa * screening; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -205,8 +215,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_yukawa_colloid.h b/lib/gpu/lal_yukawa_colloid.h index 607bc42321..a08248dd3a 100644 --- a/lib/gpu/lal_yukawa_colloid.h +++ b/lib/gpu/lal_yukawa_colloid.h @@ -114,7 +114,7 @@ class YukawaColloid : public BaseAtomic { private: bool _shared_view; bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp index 988d33bdd6..db86f91689 100644 --- a/lib/gpu/lal_yukawa_colloid_ext.cpp +++ b/lib/gpu/lal_yukawa_colloid_ext.cpp @@ -55,7 +55,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, } if (gpu_rank==i && world_me!=0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp index 995694bdfd..cf2bf89e3d 100644 --- a/lib/gpu/lal_yukawa_ext.cpp +++ b/lib/gpu/lal_yukawa_ext.cpp @@ -55,7 +55,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, int init_ok=0; if (world_me==0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); YKMF.device->world_barrier(); @@ -73,7 +73,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, } if (gpu_rank==i && world_me!=0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); YKMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_zbl.cpp b/lib/gpu/lal_zbl.cpp index 2bf3369174..885f6f10bb 100644 --- a/lib/gpu/lal_zbl.cpp +++ b/lib/gpu/lal_zbl.cpp @@ -118,20 +118,9 @@ double ZBLT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void ZBLT::loop(const bool _eflag, const bool _vflag) { +int ZBLT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -139,8 +128,8 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) { int 
nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &coeff3, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &coeff3, &_cut_globalsq, &_cut_innersq, &_cut_inner, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -154,6 +143,7 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class ZBL; diff --git a/lib/gpu/lal_zbl.cu b/lib/gpu/lal_zbl.cu index 2539c0ddd7..09e1b4f6bb 100644 --- a/lib/gpu/lal_zbl.cu +++ b/lib/gpu/lal_zbl.cu @@ -95,17 +95,20 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z); e += coeff3[mtype].z; @@ -151,7 +154,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_, } energy+=e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -162,9 +165,9 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, @@ -186,25 +189,28 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + int n_stride; + local_allocate_store_pair(); + if (tid0) { + if (EVFLAG && eflag) { numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z); e += coeff3[mtype].z; @@ -251,7 +257,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, } energy+=e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -262,8 +268,8 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_zbl.h b/lib/gpu/lal_zbl.h index e205d326c6..af4f1b2eac 100644 --- a/lib/gpu/lal_zbl.h +++ b/lib/gpu/lal_zbl.h @@ -76,7 +76,7 @@ class ZBL : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_zbl_ext.cpp b/lib/gpu/lal_zbl_ext.cpp index f15e814a50..ee7794af2d 100644 --- a/lib/gpu/lal_zbl_ext.cpp +++ b/lib/gpu/lal_zbl_ext.cpp @@ -58,7 +58,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, 
host_sw3, host_sw4, host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze, cut_globalsq, cut_innersq, cut_inner, - inum, nall, 300, maxspecial, cell_size, gpu_split, screen); + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); ZBLMF.device->world_barrier(); if (message) @@ -77,7 +77,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze, cut_globalsq, cut_innersq, cut_inner, - inum, nall, 300, maxspecial, cell_size, gpu_split, screen); + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); ZBLMF.device->gpu_barrier(); if (message) diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 1fefb01d42..1767623314 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -30,6 +30,16 @@ action () { action fix_gpu.cpp action fix_gpu.h +action fix_nve_gpu.h +action fix_nve_gpu.cpp +action fix_nh_gpu.h +action fix_nh_gpu.cpp +action fix_nvt_gpu.h +action fix_nvt_gpu.cpp +action fix_npt_gpu.h +action fix_npt_gpu.cpp +action fix_nve_asphere_gpu.h fix_nve_asphere.h +action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp action gpu_extra.h action pair_beck_gpu.cpp action pair_beck_gpu.h @@ -83,6 +93,8 @@ action pair_lj96_cut_gpu.cpp action pair_lj96_cut_gpu.h action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp action pair_lj_charmm_coul_long_gpu.h pair_lj_charmm_coul_long.cpp +action pair_lj_charmm_coul_charmm_gpu.cpp pair_lj_charmm_coul_charmm.cpp +action pair_lj_charmm_coul_charmm_gpu.h pair_lj_charmm_coul_charmm.cpp action pair_lj_class2_coul_long_gpu.cpp pair_lj_class2_coul_long.cpp action pair_lj_class2_coul_long_gpu.h pair_lj_class2_coul_long.cpp action pair_lj_class2_gpu.cpp pair_lj_class2.cpp @@ -159,6 +171,7 @@ if (test $1 = 1) then sed -i -e 's|^PKG_SYSINC =[ \t]*|&$(gpu_SYSINC) |' ../Makefile.package sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(gpu_SYSPATH) |' ../Makefile.package + sed -i -e 's|^PKG_INC =[ \t]*|&-DLMP_GPU |' ../Makefile.package fi if (test -e ../Makefile.package.settings) then diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 8f88dfd61d..efbaa6e1f8 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -15,6 +15,7 @@ #include #include "atom.h" +#include "comm.h" #include "force.h" #include "pair.h" #include "pair_hybrid.h" @@ -38,14 +39,19 @@ using namespace FixConst; enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH}; extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, + const int ngpu, const int first_gpu_id, const int gpu_mode, const double particle_split, const int nthreads, const int t_per_atom, - const double cell_size, char *opencl_flags, + const double cell_size, char *opencl_args, + const int ocl_platform, char *device_type_flags, const int block_pair); extern void lmp_clear_device(); extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); + double **vatom, double *virial, double &ecoul, + int &err_flag); +extern double lmp_gpu_update_bin_size(const double subx, const double suby, + const double subz, const int nlocal, + const double cut); static const char cite_gpu_package[] = "GPU package (short-range, long-range and three-body potentials):\n\n" @@ -105,10 +111,13 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (narg < 4) error->all(FLERR,"Illegal package gpu 
command"); + // If ngpu is 0, autoset ngpu to the number of devices per node matching + // best device int ngpu = atoi(arg[3]); - if (ngpu <= 0) error->all(FLERR,"Illegal package gpu command"); - int first_gpu = 0; - int last_gpu = ngpu-1; + if (ngpu < 0) error->all(FLERR,"Illegal package gpu command"); + + // Negative value indicate GPU package should find the best device ID + int first_gpu_id = -1; // options @@ -118,9 +127,11 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : int newtonflag = 0; int threads_per_atom = -1; double binsize = 0.0; - char *opencl_flags = nullptr; + char *opencl_args = nullptr; int block_pair = -1; int pair_only_flag = 0; + int ocl_platform = -1; + char *device_type_flags = nullptr; int iarg = 4; while (iarg < narg) { @@ -149,10 +160,9 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : error->all(FLERR,"Illegal package GPU command"); iarg += 2; } else if (strcmp(arg[iarg],"gpuID") == 0) { - if (iarg+3 > narg) error->all(FLERR,"Illegal package gpu command"); - first_gpu = utils::inumeric(FLERR,arg[iarg+1],false,lmp); - last_gpu = utils::inumeric(FLERR,arg[iarg+2],false,lmp); - iarg += 3; + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + first_gpu_id = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + iarg += 2; } else if (strcmp(arg[iarg],"tpa") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); threads_per_atom = utils::inumeric(FLERR,arg[iarg+1],false,lmp); @@ -162,9 +172,13 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : nthreads = utils::inumeric(FLERR,arg[iarg+1],false,lmp); if (nthreads < 1) error->all(FLERR,"Illegal fix GPU command"); iarg += 2; - } else if (strcmp(arg[iarg],"device") == 0) { + } else if (strcmp(arg[iarg],"platform") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); - opencl_flags = arg[iarg+1]; + ocl_platform = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + iarg += 2; + } else if (strcmp(arg[iarg],"device_type") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + device_type_flags = arg[iarg+1]; iarg += 2; } else if (strcmp(arg[iarg],"blocksize") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); @@ -176,10 +190,14 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+1],"on") == 0) pair_only_flag = 1; else error->all(FLERR,"Illegal package gpu command"); iarg += 2; + } else if (strcmp(arg[iarg],"ocl_args") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + opencl_args = arg[iarg+1]; + iarg += 2; } else error->all(FLERR,"Illegal package gpu command"); } - #ifndef _OPENMP + #if (LAL_USE_OMP == 0) if (nthreads > 1) error->all(FLERR,"No OpenMP support compiled in"); #endif @@ -207,10 +225,11 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : // change binsize default (0.0) to -1.0 used by GPU lib if (binsize == 0.0) binsize = -1.0; - int gpu_flag = lmp_init_device(universe->uworld, world, first_gpu, last_gpu, + _binsize = binsize; + int gpu_flag = lmp_init_device(universe->uworld, world, ngpu, first_gpu_id, _gpu_mode, _particle_split, nthreads, - threads_per_atom, binsize, opencl_flags, - block_pair); + threads_per_atom, binsize, opencl_args, + ocl_platform, device_type_flags, block_pair); GPU_EXTRA::check_flag(gpu_flag,error,world); } @@ -296,9 +315,15 @@ void FixGPU::post_force(int /* vflag */) timer->stamp(); double lvirial[6]; for (int i = 0; i < 6; i++) lvirial[i] = 0.0; + int err_flag; double my_eng = lmp_gpu_forces(atom->f, 
atom->torque, force->pair->eatom, force->pair->vatom, lvirial, - force->pair->eng_coul); + force->pair->eng_coul, err_flag); + if (err_flag) { + if (err_flag==1) + error->one(FLERR, + "Too many neighbors on GPU. Use neigh_modify one to increase limit."); + } force->pair->eng_vdwl += my_eng; force->pair->virial[0] += lvirial[0]; @@ -335,3 +360,12 @@ double FixGPU::memory_usage() return bytes; } +double FixGPU::binsize(const double subx, const double suby, + const double subz, const int nlocal, + const double cut) { + if (_binsize > 0.0) return _binsize; + else if (_gpu_mode == GPU_FORCE || comm->cutghostuser) + return cut * 0.5; + else + return lmp_gpu_update_bin_size(subx, suby, subz, nlocal, cut); +} diff --git a/src/GPU/fix_gpu.h b/src/GPU/fix_gpu.h index ba0b4c83cb..29a0907915 100644 --- a/src/GPU/fix_gpu.h +++ b/src/GPU/fix_gpu.h @@ -37,10 +37,14 @@ class FixGPU : public Fix { void post_force_respa(int, int, int); double memory_usage(); + double binsize(const double subx, const double suby, + const double subz, const int nlocal, const double cut); + private: int _gpu_mode; int _nlevels_respa; double _particle_split; + double _binsize; }; } @@ -78,4 +82,11 @@ E: Cannot use neigh_modify exclude with GPU neighbor builds This is a current limitation of the GPU implementation in LAMMPS. +E: Too many neighbors on GPU. Use neigh_modify one to increase limit. + +The expected maximum number of neighbors is determined in the GPU package +automatically. This error means the actual number of neighbors is exceeding +the expected value. Use neigh_modify one command to increase GPU allocations +(e.g. doubling this value doubles the GPU allocation). + */ diff --git a/src/GPU/fix_nh_gpu.cpp b/src/GPU/fix_nh_gpu.cpp new file mode 100644 index 0000000000..8b57289a50 --- /dev/null +++ b/src/GPU/fix_nh_gpu.cpp @@ -0,0 +1,552 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "fix_nh_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "update.h" + +#include +#include + +using namespace LAMMPS_NS; +using namespace FixConst; + +#define TILTMAX 1.5 + +enum{NOBIAS,BIAS}; +enum{ISO,ANISO,TRICLINIC}; + +typedef struct { double x,y,z; } dbl3_t; + +/* ---------------------------------------------------------------------- + NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion + ---------------------------------------------------------------------- */ + +FixNHGPU::FixNHGPU(LAMMPS *lmp, int narg, char **arg) : + FixNH(lmp, narg, arg) +{ + _dtfm = 0; + _nlocal3 = 0; + _nlocal_max = 0; +} + +/* ---------------------------------------------------------------------- */ + +FixNHGPU::~FixNHGPU() +{ +} + +/* ---------------------------------------------------------------------- */ + +void FixNHGPU::setup(int vflag) +{ + FixNH::setup(vflag); + if (strstr(update->integrate_style,"respa")) + _respa_on = 1; + else + _respa_on = 0; + reset_dt(); +} + +/* ---------------------------------------------------------------------- + change box size + remap all atoms or dilate group atoms depending on allremap flag + if rigid bodies exist, scale rigid body centers-of-mass +------------------------------------------------------------------------- */ + +void FixNHGPU::remap() +{ + if (_respa_on) { FixNH::remap(); return; } + + double oldlo,oldhi; + double expfac; + + dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + int *mask = atom->mask; + int nlocal = atom->nlocal; + double *h = domain->h; + + // omega is not used, except for book-keeping + + for (int i = 0; i < 6; i++) omega[i] += dto*omega_dot[i]; + + // convert pertinent atoms and rigid bodies to lamda coords + const double hi0 = domain->h_inv[0]; + const double hi1 = domain->h_inv[1]; + const double hi2 = domain->h_inv[2]; + const double hi3 = domain->h_inv[3]; + const double hi4 = domain->h_inv[4]; + const double hi5 = domain->h_inv[5]; + const double b0 = domain->boxlo[0]; + const double b1 = domain->boxlo[1]; + const double b2 = domain->boxlo[2]; + + if (allremap) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + const double d0 = x[i].x - b0; + const double d1 = x[i].y - b1; + const double d2 = x[i].z - b2; + x[i].x = hi0*d0 + hi5*d1 + hi4*d2; + x[i].y = hi1*d1 + hi3*d2; + x[i].z = hi2*d2; + } + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + if (mask[i] & dilate_group_bit) { + const double d0 = x[i].x - b0; + const double d1 = x[i].y - b1; + const double d2 = x[i].z - b2; + x[i].x = hi0*d0 + hi5*d1 + hi4*d2; + x[i].y = hi1*d1 + hi3*d2; + x[i].z = hi2*d2; + } + } + } + + if (nrigid) + for (int i = 0; i < nrigid; i++) + modify->fix[rfix[i]]->deform(0); + + // reset global and local box to new size/shape + + // this operation corresponds to applying the + // translate and scale operations + // corresponding to the solution of the following ODE: + // + // h_dot = omega_dot * h + // + // where h_dot, omega_dot and h are all upper-triangular + // 3x3 tensors. 
In Voigt notation, the elements of the + // RHS product tensor are: + // h_dot = [0*0, 1*1, 2*2, 1*3+3*2, 0*4+5*3+4*2, 0*5+5*1] + // + // Ordering of operations preserves time symmetry. + + double dto2 = dto/2.0; + double dto4 = dto/4.0; + double dto8 = dto/8.0; + + // off-diagonal components, first half + + if (pstyle == TRICLINIC) { + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + + if (p_flag[3]) { + expfac = exp(dto4*omega_dot[1]); + h[3] *= expfac; + h[3] += dto2*(omega_dot[3]*h[2]); + h[3] *= expfac; + } + + if (p_flag[5]) { + expfac = exp(dto4*omega_dot[0]); + h[5] *= expfac; + h[5] += dto2*(omega_dot[5]*h[1]); + h[5] *= expfac; + } + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + } + + // scale diagonal components + // scale tilt factors with cell, if set + + if (p_flag[0]) { + oldlo = domain->boxlo[0]; + oldhi = domain->boxhi[0]; + expfac = exp(dto*omega_dot[0]); + domain->boxlo[0] = (oldlo-fixedpoint[0])*expfac + fixedpoint[0]; + domain->boxhi[0] = (oldhi-fixedpoint[0])*expfac + fixedpoint[0]; + } + + if (p_flag[1]) { + oldlo = domain->boxlo[1]; + oldhi = domain->boxhi[1]; + expfac = exp(dto*omega_dot[1]); + domain->boxlo[1] = (oldlo-fixedpoint[1])*expfac + fixedpoint[1]; + domain->boxhi[1] = (oldhi-fixedpoint[1])*expfac + fixedpoint[1]; + if (scalexy) h[5] *= expfac; + } + + if (p_flag[2]) { + oldlo = domain->boxlo[2]; + oldhi = domain->boxhi[2]; + expfac = exp(dto*omega_dot[2]); + domain->boxlo[2] = (oldlo-fixedpoint[2])*expfac + fixedpoint[2]; + domain->boxhi[2] = (oldhi-fixedpoint[2])*expfac + fixedpoint[2]; + if (scalexz) h[4] *= expfac; + if (scaleyz) h[3] *= expfac; + } + + // off-diagonal components, second half + + if (pstyle == TRICLINIC) { + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + + if (p_flag[3]) { + expfac = exp(dto4*omega_dot[1]); + h[3] *= expfac; + h[3] += dto2*(omega_dot[3]*h[2]); + h[3] *= expfac; + } + + if (p_flag[5]) { + expfac = exp(dto4*omega_dot[0]); + h[5] *= expfac; + h[5] += dto2*(omega_dot[5]*h[1]); + h[5] *= expfac; + } + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + + } + + domain->yz = h[3]; + domain->xz = h[4]; + domain->xy = h[5]; + + // tilt factor to cell length ratio can not exceed TILTMAX in one step + + if (domain->yz < -TILTMAX*domain->yprd || + domain->yz > TILTMAX*domain->yprd || + domain->xz < -TILTMAX*domain->xprd || + domain->xz > TILTMAX*domain->xprd || + domain->xy < -TILTMAX*domain->xprd || + domain->xy > TILTMAX*domain->xprd) + error->all(FLERR,"Fix npt/nph has tilted box too far in one step - " + "periodic cell is too far from equilibrium state"); + + domain->set_global_box(); + domain->set_local_box(); + + // convert pertinent atoms and rigid bodies back to box coords + const double h0 = domain->h[0]; + const double h1 = domain->h[1]; + const double h2 = domain->h[2]; + const double h3 = domain->h[3]; + const double h4 = domain->h[4]; + const double h5 = domain->h[5]; + const double nb0 = domain->boxlo[0]; + const double nb1 = domain->boxlo[1]; + const double nb2 = domain->boxlo[2]; + + if (allremap) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif 
(LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; + x[i].y = h1*x[i].y + h3*x[i].z + nb1; + x[i].z = h2*x[i].z + nb2; + } + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + if (mask[i] & dilate_group_bit) { + x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; + x[i].y = h1*x[i].y + h3*x[i].z + nb1; + x[i].z = h2*x[i].z + nb2; + } + } + } + + if (nrigid) + for (int i = 0; i < nrigid; i++) + modify->fix[rfix[i]]->deform(1); +} + +/* ---------------------------------------------------------------------- + 2nd half of Verlet update +------------------------------------------------------------------------- */ + +void FixNHGPU::final_integrate() { + if (neighbor->ago == 0 && _respa_on == 0) reset_dt(); + FixNH::final_integrate(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNHGPU::reset_dt() +{ + if (_respa_on) { FixNH::reset_dt(); return; } + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + dthalf = 0.5 * update->dt; + dt4 = 0.25 * update->dt; + dt8 = 0.125 * update->dt; + dto = dthalf; + + if (pstat_flag) + pdrag_factor = 1.0 - (update->dt * p_freq_max * drag / nc_pchain); + + if (tstat_flag) + tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain); + + const int * const mask = atom->mask; + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + + if (nlocal > _nlocal_max) { + if (_nlocal_max) memory->destroy(_dtfm); + _nlocal_max = static_cast(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nh_gpu:dtfm"); + } + + _nlocal3 = nlocal * 3; + + if (igroup == 0) { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = 0; + for (int i = 0; i < nlocal; i++) { + const double dtfir = dtf / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = 0; + for (int i = 0; i < nlocal; i++) { + const double dtfim = dtf / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } + } + } else { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = 0; + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + const double dtfir = dtf / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = 0; + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + const double dtfim = dtf / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step barostat scaling of velocities +-----------------------------------------------------------------------*/ + +void FixNHGPU::nh_v_press() +{ + if (pstyle == TRICLINIC || which == BIAS || _respa_on) { + FixNH::nh_v_press(); + return; + } + + dbl3_t * _noalias const v = (dbl3_t *)atom->v[0]; + int *mask = atom->mask; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + double f0 = 
exp(-dt4*(omega_dot[0]+mtk_term2)); + double f1 = exp(-dt4*(omega_dot[1]+mtk_term2)); + double f2 = exp(-dt4*(omega_dot[2]+mtk_term2)); + f0 *= f0; + f1 *= f1; + f2 *= f2; + + if (igroup == 0) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + v[i].x *= f0; + v[i].y *= f1; + v[i].z *= f2; + } + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + v[i].x *= f0; + v[i].y *= f1; + v[i].z *= f2; + } + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step update of velocities +-----------------------------------------------------------------------*/ + +void FixNHGPU::nve_v() +{ + if (_respa_on) { FixNH::nve_v(); return; } + + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) + v[i] += _dtfm[i] * f[i]; +} + +/* ---------------------------------------------------------------------- + perform full-step update of positions +-----------------------------------------------------------------------*/ + +void FixNHGPU::nve_x() +{ + if (_respa_on) { FixNH::nve_x(); return; } + + double * _noalias const x = atom->x[0]; + double * _noalias const v = atom->v[0]; + + // x update by full step only for atoms in group + + if (igroup == 0) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) + x[i] += dtv * v[i]; + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) { + if (_dtfm[i] != 0.0) + x[i] += dtv * v[i]; + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step thermostat scaling of velocities +-----------------------------------------------------------------------*/ + +void FixNHGPU::nh_v_temp() +{ + if (which == BIAS || _respa_on) { + FixNH::nh_v_temp(); + return; + } + + double * _noalias const v = atom->v[0]; + + if (igroup == 0) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) + v[i] *= factor_eta; + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) { + if (_dtfm[i] != 0.0) + v[i] *= factor_eta; + } + } +} + +double FixNHGPU::memory_usage() +{ + return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double); +} diff --git a/src/GPU/fix_nh_gpu.h b/src/GPU/fix_nh_gpu.h new file mode 100644 index 0000000000..edd210e813 --- /dev/null +++ b/src/GPU/fix_nh_gpu.h @@ -0,0 +1,164 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + 
http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifndef LMP_FIX_NH_GPU_H +#define LMP_FIX_NH_GPU_H + +#include "fix_nh.h" + +namespace LAMMPS_NS { + +class FixNHGPU : public FixNH { + public: + FixNHGPU(class LAMMPS *, int, char **); + virtual ~FixNHGPU(); + virtual void setup(int vflag); + void reset_dt(); + virtual void final_integrate(); + virtual double memory_usage(); + + protected: + double *_dtfm; + int _nlocal3, _nlocal_max, _respa_on; + + virtual void remap(); + virtual void nve_x(); + virtual void nve_v(); + virtual void nh_v_press(); + virtual void nh_v_temp(); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Target temperature for fix nvt/npt/nph cannot be 0.0 + +Self-explanatory. + +E: Invalid fix nvt/npt/nph command for a 2d simulation + +Cannot control z dimension in a 2d model. + +E: Fix nvt/npt/nph dilate group ID does not exist + +Self-explanatory. + +E: Invalid fix nvt/npt/nph command pressure settings + +If multiple dimensions are coupled, those dimensions must be +specified. + +E: Cannot use fix nvt/npt/nph on a non-periodic dimension + +When specifying a diagonal pressure component, the dimension must be +periodic. + +E: Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension + +When specifying an off-diagonal pressure component, the 2nd of the two +dimensions must be periodic. E.g. if the xy component is specified, +then the y dimension must be periodic. + +E: Cannot use fix nvt/npt/nph with yz scaling when z is non-periodic dimension + +The 2nd dimension in the barostatted tilt factor must be periodic. + +E: Cannot use fix nvt/npt/nph with xz scaling when z is non-periodic dimension + +The 2nd dimension in the barostatted tilt factor must be periodic. + +E: Cannot use fix nvt/npt/nph with xy scaling when y is non-periodic dimension + +The 2nd dimension in the barostatted tilt factor must be periodic. + +E: Cannot use fix nvt/npt/nph with both yz dynamics and yz scaling + +Self-explanatory. + +E: Cannot use fix nvt/npt/nph with both xz dynamics and xz scaling + +Self-explanatory. + +E: Cannot use fix nvt/npt/nph with both xy dynamics and xy scaling + +Self-explanatory. + +E: Can not specify Pxy/Pxz/Pyz in fix nvt/npt/nph with non-triclinic box + +Only triclinic boxes can be used with off-diagonal pressure components. +See the region prism command for details. + +E: Invalid fix nvt/npt/nph pressure settings + +Settings for coupled dimensions must be the same. + +E: Fix nvt/npt/nph damping parameters must be > 0.0 + +Self-explanatory. + +E: Cannot use fix npt and fix deform on same component of stress tensor + +This would be changing the same box dimension twice. 
+ +E: Temperature ID for fix nvt/npt does not exist + +Self-explanatory. + +E: Pressure ID for fix npt/nph does not exist + +Self-explanatory. + +E: Fix npt/nph has tilted box too far in one step - periodic cell is too far from equilibrium state + +Self-explanatory. The change in the box tilt is too extreme +on a short timescale. + +E: Could not find fix_modify temperature ID + +The compute ID for computing temperature does not exist. + +E: Fix_modify temperature ID does not compute temperature + +The compute ID assigned to the fix must compute temperature. + +W: Temperature for fix modify is not for group all + +The temperature compute is being used with a pressure calculation +which does operate on group all, so this may be inconsistent. + +E: Pressure ID for fix modify does not exist + +Self-explanatory. + +E: Could not find fix_modify pressure ID + +The compute ID for computing pressure does not exist. + +E: Fix_modify pressure ID does not compute pressure + +The compute ID assigned to the fix must compute pressure. + +*/ diff --git a/src/GPU/fix_npt_gpu.cpp b/src/GPU/fix_npt_gpu.cpp new file mode 100644 index 0000000000..2ba0be29e0 --- /dev/null +++ b/src/GPU/fix_npt_gpu.cpp @@ -0,0 +1,68 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include "fix_npt_gpu.h" +#include "modify.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +FixNPTGPU::FixNPTGPU(LAMMPS *lmp, int narg, char **arg) : + FixNHGPU(lmp, narg, arg) +{ + if (!tstat_flag) + error->all(FLERR,"Temperature control must be used with fix npt/omp"); + if (!pstat_flag) + error->all(FLERR,"Pressure control must be used with fix npt/omp"); + + // create a new compute temp style + // id = fix-ID + temp + // compute group = all since pressure is always global (group all) + // and thus its KE/temperature contribution should use group all + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = (char *) "all"; + newarg[2] = (char *) "temp"; + + modify->add_compute(3,newarg); + delete [] newarg; + tcomputeflag = 1; + + // create a new compute pressure style + // id = fix-ID + press, compute group = all + // pass id_temp as 4th arg to pressure constructor + + n = strlen(id) + 7; + id_press = new char[n]; + strcpy(id_press,id); + strcat(id_press,"_press"); + + newarg = new char*[4]; + newarg[0] = id_press; + newarg[1] = (char *) "all"; + newarg[2] = (char *) "pressure"; + newarg[3] = id_temp; + modify->add_compute(4,newarg); + delete [] newarg; + pcomputeflag = 1; +} diff --git a/src/GPU/fix_npt_gpu.h b/src/GPU/fix_npt_gpu.h new file mode 100644 index 0000000000..2684935fe5 --- /dev/null +++ b/src/GPU/fix_npt_gpu.h @@ -0,0 +1,52 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale 
Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(npt/gpu,FixNPTGPU) + +#else + +#ifndef LMP_FIX_NPT_GPU_H +#define LMP_FIX_NPT_GPU_H + +#include "fix_nh_gpu.h" + +namespace LAMMPS_NS { + +class FixNPTGPU : public FixNHGPU { + public: + FixNPTGPU(class LAMMPS *, int, char **); + ~FixNPTGPU() {} +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Temperature control must be used with fix npt + +Self-explanatory. + +E: Pressure control must be used with fix npt + +Self-explanatory. + +*/ diff --git a/src/GPU/fix_nve_asphere_gpu.cpp b/src/GPU/fix_nve_asphere_gpu.cpp new file mode 100644 index 0000000000..bf6cfda67d --- /dev/null +++ b/src/GPU/fix_nve_asphere_gpu.cpp @@ -0,0 +1,440 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "fix_nve_asphere_gpu.h" + +#include "atom.h" +#include "atom_vec_ellipsoid.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "memory.h" +#include "neighbor.h" +#include "update.h" +#include +#if (LAL_USE_OMP == 1) +#include +#endif + +using namespace LAMMPS_NS; +using namespace FixConst; + +#define INERTIA 0.2 // moment of inertia prefactor for ellipsoid + +#define ME_qnormalize(q) \ +{ \ + double norm = 1.0 / \ + sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \ + q##_w *= norm; \ + q##_i *= norm; \ + q##_j *= norm; \ + q##_k *= norm; \ +} + +#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \ +{ \ + double wbody_0, wbody_1, wbody_2; \ + double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \ + \ + double w2 = quat##_w * quat##_w; \ + double i2 = quat##_i * quat##_i; \ + double j2 = quat##_j * quat##_j; \ + double k2 = quat##_k * quat##_k; \ + double twoij = 2.0 * quat##_i * quat##_j; \ + double twoik = 2.0 * quat##_i * quat##_k; \ + double twojk = 2.0 * quat##_j * quat##_k; \ + double twoiw = 2.0 * quat##_i * quat##_w; \ + double twojw = 2.0 * quat##_j * quat##_w; \ + double twokw = 2.0 * quat##_k * quat##_w; \ + \ + rot##_0 = w2 + i2 - j2 - k2; \ + rot##_1 = twoij - twokw; \ + rot##_2 = twojw + twoik; \ + \ + rot##_3 = twoij + twokw; \ + rot##_4 = w2 - i2 + j2 - k2; \ + rot##_5 = twojk - twoiw; \ + \ + rot##_6 = twoik - twojw; \ + rot##_7 = twojk + twoiw; \ + rot##_8 = w2 - i2 - j2 + k2; \ + \ + wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2; \ + wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2; \ + wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2; \ + \ + wbody_0 *= moments_0; \ + wbody_1 *= moments_1; \ + wbody_2 *= moments_2; \ + \ + w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2; \ + w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2; \ + w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2; \ +} + +#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \ +{ \ + angmomin[0] += dtf * torque[0]; \ + double angmom_0 = angmomin[0]; \ + angmomin[1] += dtf * torque[1]; \ + double angmom_1 = angmomin[1]; \ + angmomin[2] += dtf * torque[2]; \ + double angmom_2 = angmomin[2]; \ + \ + double quat_w = quatin[0]; \ + double quat_i = quatin[1]; \ + double quat_j = quatin[2]; \ + double quat_k = quatin[3]; \ + \ + double omega_0, omega_1, omega_2; \ + ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \ + \ + double wq_0, wq_1, wq_2, wq_3; \ + wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \ + wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \ + wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \ + wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \ + \ + double qfull_w, qfull_i, qfull_j, qfull_k; \ + qfull_w = quat_w + dtq * wq_0; \ + qfull_i = quat_i + dtq * wq_1; \ + qfull_j = quat_j + dtq * wq_2; \ + qfull_k = quat_k + dtq * wq_3; \ + ME_qnormalize(qfull); \ + \ + double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \ + qhalf_w = quat_w + 0.5*dtq * wq_0; \ + qhalf_i = quat_i + 0.5*dtq * wq_1; \ + qhalf_j = quat_j + 0.5*dtq * wq_2; \ + qhalf_k = quat_k + 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \ + wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \ + wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \ + wq_2 = qhalf_w*omega_1 + 
omega_2*qhalf_i - omega_0*qhalf_k; \ + wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \ + \ + qhalf_w += 0.5*dtq * wq_0; \ + qhalf_i += 0.5*dtq * wq_1; \ + qhalf_j += 0.5*dtq * wq_2; \ + qhalf_k += 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + quat_w = 2.0*qhalf_w - qfull_w; \ + quat_i = 2.0*qhalf_i - qfull_i; \ + quat_j = 2.0*qhalf_j - qfull_j; \ + quat_k = 2.0*qhalf_k - qfull_k; \ + ME_qnormalize(quat); \ + \ + quatin[0] = quat_w; \ + quatin[1] = quat_i; \ + quatin[2] = quat_j; \ + quatin[3] = quat_k; \ +} + +/* ---------------------------------------------------------------------- */ + +FixNVEAsphereGPU::FixNVEAsphereGPU(LAMMPS *lmp, int narg, char **arg) : + FixNVE(lmp, narg, arg) +{ + _dtfm = 0; + _nlocal_max = 0; + _inertia0 = 0; + _inertia1 = 0; + _inertia2 = 0; +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::init() +{ + avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid"); + if (!avec) + error->all(FLERR,"Compute nve/asphere requires atom style ellipsoid"); + + // check that all particles are finite-size ellipsoids + // no point particles allowed, spherical is OK + + int *ellipsoid = atom->ellipsoid; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + if (ellipsoid[i] < 0) + error->one(FLERR,"Fix nve/asphere requires extended particles"); + + FixNVE::init(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::setup(int vflag) +{ + FixNVE::setup(vflag); + reset_dt(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::initial_integrate(int /*vflag*/) +{ + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + int *ellipsoid = atom->ellipsoid; + double * _noalias const x = atom->x[0]; + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + int *mask = atom->mask; + + double **angmom = atom->angmom; + double **torque = atom->torque; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + // set timestep here since dt may have changed or come via rRESPA + + dtq = 0.5 * dtv; + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + const int ifrom3 = ifrom * 3; + const int ito3 = ito * 3; + #else + const int tid = 0; + const int ifrom = 0; + const int ifrom3 = 0; + const int ito = nlocal; + const int ito3 = nlocal * 3; + #endif + + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += _dtfm[i] * f[i]; + x[i] += dtv * v[i]; + } + + // update angular momentum by 1/2 step + if (igroup == 0) { + #if (LAL_USE_OMP_SIMD == 1) + // Workaround for compiler bug + #ifdef __INTEL_COMPILER + #pragma simd + #else + #pragma omp simd + #endif + #endif + for (int i = ifrom; i < ito; i++) { + double *quat = bonus[ellipsoid[i]].quat; + ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i], + _inertia1[i], _inertia2[i]); + } + } else { + #if (LAL_USE_OMP_SIMD == 1) + // Workaround for compiler bug + #ifdef __INTEL_COMPILER + #pragma simd + #else + #pragma omp simd + #endif + #endif + for (int i = ifrom; i < ito; i++) { + if (mask[i] & groupbit) { + double *quat = 
bonus[ellipsoid[i]].quat; + ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], + _inertia0[i], _inertia1[i], _inertia2[i]); + } + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::final_integrate() +{ + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + double * _noalias const angmom = atom->angmom[0]; + const double * _noalias const torque = atom->torque[0]; + + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + + if (neighbor->ago == 0) { + if (nlocal > _nlocal_max) { + if (_nlocal_max) { + memory->destroy(_dtfm); + memory->destroy(_inertia0); + memory->destroy(_inertia1); + memory->destroy(_inertia2); + } + _nlocal_max = static_cast(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + memory->create(_inertia0, _nlocal_max * 3, "fix_nve_gpu:inertia0"); + memory->create(_inertia1, _nlocal_max * 3, "fix_nve_gpu:inertia1"); + memory->create(_inertia2, _nlocal_max * 3, "fix_nve_gpu:inertia2"); + } + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + const int ifrom3 = ifrom * 3; + const int ito3 = ito * 3; + #else + const int tid = 0; + const int ifrom = 0; + const int ifrom3 = 0; + const int ito = nlocal; + const int ito3 = nlocal * 3; + #endif + + double dtfo; + if (neighbor->ago == 0) dtfo = reset_dt_omp(ifrom, ito, tid); + else dtfo = dtf; + + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += _dtfm[i] * f[i]; + angmom[i] += dtfo * torque[i]; + } + } +} + +void FixNVEAsphereGPU::reset_dt() { + const int nlocal = (igroup == atom->firstgroup) ? 
atom->nfirst : + atom->nlocal; + + if (nlocal > _nlocal_max) { + if (_nlocal_max) { + memory->destroy(_dtfm); + memory->destroy(_inertia0); + memory->destroy(_inertia1); + memory->destroy(_inertia2); + } + _nlocal_max = static_cast(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + memory->create(_inertia0, _nlocal_max * 3, "fix_nve_gpu:inertia0"); + memory->create(_inertia1, _nlocal_max * 3, "fix_nve_gpu:inertia1"); + memory->create(_inertia2, _nlocal_max * 3, "fix_nve_gpu:inertia2"); + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + #else + const int tid = 0; + const int ifrom = 0; + const int ito = nlocal; + #endif + reset_dt_omp(ifrom, ito, tid); + } +} + +double FixNVEAsphereGPU::reset_dt_omp(const int ifrom, const int ito, + const int tid) { + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + int *ellipsoid = atom->ellipsoid; + const int * const mask = atom->mask; + + const double dtfo = 0.5 * update->dt * force->ftm2v; + if (tid == 0) { + dtv = update->dt; + dtf = dtfo; + } + + if (igroup == 0) { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + double *shape = bonus[ellipsoid[i]].shape; + double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia0[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia1[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia2[i] = idot; + } + } else { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + if (mask[i] & groupbit) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + double *shape = bonus[ellipsoid[i]].shape; + double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia0[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia1[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia2[i] = idot; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } + } + return dtfo; +} + +double FixNVEAsphereGPU::memory_usage() +{ + return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double); +} + diff --git a/src/GPU/fix_nve_asphere_gpu.h b/src/GPU/fix_nve_asphere_gpu.h new file mode 100644 index 0000000000..3c67e0e024 --- /dev/null +++ b/src/GPU/fix_nve_asphere_gpu.h @@ -0,0 +1,63 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nve/asphere/gpu,FixNVEAsphereGPU) + +#else + +#ifndef LMP_FIX_NVE_ASPHERE_GPU_H +#define LMP_FIX_NVE_ASPHERE_GPU_H + +#include "fix_nve.h" + +namespace LAMMPS_NS { + +class FixNVEAsphereGPU : public FixNVE { + public: + FixNVEAsphereGPU(class LAMMPS *, int, char **); + void init(); + void setup(int vflag); + void initial_integrate(int); + void final_integrate(); + void reset_dt(); + virtual double memory_usage(); + + private: + double reset_dt_omp(const int, const int, const int); + double *_dtfm, *_inertia0, *_inertia1, *_inertia2; + int _nlocal_max; + double dtq; + class AtomVecEllipsoid *avec; +}; + +} +#endif +#endif + +/* ERROR/WARNING messages: + +E: Compute nve/asphere requires atom style ellipsoid + +Self-explanatory. + +E: Fix nve/asphere requires extended particles + +This fix can only be used for particles with a shape setting. + +*/ diff --git a/src/GPU/fix_nve_gpu.cpp b/src/GPU/fix_nve_gpu.cpp new file mode 100644 index 0000000000..c3dd5b6ae2 --- /dev/null +++ b/src/GPU/fix_nve_gpu.cpp @@ -0,0 +1,291 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "fix_nve_gpu.h" +#include +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "gpu_extra.h" +#include "memory.h" +#include "neighbor.h" +#include "update.h" +#if (LAL_USE_OMP == 1) +#include +#endif + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +FixNVEGPU::FixNVEGPU(LAMMPS *lmp, int narg, char **arg) : + FixNVE(lmp, narg, arg) +{ + _dtfm = 0; + _nlocal_max = 0; +} + +/* ---------------------------------------------------------------------- */ + +FixNVEGPU::~FixNVEGPU() +{ + memory->destroy(_dtfm); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEGPU::setup(int vflag) +{ + FixNVE::setup(vflag); + if (strstr(update->integrate_style,"respa")) + _respa_on = 1; + else + _respa_on = 0; + if (atom->ntypes > 1) reset_dt(); +} + +/* ---------------------------------------------------------------------- + allow for both per-type and per-atom mass +------------------------------------------------------------------------- */ + +void FixNVEGPU::initial_integrate(int vflag) +{ + if (_respa_on) { FixNVE::initial_integrate(vflag); return; } + + // update v and x of atoms in group + + double * _noalias const x = atom->x[0]; + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + const int nlocal3 = nlocal * 3; + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int idelta = nlocal3 / nthreads + 1; + const int ifrom3 = omp_get_thread_num() * idelta; + const int ito3 = MIN(ifrom3 + idelta, nlocal3); + #else + const int ifrom3 = 0; + const int ito3 = nlocal3; + #endif + if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { + const double dtfm = dtf / atom->mass[1]; + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += dtfm * f[i]; + x[i] += dtv * v[i]; + } + } else if (igroup == 0) { + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += _dtfm[i] * f[i]; + x[i] += dtv * v[i]; + } + } else { + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + if (_dtfm[i] != 0.0) { + v[i] += _dtfm[i] * f[i]; + x[i] += dtv * v[i]; + } + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEGPU::final_integrate() +{ + if (_respa_on) { FixNVE::final_integrate(); return; } + // update v of atoms in group + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + const int nlocal = (igroup == atom->firstgroup) ? 
atom->nfirst : + atom->nlocal; + + if (neighbor->ago == 0) { + if (igroup != 0 || atom->ntypes != 1 || atom->rmass) { + if (nlocal > _nlocal_max) { + if (_nlocal_max) memory->destroy(_dtfm); + _nlocal_max = static_cast(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + } + } + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + const int ifrom3 = ifrom * 3; + const int ito3 = ito * 3; + #else + const int tid = 0; + const int ifrom = 0; + const int ifrom3 = 0; + const int ito = nlocal; + const int ito3 = nlocal * 3; + #endif + if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { + const double dtfm = dtf / atom->mass[1]; + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) + v[i] += dtfm * f[i]; + } else if (igroup == 0) { + if (neighbor->ago == 0) reset_dt_omp(ifrom,ito,tid); + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) + v[i] += _dtfm[i] * f[i]; + } else { + if (neighbor->ago == 0) reset_dt_omp(ifrom,ito,tid); + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) + v[i] += _dtfm[i] * f[i]; + } + } +} + +void FixNVEGPU::reset_dt() { + if (_respa_on) { FixNVE::reset_dt(); return; } + if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + } else { + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + if (nlocal > _nlocal_max) { + if (_nlocal_max) memory->destroy(_dtfm); + _nlocal_max = static_cast(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + #else + const int tid = 0; + const int ifrom = 0; + const int ito = nlocal; + #endif + + reset_dt_omp(ifrom, ito, tid); + } + } +} + +void FixNVEGPU::reset_dt_omp(const int ifrom, const int ito, const int tid) { + const double dtfo = 0.5 * update->dt * force->ftm2v; + if (tid == 0) { + dtv = update->dt; + dtf = dtfo; + } + + const int * const mask = atom->mask; + if (igroup == 0) { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + const double dtfim = dtfo / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } + } + } else { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) + if (mask[i] & groupbit) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = ifrom * 3; + for (int 
i = ifrom; i < ito; i++) + if (mask[i] & groupbit) { + const double dtfim = dtfo / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } + } +} + +double FixNVEGPU::memory_usage() +{ + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + return FixNVE::memory_usage() + nlocal * 3 * sizeof(double); +} diff --git a/src/GPU/fix_nve_gpu.h b/src/GPU/fix_nve_gpu.h new file mode 100644 index 0000000000..1042d4eadd --- /dev/null +++ b/src/GPU/fix_nve_gpu.h @@ -0,0 +1,60 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nve/gpu,FixNVEGPU) + +#else + +#ifndef LMP_FIX_NVE_GPU_H +#define LMP_FIX_NVE_GPU_H + +#include "fix_nve.h" + +namespace LAMMPS_NS { + +class FixNVEGPU : public FixNVE { + public: + FixNVEGPU(class LAMMPS *, int, char **); + virtual ~FixNVEGPU(); + virtual void setup(int); + virtual void initial_integrate(int); + virtual void final_integrate(); + virtual void reset_dt(); + virtual double memory_usage(); + + protected: + void reset_dt_omp(const int, const int, const int); + double *_dtfm; + int _nlocal_max, _respa_on; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +*/ diff --git a/src/GPU/fix_nvt_gpu.cpp b/src/GPU/fix_nvt_gpu.cpp new file mode 100644 index 0000000000..7d7826b6bf --- /dev/null +++ b/src/GPU/fix_nvt_gpu.cpp @@ -0,0 +1,50 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include +#include "fix_nvt_gpu.h" +#include "group.h" +#include "modify.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +FixNVTGPU::FixNVTGPU(LAMMPS *lmp, int narg, char **arg) : + FixNHGPU(lmp, narg, arg) +{ + if (!tstat_flag) + error->all(FLERR,"Temperature control must be used with fix nvt"); + if (pstat_flag) + error->all(FLERR,"Pressure control can not be used with fix nvt"); + + // create a new compute temp style + // id = fix-ID + temp + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = group->names[igroup]; + newarg[2] = (char *) "temp"; + + modify->add_compute(3,newarg); + delete [] newarg; + tcomputeflag = 1; +} + diff --git a/src/GPU/fix_nvt_gpu.h b/src/GPU/fix_nvt_gpu.h new file mode 100644 index 0000000000..7ccba97040 --- /dev/null +++ b/src/GPU/fix_nvt_gpu.h @@ -0,0 +1,52 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nvt/gpu,FixNVTGPU) + +#else + +#ifndef LMP_FIX_NVT_GPU_H +#define LMP_FIX_NVT_GPU_H + +#include "fix_nh_gpu.h" + +namespace LAMMPS_NS { + +class FixNVTGPU : public FixNHGPU { + public: + FixNVTGPU(class LAMMPS *, int, char **); + ~FixNVTGPU() {} +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Temperature control must be used with fix nvt + +Self-explanatory. + +E: Pressure control can not be used with fix nvt + +Self-explanatory. 
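+
+A minimal usage sketch for the nvt/gpu style registered above, assuming the
+GPU package is built and installed and that the argument list of fix nvt is
+unchanged by this derived class:
+
+  fix 1 all nvt/gpu temp 300.0 300.0 100.0
+
+Temperature control settings are given exactly as for fix nvt; pressure
+control keywords are rejected, as noted below.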
+ +*/ diff --git a/src/GPU/gpu_extra.h b/src/GPU/gpu_extra.h index 115e1f0574..1a957c9aef 100644 --- a/src/GPU/gpu_extra.h +++ b/src/GPU/gpu_extra.h @@ -21,6 +21,29 @@ #include "modify.h" #include "error.h" +// ---------------------- OPENMP PREPROCESSOR STUFF ------------------ +#if defined(_OPENMP) + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 1 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #if (_OPENMP >= 201307) + #define LAL_USE_OMP_SIMD 1 + #else + #define LAL_USE_OMP_SIMD 0 + #endif + #endif +#else + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 0 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #define LAL_USE_OMP_SIMD 0 + #endif +#endif + namespace GPU_EXTRA { inline void check_flag(int error_flag, LAMMPS_NS::Error *error, @@ -61,6 +84,12 @@ namespace GPU_EXTRA { else if (all_success == -12) error->all(FLERR, "Invalid OpenCL platform ID."); + else if (all_success == -13) + error->all(FLERR, + "Invalid device configuration."); + else if (all_success == -15) + error->all(FLERR, + "P3M built for FP64 and GPU device is FP32 only."); else error->all(FLERR,"Unknown error in GPU library"); } @@ -127,12 +156,22 @@ greater than 4 for NVIDIA GPUs. E: Invalid custom OpenCL parameter string. There are not enough or too many parameters in the custom string for package -GPU. +GPU or the parameters do not meet required restrictions. E: Unknown error in GPU library Self-explanatory. +E: Invalid device configuration. + +The specified GPU or accelerator does not support the specified device +configuration. Check the output of ocl_get_devices or nvd_get_devices to +verify the correct device IDs for the GPU package. + +E: P3M built for FP64 and GPU device is FP32 only + +Either turn off GPU acceleration for PPPM or build LAMMPS with -DFFT_SINGLE + W: Increasing communication cutoff for GPU style The pair style has increased the communication cutoff to be consistent with diff --git a/src/GPU/pair_beck_gpu.cpp b/src/GPU/pair_beck_gpu.cpp index 38cc593076..ff9537a33e 100644 --- a/src/GPU/pair_beck_gpu.cpp +++ b/src/GPU/pair_beck_gpu.cpp @@ -48,9 +48,9 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **host_aa, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void beck_gpu_clear(); -int ** beck_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** beck_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -160,9 +160,10 @@ void PairBeckGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = beck_gpu_init(atom->ntypes+1, cutsq, aa, alpha, beta, AA, BB, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_born_coul_long_cs_gpu.cpp b/src/GPU/pair_born_coul_long_cs_gpu.cpp index b65b662496..db0faab0ab 100644 --- a/src/GPU/pair_born_coul_long_cs_gpu.cpp +++ b/src/GPU/pair_born_coul_long_cs_gpu.cpp @@ -57,15 +57,15 @@ using namespace MathConst; // External functions from cuda library for atom decomposition int bornclcs_gpu_init(const int 
ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, - double **host_c, double **host_d, - double **sigma, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - int &gpu_mode, FILE *screen, double **host_cut_ljsq, - double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); + double **host_born1, double **host_born2, + double **host_born3, double **host_a, + double **host_c, double **host_d, + double **sigma, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); void bornclcs_gpu_clear(); int** bornclcs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -196,10 +196,11 @@ void PairBornCoulLongCSGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = bornclcs_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_born_coul_long_gpu.cpp b/src/GPU/pair_born_coul_long_gpu.cpp index 0a359f66cc..cad174c0de 100644 --- a/src/GPU/pair_born_coul_long_gpu.cpp +++ b/src/GPU/pair_born_coul_long_gpu.cpp @@ -195,10 +195,11 @@ void PairBornCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = borncl_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp index 7aba6e059b..5c8cac0ec2 100644 --- a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp +++ b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp @@ -45,24 +45,26 @@ using namespace MathConst; // External functions from cuda library for atom decomposition int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, double **host_c, - double **host_d, double **sigma, double **offset, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double alf, const double e_shift, const double f_shift); + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **sigma, double **offset, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + 
const double alf, const double e_shift, + const double f_shift); void borncwcs_gpu_clear(); -int ** borncwcs_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); +int ** borncwcs_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd); void borncwcs_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -179,10 +181,11 @@ void PairBornCoulWolfCSGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = borncwcs_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, alf, e_shift, f_shift); diff --git a/src/GPU/pair_born_coul_wolf_gpu.cpp b/src/GPU/pair_born_coul_wolf_gpu.cpp index ee6fcf3cea..73e58b0a1f 100644 --- a/src/GPU/pair_born_coul_wolf_gpu.cpp +++ b/src/GPU/pair_born_coul_wolf_gpu.cpp @@ -51,13 +51,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, - const double alf, const double e_shift, const double f_shift); + const double alf, const double e_shift, + const double f_shift); void borncw_gpu_clear(); int ** borncw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd); @@ -177,10 +179,11 @@ void PairBornCoulWolfGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = borncw_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, alf, e_shift, f_shift); diff --git a/src/GPU/pair_born_gpu.cpp b/src/GPU/pair_born_gpu.cpp index 84ed4cfc04..770dad8346 100644 --- a/src/GPU/pair_born_gpu.cpp +++ b/src/GPU/pair_born_gpu.cpp @@ -48,13 +48,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, const int maxspecial, const double cell_size, int &gpu_mode, FILE 
*screen); void born_gpu_reinit(const int ntypes, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **offset); + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **offset); void born_gpu_clear(); -int ** born_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** born_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -163,10 +163,11 @@ void PairBornGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = born_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_buck_coul_cut_gpu.cpp b/src/GPU/pair_buck_coul_cut_gpu.cpp index 036bc0d7a8..2c9e71bc83 100644 --- a/src/GPU/pair_buck_coul_cut_gpu.cpp +++ b/src/GPU/pair_buck_coul_cut_gpu.cpp @@ -167,9 +167,10 @@ void PairBuckCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = buckc_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_buck_coul_long_gpu.cpp b/src/GPU/pair_buck_coul_long_gpu.cpp index 3916e5634e..3d48862c6a 100644 --- a/src/GPU/pair_buck_coul_long_gpu.cpp +++ b/src/GPU/pair_buck_coul_long_gpu.cpp @@ -191,9 +191,10 @@ void PairBuckCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = buckcl_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_buck_gpu.cpp b/src/GPU/pair_buck_gpu.cpp index 54c579bf72..d17f9d2072 100644 --- a/src/GPU/pair_buck_gpu.cpp +++ b/src/GPU/pair_buck_gpu.cpp @@ -47,8 +47,8 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, double **offset); + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset); void buck_gpu_clear(); int ** buck_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -161,9 +161,10 @@ void 
PairBuckGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = buck_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_colloid_gpu.cpp b/src/GPU/pair_colloid_gpu.cpp index 2e35486993..8b7870575a 100644 --- a/src/GPU/pair_colloid_gpu.cpp +++ b/src/GPU/pair_colloid_gpu.cpp @@ -44,18 +44,18 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, double **host_a12, double **host_a1, double **host_a2, double **host_d1, - double **host_d2, double **host_sigma3, double **host_sigma6, - int **host_form, const int nlocal, + double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void colloid_gpu_clear(); -int ** colloid_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** colloid_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success); void colloid_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -171,10 +171,11 @@ void PairColloidGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = colloid_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, a12, a1, a2, d1, d2, sigma3, sigma6, _form, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); memory->destroy(_form); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_coul_cut_gpu.cpp b/src/GPU/pair_coul_cut_gpu.cpp index 1e45aebf7b..9098f86737 100644 --- a/src/GPU/pair_coul_cut_gpu.cpp +++ b/src/GPU/pair_coul_cut_gpu.cpp @@ -47,21 +47,21 @@ int coul_gpu_init(const int ntypes, double **host_scale, double **cutsq, const double qqrd2e); void coul_gpu_reinit(const int ntypes, double **host_scale); void coul_gpu_clear(); -int ** coul_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); +int ** coul_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint 
**special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); void coul_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, - double *boxlo, double *prd); + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd); double coul_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -166,9 +166,10 @@ void PairCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = coul_gpu_init(atom->ntypes+1, scale, cutsq, force->special_coul, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_coul_debye_gpu.cpp b/src/GPU/pair_coul_debye_gpu.cpp index f23b5acde3..1db2995810 100644 --- a/src/GPU/pair_coul_debye_gpu.cpp +++ b/src/GPU/pair_coul_debye_gpu.cpp @@ -48,20 +48,20 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, void cdebye_gpu_reinit(const int ntypes, double **host_scale); void cdebye_gpu_clear(); int ** cdebye_gpu_compute_n(const int ago, const int inum, const int nall, - double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd); void cdebye_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, - double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, + int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success, + double *host_q, const int nlocal, double *boxlo, + double *prd); double cdebye_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -167,9 +167,10 @@ void PairCoulDebyeGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = cdebye_gpu_init(atom->ntypes+1, scale, cutsq, force->special_coul, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, force->qqrd2e, 
kappa); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_coul_dsf_gpu.cpp b/src/GPU/pair_coul_dsf_gpu.cpp index 0bcffb5d2c..830ad057e6 100644 --- a/src/GPU/pair_coul_dsf_gpu.cpp +++ b/src/GPU/pair_coul_dsf_gpu.cpp @@ -57,9 +57,9 @@ int cdsf_gpu_init(const int ntypes, const int nlocal, const int nall, const double e_shift, const double f_shift, const double alpha); void cdsf_gpu_clear(); -int ** cdsf_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** cdsf_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -184,8 +184,9 @@ void PairCoulDSFGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = cdsf_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_coulsq, force->special_coul, force->qqrd2e, e_shift, f_shift, alpha); diff --git a/src/GPU/pair_coul_long_cs_gpu.cpp b/src/GPU/pair_coul_long_cs_gpu.cpp index ef404d7a13..5b1fcd9c8f 100644 --- a/src/GPU/pair_coul_long_cs_gpu.cpp +++ b/src/GPU/pair_coul_long_cs_gpu.cpp @@ -54,27 +54,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int clcs_gpu_init(const int ntypes, double **scale, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); +int clcs_gpu_init(const int ntypes, double **scale, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); void clcs_gpu_reinit(const int ntypes, double **scale); void clcs_gpu_clear(); int ** clcs_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double *host_q, - double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); void clcs_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool 
&success, double *host_q, + const int nlocal, double *boxlo, double *prd); double clcs_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -186,8 +186,9 @@ void PairCoulLongCSGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = clcs_gpu_init(atom->ntypes+1, scale, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_coul_long_gpu.cpp b/src/GPU/pair_coul_long_gpu.cpp index 1118a012d0..af6a66fa34 100644 --- a/src/GPU/pair_coul_long_gpu.cpp +++ b/src/GPU/pair_coul_long_gpu.cpp @@ -181,8 +181,9 @@ void PairCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = cl_gpu_init(atom->ntypes+1, scale, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_dpd_gpu.cpp b/src/GPU/pair_dpd_gpu.cpp index 59c0fa031f..d77d83e953 100644 --- a/src/GPU/pair_dpd_gpu.cpp +++ b/src/GPU/pair_dpd_gpu.cpp @@ -52,8 +52,8 @@ int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd); void dpd_gpu_compute(const int ago, const int inum_full, const int nall, @@ -308,9 +308,10 @@ void PairDPDGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dpd_gpu_init(atom->ntypes+1, cutsq, a0, gamma, sigma, cut, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_dpd_tstat_gpu.cpp b/src/GPU/pair_dpd_tstat_gpu.cpp index 8bf98cc8ed..a5ae3e3001 100644 --- a/src/GPU/pair_dpd_tstat_gpu.cpp +++ b/src/GPU/pair_dpd_tstat_gpu.cpp @@ -47,12 +47,13 @@ int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void dpd_tstat_gpu_clear(); -int ** dpd_tstat_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, bool &success, +int ** dpd_tstat_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, const double dtinvsqrt, const 
int seed, const int timestep, double *boxlo, double *prd); @@ -64,8 +65,9 @@ void dpd_tstat_gpu_compute(const int ago, const int inum_full, const int nall, double **host_v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd); -void dpd_tstat_gpu_update_coeff(int ntypes, double **host_a0, double **host_gamma, - double **host_sigma, double **host_cut); +void dpd_tstat_gpu_update_coeff(int ntypes, double **host_a0, + double **host_gamma, double **host_sigma, + double **host_cut); double dpd_tstat_gpu_bytes(); #define EPSILON 1.0e-10 @@ -325,10 +327,11 @@ void PairDPDTstatGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dpd_tstat_gpu_init(atom->ntypes+1, cutsq, a0, gamma, sigma, - cut, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); + cut, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { diff --git a/src/GPU/pair_eam_alloy_gpu.cpp b/src/GPU/pair_eam_alloy_gpu.cpp index c1370af307..4678a6f669 100644 --- a/src/GPU/pair_eam_alloy_gpu.cpp +++ b/src/GPU/pair_eam_alloy_gpu.cpp @@ -39,21 +39,22 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, - int **host_type2rhor, int **host_type2z2r, - int *host_type2frho, double ***host_rhor_spline, - double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, - int nrhor, int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, int &fp_size); + int **host_type2rhor, int **host_type2z2r, + int *host_type2frho, double ***host_rhor_spline, + double ***host_z2r_spline, double ***host_frho_spline, + double rdr, double rdrho, double rhomax, + int nrhor, int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, int &fp_size); void eam_alloy_gpu_clear(); -int** eam_alloy_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, - int **jnum, const double cpu_time, bool &success, +int** eam_alloy_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, int &inum, void **fp_ptr); void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall,double **host_x, int *host_type, @@ -183,10 +184,11 @@ void PairEAMAlloyGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; + int mnf = 5e-2 * neighbor->oneatom; int success = eam_alloy_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, - atom->nlocal, 
atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); @@ -195,7 +197,6 @@ void PairEAMAlloyGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - if (fp_size == sizeof(double)) fp_single = false; else diff --git a/src/GPU/pair_eam_fs_gpu.cpp b/src/GPU/pair_eam_fs_gpu.cpp index ce3ea8bb0b..390bb93987 100644 --- a/src/GPU/pair_eam_fs_gpu.cpp +++ b/src/GPU/pair_eam_fs_gpu.cpp @@ -50,19 +50,19 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, void eam_fs_gpu_clear(); int** eam_fs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, - int **jnum, const double cpu_time, bool &success, - int &inum, void **fp_ptr); + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, int &inum, void **fp_ptr); void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall,double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, void **fp_ptr); + const int nall,double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, void **fp_ptr); void eam_fs_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, - const bool eatom, const bool vatom); + const bool eatom, const bool vatom); double eam_fs_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -183,10 +183,11 @@ void PairEAMFSGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; + int mnf = 5e-2 * neighbor->oneatom; int success = eam_fs_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); @@ -195,7 +196,6 @@ void PairEAMFSGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - if (fp_size == sizeof(double)) fp_single = false; else diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp index abd721a327..e458ea2020 100644 --- a/src/GPU/pair_eam_gpu.cpp +++ b/src/GPU/pair_eam_gpu.cpp @@ -50,11 +50,11 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, void eam_gpu_clear(); int** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, - int **jnum, const double cpu_time, bool &success, - int &inum, void **fp_ptr); + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool 
vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, int &inum, void **fp_ptr); void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall,double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, @@ -185,10 +185,11 @@ void PairEAMGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; + int mnf = 5e-2 * neighbor->oneatom; int success = eam_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); @@ -197,7 +198,6 @@ void PairEAMGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - if (fp_size == sizeof(double)) fp_single = false; else diff --git a/src/GPU/pair_gauss_gpu.cpp b/src/GPU/pair_gauss_gpu.cpp index 89b79f11f2..fe9dd9ba96 100644 --- a/src/GPU/pair_gauss_gpu.cpp +++ b/src/GPU/pair_gauss_gpu.cpp @@ -41,15 +41,16 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **b, double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + double **b, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen); void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a, - double **b, double **offset); + double **b, double **offset); void gauss_gpu_clear(); -int ** gauss_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** gauss_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -158,9 +159,10 @@ void PairGaussGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = gauss_gpu_init(atom->ntypes+1, cutsq, a, b, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp index 19a4c77032..81966824ba 100644 --- a/src/GPU/pair_gayberne_gpu.cpp +++ b/src/GPU/pair_gayberne_gpu.cpp @@ -49,12 +49,12 @@ int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + const double cell_size, int &gpu_mode, FILE *screen); void gb_gpu_clear(); int ** gb_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint 
**special, - const bool eflag, const bool vflag, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat); @@ -207,10 +207,11 @@ void PairGayBerneGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, shape2, well, cutsq, sigma, epsilon, lshape, form, lj1, lj2, lj3, lj4, offset, force->special_lj, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj96_cut_gpu.cpp b/src/GPU/pair_lj96_cut_gpu.cpp index e15a78fb91..84d1a1a10d 100644 --- a/src/GPU/pair_lj96_cut_gpu.cpp +++ b/src/GPU/pair_lj96_cut_gpu.cpp @@ -160,9 +160,10 @@ void PairLJ96CutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp new file mode 100644 index 0000000000..4f8679a8a8 --- /dev/null +++ b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp @@ -0,0 +1,309 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Brown (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_charmm_coul_charmm_gpu.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "memory.h"
+#include "error.h"
+#include "neigh_request.h"
+#include "universe.h"
+#include "domain.h"
+#include "gpu_extra.h"
+
+using namespace LAMMPS_NS;
+
+// External functions from cuda library for atom decomposition
+
+int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double *special_lj, const int nlocal,
+                 const int nall, const int max_nbors, const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen,
+                 double host_cut_ljsq, double host_cut_coulsq,
+                 double *host_special_coul, const double qqrd2e,
+                 const double cut_lj_innersq, const double cut_coul_innersq,
+                 const double denom_lj, const double denom_coul,
+                 double **epsilon, double **sigma,
+                 const bool mix_arithmetic);
+void crm_gpu_clear();
+int ** crm_gpu_compute_n(const int ago, const int inum,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, tagint *tag,
+                         int **nspecial, tagint **special, const bool eflag,
+                         const bool vflag, const bool eatom,
+                         const bool vatom, int &host_start, int **ilist,
+                         int **jnum, const double cpu_time, bool &success,
+                         double *host_q, double *boxlo, double *prd);
+void crm_gpu_compute(const int ago, const int inum, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success, double *host_q,
+                     const int nlocal, double *boxlo, double *prd);
+double crm_gpu_bytes();
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmGPU::PairLJCharmmCoulCharmmGPU(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmm(lmp), gpu_mode(GPU_FORCE)
+{
+  reinitflag = 0;
+  cpu_time = 0.0;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmGPU::~PairLJCharmmCoulCharmmGPU()
+{
+  crm_gpu_clear();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmGPU::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+  if (gpu_mode != GPU_FORCE) {
+    inum = atom->nlocal;
+    firstneigh = crm_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
+                                   atom->type, domain->sublo, domain->subhi,
+                                   atom->tag, atom->nspecial, atom->special,
+                                   eflag, vflag, eflag_atom, vflag_atom,
+                                   host_start, &ilist, &numneigh, cpu_time,
+                                   success, atom->q, domain->boxlo,
+                                   domain->prd);
+  } else {
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+    crm_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+                    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+                    vflag_atom, host_start, cpu_time, success, atom->q,
+                    atom->nlocal, domain->boxlo, domain->prd);
+  }
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+
+  if (host_start<inum) {
+    cpu_time = MPI_Wtime();
+    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
+    cpu_time = MPI_Wtime() - cpu_time;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmGPU::init_style()
+{
+  if (!atom->q_flag)
+    error->all(FLERR,
+               "Pair style lj/charmm/coul/long/gpu requires atom attribute q");
+  if (force->newton_pair)
+    error->all(FLERR,
+               "Cannot use newton pair with lj/charmm/coul/long/gpu pair style");
+
+  // Repeat cutsq calculation because done after call to init_style
+
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
+        cut = init_one(i,j);
+    }
+  }
+
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+  denom_lj = 1.0 / denom_lj;
+
+  denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) *
+    (cut_coulsq-cut_coul_innersq);
+  denom_coul = 1.0 / denom_coul;
+
+  double cell_size = sqrt(cut_bothsq) + neighbor->skin;
+
+  int maxspecial=0;
+  if (atom->molecular)
+    maxspecial=atom->maxspecial;
+
+  bool arithmetic = true;
+  for (int i = 1; i < atom->ntypes + 1; i++)
+    for (int j = i + 1; j < atom->ntypes + 1; j++) {
+      if (epsilon[i][j] != sqrt(epsilon[i][i] * epsilon[j][j]))
+        arithmetic = false;
+      if (sigma[i][j] != 0.5 * (sigma[i][i] + sigma[j][j]))
+        arithmetic = false;
+    }
+
+  int mnf = 5e-2 * neighbor->oneatom;
+  int success = crm_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4,
+                             force->special_lj, atom->nlocal,
+                             atom->nlocal+atom->nghost, mnf, maxspecial,
+                             cell_size, gpu_mode, screen, cut_ljsq,
+                             cut_coulsq, force->special_coul, force->qqrd2e,
+                             cut_lj_innersq,cut_coul_innersq,denom_lj,
+                             denom_coul,epsilon,sigma,arithmetic);
+  GPU_EXTRA::check_flag(success,error,world);
+
+  if (gpu_mode == GPU_FORCE) {
+    int irequest = neighbor->request(this,instance_me);
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full = 1;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCharmmCoulCharmmGPU::memory_usage()
+{
+  double bytes = Pair::memory_usage();
+  return bytes + crm_gpu_bytes();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmGPU::cpu_compute(int start, int inum, int eflag,
+                                            int vflag, int *ilist,
+                                            int *numneigh, int **firstneigh)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double philj,switch1,switch2;
+  int *jlist;
+
+  evdwl = ecoul = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  // loop over neighbors of my atoms
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = 
delx*delx + dely*dely + delz*delz; + + if (rsq < cut_bothsq) { + r2inv = 1.0/rsq; + + if (rsq < cut_coulsq) { + forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv); + if (rsq > cut_coul_innersq) { + switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) * denom_coul; + forcecoul *= switch1; + } + } else forcecoul = 0.0; + + if (rsq < cut_ljsq) { + r6inv = r2inv*r2inv*r2inv; + jtype = type[j]; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * + (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * denom_lj; + switch2 = 12.0*rsq * (cut_ljsq-rsq) * + (rsq-cut_lj_innersq) * denom_lj; + philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]); + forcelj = forcelj*switch1 + philj*switch2; + } + } else forcelj = 0.0; + + fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + ecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv); + if (rsq > cut_coul_innersq) { + switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) * + denom_coul; + ecoul *= switch1; + } + ecoul *= factor_coul; + } else ecoul = 0.0; + + if (rsq < cut_ljsq) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * + (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * denom_lj; + evdwl *= switch1; + } + evdwl *= factor_lj; + } else evdwl = 0.0; + } + + if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_lj_charmm_coul_charmm_gpu.h b/src/GPU/pair_lj_charmm_coul_charmm_gpu.h new file mode 100644 index 0000000000..d80730ca5c --- /dev/null +++ b/src/GPU/pair_lj_charmm_coul_charmm_gpu.h @@ -0,0 +1,62 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/charmm/coul/charmm/gpu,PairLJCharmmCoulCharmmGPU) + +#else + +#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_GPU_H +#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_GPU_H + +#include "pair_lj_charmm_coul_charmm.h" + +namespace LAMMPS_NS { + +class PairLJCharmmCoulCharmmGPU : public PairLJCharmmCoulCharmm { + public: + PairLJCharmmCoulCharmmGPU(LAMMPS *lmp); + ~PairLJCharmmCoulCharmmGPU(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} +#endif +#endif + +/* ERROR/WARNING messages: + +E: Insufficient memory on accelerator + +There is insufficient memory on one of the devices specified for the gpu +package + +E: Pair style lj/charmm/coul/long/gpu requires atom attribute q + +The atom style defined does not have this attribute. + +E: Cannot use newton pair with lj/charmm/coul/long/gpu pair style + +Self-explanatory. 
+ +*/ diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp index b89e4d4574..9753404d5e 100644 --- a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp @@ -203,9 +203,10 @@ void PairLJCharmmCoulLongGPU::init_style() arithmetic = false; } + int mnf = 5e-2 * neighbor->oneatom; int success = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, diff --git a/src/GPU/pair_lj_class2_coul_long_gpu.cpp b/src/GPU/pair_lj_class2_coul_long_gpu.cpp index 50183196f8..3fc6195fa8 100644 --- a/src/GPU/pair_lj_class2_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_class2_coul_long_gpu.cpp @@ -188,9 +188,10 @@ void PairLJClass2CoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = c2cl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_class2_gpu.cpp b/src/GPU/pair_lj_class2_gpu.cpp index 55fdc2d43d..cf8158ce5f 100644 --- a/src/GPU/pair_lj_class2_gpu.cpp +++ b/src/GPU/pair_lj_class2_gpu.cpp @@ -157,9 +157,10 @@ void PairLJClass2GPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cubic_gpu.cpp b/src/GPU/pair_lj_cubic_gpu.cpp index 35062a5d71..a0dd9498c6 100644 --- a/src/GPU/pair_lj_cubic_gpu.cpp +++ b/src/GPU/pair_lj_cubic_gpu.cpp @@ -52,18 +52,18 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, const double cell_size, int &gpu_mode, FILE *screen); void ljcb_gpu_clear(); -int ** ljcb_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** ljcb_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success); void ljcb_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, 
int &host_start, + const double cpu_time, bool &success); double ljcb_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -165,10 +165,11 @@ void PairLJCubicGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcb_gpu_init(atom->ntypes+1, cutsq, cut_inner_sq, cut_inner, sigma, epsilon, lj1, lj2, lj3, lj4, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp index e4823a3ea4..7932a352b3 100644 --- a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp @@ -48,16 +48,16 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); void ljc_gpu_clear(); -int ** ljc_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljc_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd); void ljc_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, @@ -168,9 +168,10 @@ void PairLJCutCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp index 1f7ae9af01..eb8e2c9c7f 100644 --- a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp @@ -41,17 +41,17 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double kappa); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double kappa); void ljcd_gpu_clear(); 
int ** ljcd_gpu_compute_n(const int ago, const int inum, const int nall, - double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -170,9 +170,10 @@ void PairLJCutCoulDebyeGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcd_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, kappa); diff --git a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp index 6c25412ae8..e071245a56 100644 --- a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp @@ -59,9 +59,9 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const double e_shift, const double f_shift, const double alpha); void ljd_gpu_clear(); -int ** ljd_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljd_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -185,9 +185,10 @@ void PairLJCutCoulDSFGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljd_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, e_shift, f_shift, alpha); diff --git a/src/GPU/pair_lj_cut_coul_long_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_gpu.cpp index 50776de795..cff48afd1e 100644 --- a/src/GPU/pair_lj_cut_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_long_gpu.cpp @@ -58,8 +58,8 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double *host_special_coul, const double qqrd2e, const double g_ewald); void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **host_lj_cutsq); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **host_lj_cutsq); void ljcl_gpu_clear(); int ** ljcl_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, @@ -193,9 +193,10 @@ void PairLJCutCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git 
a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp index 33ba418533..d686ea4d88 100644 --- a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp @@ -48,15 +48,17 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const int order, const double qqrd2e); + double *host_special_coul, const int order, + const double qqrd2e); void ljcm_gpu_clear(); -int ** ljcm_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljcm_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd); + bool &success, double *host_q, double *boxlo, + double *prd); void ljcm_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -177,12 +179,13 @@ void PairLJCutCoulMSMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcm_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, force->kspace->get_gcons(), force->kspace->get_dgcons(), offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, - 300, maxspecial, cell_size, gpu_mode, screen, + mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->kspace->order, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp index ae93cd9010..16eef6e8e8 100644 --- a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp @@ -173,9 +173,10 @@ void PairLJCutDipoleCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dpl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp index 8e7d5baddc..b7c29cedb8 100644 --- a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp @@ -52,29 +52,30 @@ using namespace MathConst; // External functions from cuda library for atom decomposition int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, const double g_ewald); + double **host_lj2, double **host_lj3, double **host_lj4, + double 
**offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); void dplj_gpu_clear(); int ** dplj_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double **host_mu, - double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double **host_mu, + double *boxlo, double *prd); void dplj_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, double **host_mu, - const int nlocal, double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, double **host_mu, + const int nlocal, double *boxlo, double *prd); double dplj_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -196,9 +197,10 @@ void PairLJCutDipoleLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dplj_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp index 2b2773b920..edd2a7feb0 100644 --- a/src/GPU/pair_lj_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_gpu.cpp @@ -47,13 +47,13 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const double cell_size, int &gpu_mode, FILE *screen); void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset); void ljl_gpu_clear(); -int ** ljl_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljl_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -164,9 +164,10 @@ void PairLJCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 
5e-2 * neighbor->oneatom; int success = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp index 3e852513b2..9584c6f68a 100644 --- a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp @@ -229,10 +229,11 @@ void PairLJCutTIP4PLongGPU::init_style() error->warning(FLERR,"Increasing communication cutoff for TIP4P GPU style"); } + int mnf = 5e-2 * neighbor->oneatom; int success = ljtip4p_long_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, typeH, typeO, alpha, qdist, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, cut_coulsqplus, force->special_coul, force->qqrd2e, diff --git a/src/GPU/pair_lj_expand_coul_long_gpu.cpp b/src/GPU/pair_lj_expand_coul_long_gpu.cpp index 533f9d9070..da0c720c74 100644 --- a/src/GPU/pair_lj_expand_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_expand_coul_long_gpu.cpp @@ -50,31 +50,31 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **shift, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); int ljecl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **shift, double **host_lj_cutsq); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift, double **host_lj_cutsq); void ljecl_gpu_clear(); int ** ljecl_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double *host_q, - double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); void ljecl_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double 
*host_q, - const int nlocal, double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double ljecl_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -193,9 +193,10 @@ void PairLJExpandCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljecl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, shift, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_expand_gpu.cpp b/src/GPU/pair_lj_expand_gpu.cpp index d3745dce56..0e86e41255 100644 --- a/src/GPU/pair_lj_expand_gpu.cpp +++ b/src/GPU/pair_lj_expand_gpu.cpp @@ -47,8 +47,8 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **shift); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift); void lje_gpu_clear(); int ** lje_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, @@ -161,9 +161,10 @@ void PairLJExpandGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = lje_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, shift, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_gromacs_gpu.cpp b/src/GPU/pair_lj_gromacs_gpu.cpp index 1bffbcd0b9..a605ebd6c4 100644 --- a/src/GPU/pair_lj_gromacs_gpu.cpp +++ b/src/GPU/pair_lj_gromacs_gpu.cpp @@ -43,16 +43,17 @@ using namespace LAMMPS_NS; int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, - double **cut_inner, double **cut_innersq); + double **host_ljsw1, double **host_ljsw2, + double **host_ljsw3, double **host_ljsw4, + double **host_ljsw5, double **cut_inner, + double **cut_innersq); void ljgrm_gpu_clear(); -int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, 
int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -164,9 +165,10 @@ void PairLJGromacsGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljgrm_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, ljsw1, ljsw2, ljsw3, ljsw4, ljsw5, cut_inner, cut_inner_sq); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp index a3ba87c82e..df2310e904 100644 --- a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp @@ -197,9 +197,10 @@ void PairLJSDKCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = sdkl_gpu_init(atom->ntypes+1, cutsq, lj_type, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_lj_sdk_gpu.cpp b/src/GPU/pair_lj_sdk_gpu.cpp index baf341c25a..5a1960e4c8 100644 --- a/src/GPU/pair_lj_sdk_gpu.cpp +++ b/src/GPU/pair_lj_sdk_gpu.cpp @@ -166,9 +166,10 @@ void PairLJSDKGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = sdk_gpu_init(atom->ntypes+1,cutsq,lj_type,lj1,lj2,lj3,lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp index 6f0ebc58b7..470c2f049e 100644 --- a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp +++ b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp @@ -48,21 +48,21 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); void dplsf_gpu_clear(); -int ** dplsf_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** dplsf_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd); -void dplsf_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, double **host_mu, const int nlocal, - double *boxlo, double *prd); +void dplsf_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + double **host_mu, const int nlocal, double 
*boxlo, + double *prd); double dplsf_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -172,9 +172,10 @@ void PairLJSFDipoleSFGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dplsf_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_mie_cut_gpu.cpp b/src/GPU/pair_mie_cut_gpu.cpp index e9e6eedde8..05e92909da 100644 --- a/src/GPU/pair_mie_cut_gpu.cpp +++ b/src/GPU/pair_mie_cut_gpu.cpp @@ -47,9 +47,9 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void mie_gpu_clear(); -int ** mie_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** mie_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -161,9 +161,10 @@ void PairMIECutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = mie_gpu_init(atom->ntypes+1, cutsq, mie1, mie2, mie3, mie4, gamA, gamR, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_morse_gpu.cpp b/src/GPU/pair_morse_gpu.cpp index 75ca5627ba..d929c76930 100644 --- a/src/GPU/pair_morse_gpu.cpp +++ b/src/GPU/pair_morse_gpu.cpp @@ -46,9 +46,9 @@ int mor_gpu_init(const int ntypes, double **cutsq, double **host_morse1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void mor_gpu_clear(); -int ** mor_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** mor_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -157,9 +157,10 @@ void PairMorseGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = mor_gpu_init(atom->ntypes+1, cutsq, morse1, r0, alpha, d0, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_resquared_gpu.cpp b/src/GPU/pair_resquared_gpu.cpp index b6c212da6f..c816ad9166 100644 --- a/src/GPU/pair_resquared_gpu.cpp +++ b/src/GPU/pair_resquared_gpu.cpp @@ -44,16 +44,16 @@ using namespace LAMMPS_NS; int re_gpu_init(const int ntypes, double **shape, double **well, 
double **cutsq, double **sigma, double **epsilon, - int **form, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void re_gpu_clear(); int ** re_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat); @@ -205,10 +205,11 @@ void PairRESquaredGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = re_gpu_init(atom->ntypes+1, shape1, well, cutsq, sigma, epsilon, form, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_soft_gpu.cpp b/src/GPU/pair_soft_gpu.cpp index c9eb55157a..5a3ad0c577 100644 --- a/src/GPU/pair_soft_gpu.cpp +++ b/src/GPU/pair_soft_gpu.cpp @@ -48,13 +48,13 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **prefactor, void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor, double **host_cut); void soft_gpu_clear(); -int ** soft_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** soft_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); void soft_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -162,9 +162,10 @@ void PairSoftGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = soft_gpu_init(atom->ntypes+1, cutsq, prefactor, cut, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_sw_gpu.cpp b/src/GPU/pair_sw_gpu.cpp index 3d851121e0..7bfbe2810f 100644 --- a/src/GPU/pair_sw_gpu.cpp +++ b/src/GPU/pair_sw_gpu.cpp @@ -38,31 +38,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int sw_gpu_init(const int ntypes, const int inum, const int 
nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* sw_epsilon, const double* sw_sigma, - const double* sw_lambda, const double* sw_gamma, - const double* sw_costheta, const double* sw_biga, - const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, - const double* sw_cutsq); +int sw_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const double cell_size, int &gpu_mode, + FILE *screen, double **ncutsq, double **ncut, double **sigma, + double **powerp, double **powerq, double **sigma_gamma, + double **c1, double **c2, double **c3,double **c4, + double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param); void sw_gpu_clear(); -int ** sw_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** sw_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success); -void sw_gpu_compute(const int ago, const int nloc, const int nall, const int ln, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); +void sw_gpu_compute(const int ago, const int nloc, const int nall, + const int ln, double **host_x, int *host_type, int *ilist, + int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success); double sw_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); #define MAXLINE 1024 #define DELTA 4 @@ -159,55 +155,84 @@ void PairSWGPU::init_style() if (force->newton_pair != 0) error->all(FLERR,"Pair style sw/gpu requires newton pair off"); - double *epsilon, *sigma, *lambda, *gamma; - double *biga, *bigb, *powerp, *powerq; - double *_cut, *_cutsq, *costheta; - epsilon = sigma = lambda = gamma = nullptr; - biga = bigb = powerp = powerq = nullptr; - _cut = _cutsq = costheta = nullptr; + double **c1, **c2, **c3, **c4, **c5, **c6; + double **ncutsq, **ncut, **sigma, **powerp, **powerq, **sigma_gamma; + double ***lambda_epsilon, ***costheta; + c1 = c2 = c3 = c4 = c5 = c6 = nullptr; + ncutsq = ncut = sigma = powerp = powerq = sigma_gamma = nullptr; + lambda_epsilon = costheta = nullptr; - memory->create(epsilon,nparams,"pair:epsilon"); - memory->create(sigma,nparams,"pair:sigma"); - memory->create(lambda,nparams,"pair:lambda"); - memory->create(gamma,nparams,"pair:gamma"); - memory->create(biga,nparams,"pair:biga"); - memory->create(bigb,nparams,"pair:bigb"); - memory->create(powerp,nparams,"pair:powerp"); - memory->create(powerq,nparams,"pair:powerq"); - memory->create(_cut,nparams,"pair:_cut"); - memory->create(_cutsq,nparams,"pair:_cutsq"); - memory->create(costheta,nparams,"pair:costheta"); + const int tp1 = atom->ntypes + 1; - for (int i = 0; i < nparams; i++) { - epsilon[i] = params[i].epsilon; - sigma[i] = params[i].sigma; - lambda[i] = params[i].lambda; - gamma[i] = 
params[i].gamma; - biga[i] = params[i].biga; - bigb[i] = params[i].bigb; - powerp[i] = params[i].powerp; - powerq[i] = params[i].powerq; - _cut[i] = params[i].cut; - _cutsq[i] = params[i].cutsq; - costheta[i] = params[i].costheta; + memory->create(ncutsq, tp1, tp1, "pair:ncutsq"); + memory->create(ncut, tp1, tp1, "pair:ncut"); + memory->create(sigma, tp1, tp1, "pair:sigma"); + memory->create(powerp, tp1, tp1, "pair:powerp"); + memory->create(powerq, tp1, tp1, "pair:powerq"); + memory->create(sigma_gamma, tp1, tp1, "pair:sigma_gamma"); + memory->create(c1, tp1, tp1, "pair:c1"); + memory->create(c2, tp1, tp1, "pair:c2"); + memory->create(c3, tp1, tp1, "pair:c3"); + memory->create(c4, tp1, tp1, "pair:c4"); + memory->create(c5, tp1, tp1, "pair:c5"); + memory->create(c6, tp1, tp1, "pair:c6"); + memory->create(lambda_epsilon, tp1, tp1, tp1, "pair:lambda_epsilon"); + memory->create(costheta, tp1, tp1, tp1, "pair:costheta"); + + for (int ii = 1; ii < tp1; ii++) { + int i = map[ii]; + for (int jj = 1; jj < tp1; jj++) { + int j = map[jj]; + if (i < 0 || j < 0) + continue; + else { + int ijparam = elem2param[i][j][j]; + ncutsq[ii][jj] = params[ijparam].cutsq; + ncut[ii][jj] = params[ijparam].cut; + sigma[ii][jj] = params[ijparam].sigma; + powerp[ii][jj] = params[ijparam].powerp; + powerq[ii][jj] = params[ijparam].powerq; + sigma_gamma[ii][jj] = params[ijparam].sigma_gamma; + c1[ii][jj] = params[ijparam].c1; + c2[ii][jj] = params[ijparam].c2; + c3[ii][jj] = params[ijparam].c3; + c4[ii][jj] = params[ijparam].c4; + c5[ii][jj] = params[ijparam].c5; + c6[ii][jj] = params[ijparam].c6; + } + + for (int kk = 1; kk < tp1; kk++) { + int k = map[kk]; + if (k < 0) + continue; + else { + int ijkparam = elem2param[i][j][k]; + costheta[ii][jj][kk] = params[ijkparam].costheta; + lambda_epsilon[ii][jj][kk] = params[ijkparam].lambda_epsilon; + } + } + } } - int success = sw_gpu_init(atom->ntypes+1, atom->nlocal, atom->nlocal+atom->nghost, 300, - cell_size, gpu_mode, screen, map, nelements, - elem2param, nparams, epsilon, - sigma, lambda, gamma, costheta, biga, bigb, - powerp, powerq, _cut, _cutsq); + int mnf = 5e-2 * neighbor->oneatom; + int success = sw_gpu_init(tp1, atom->nlocal, atom->nlocal+atom->nghost, mnf, + cell_size, gpu_mode, screen, ncutsq, ncut, sigma, + powerp, powerq, sigma_gamma, c1, c2, c3, c4, c5, + c6, lambda_epsilon, costheta, map, elem2param); - memory->destroy(epsilon); + memory->destroy(ncutsq); + memory->destroy(ncut); memory->destroy(sigma); - memory->destroy(lambda); - memory->destroy(gamma); - memory->destroy(biga); - memory->destroy(bigb); memory->destroy(powerp); memory->destroy(powerq); - memory->destroy(_cut); - memory->destroy(_cutsq); + memory->destroy(sigma_gamma); + memory->destroy(c1); + memory->destroy(c2); + memory->destroy(c3); + memory->destroy(c4); + memory->destroy(c5); + memory->destroy(c6); + memory->destroy(lambda_epsilon); memory->destroy(costheta); GPU_EXTRA::check_flag(success,error,world); @@ -218,7 +243,6 @@ void PairSWGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser=2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_table_gpu.cpp b/src/GPU/pair_table_gpu.cpp index e3cb740e0e..05b76d9adb 100644 --- a/src/GPU/pair_table_gpu.cpp +++ b/src/GPU/pair_table_gpu.cpp @@ -231,9 +231,10 @@ void PairTableGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int 
success = table_gpu_init(atom->ntypes+1, cutsq, table_coeffs, table_data, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, tabstyle, ntables, tablength); GPU_EXTRA::check_flag(success,error,world); @@ -243,7 +244,6 @@ void PairTableGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - memory->destroy(table_coeffs); memory->destroy(table_data); } diff --git a/src/GPU/pair_tersoff_gpu.cpp b/src/GPU/pair_tersoff_gpu.cpp index 8758150956..e675ba6903 100644 --- a/src/GPU/pair_tersoff_gpu.cpp +++ b/src/GPU/pair_tersoff_gpu.cpp @@ -66,8 +66,6 @@ void tersoff_gpu_compute(const int ago, const int nlocal, const int nall, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double tersoff_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); #define MAXLINE 1024 #define DELTA 4 @@ -216,8 +214,9 @@ void PairTersoffGPU::init_style() _cutsq[i] = params[i].cutsq; } + int mnf = 5e-2 * neighbor->oneatom; int success = tersoff_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, + atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, lam1, lam2, lam3, powermint, biga, bigb, bigr, bigd, @@ -252,7 +251,6 @@ void PairTersoffGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser = 2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_tersoff_mod_gpu.cpp b/src/GPU/pair_tersoff_mod_gpu.cpp index 71734c1c09..98a7248c1f 100644 --- a/src/GPU/pair_tersoff_mod_gpu.cpp +++ b/src/GPU/pair_tersoff_mod_gpu.cpp @@ -43,9 +43,10 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* ts_lam1, const double* ts_lam2, const double* ts_lam3, const double* ts_powermint, const double* ts_biga, const double* ts_bigb, - const double* ts_bigr, const double* ts_bigd, const double* ts_c1, const double* ts_c2, - const double* ts_c3, const double* ts_c4, const double* ts_c5, const double* ts_h, - const double* ts_beta, const double* ts_powern, const double* ts_powern_del, + const double* ts_bigr, const double* ts_bigd, const double* ts_c1, + const double* ts_c2, const double* ts_c3, const double* ts_c4, + const double* ts_c5, const double* ts_h, const double* ts_beta, + const double* ts_powern, const double* ts_powern_del, const double* ts_ca1, const double* ts_cutsq); void tersoff_mod_gpu_clear(); int ** tersoff_mod_gpu_compute_n(const int ago, const int inum_full, @@ -61,8 +62,6 @@ void tersoff_mod_gpu_compute(const int ago, const int nlocal, const int nall, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double tersoff_mod_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); /* ---------------------------------------------------------------------- */ @@ -208,8 +207,9 @@ void PairTersoffMODGPU::init_style() _cutsq[i] = params[i].cutsq; } + int mnf = 5e-2 * neighbor->oneatom; int success = tersoff_mod_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, + 
atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, lam1, lam2, lam3, powermint, biga, bigb, bigr, bigd, @@ -244,7 +244,6 @@ void PairTersoffMODGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser = 2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_tersoff_zbl_gpu.cpp b/src/GPU/pair_tersoff_zbl_gpu.cpp index e662159fa8..e17b48fec5 100644 --- a/src/GPU/pair_tersoff_zbl_gpu.cpp +++ b/src/GPU/pair_tersoff_zbl_gpu.cpp @@ -69,8 +69,6 @@ void tersoff_zbl_gpu_compute(const int ago, const int nlocal, const int nall, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double tersoff_zbl_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); /* ---------------------------------------------------------------------- */ @@ -225,8 +223,9 @@ void PairTersoffZBLGPU::init_style() _cutsq[i] = params[i].cutsq; } + int mnf = 5e-2 * neighbor->oneatom; int success = tersoff_zbl_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, + atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, lam1, lam2, lam3, powermint, biga, bigb, bigr, bigd, @@ -266,7 +265,6 @@ void PairTersoffZBLGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser = 2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_ufm_gpu.cpp b/src/GPU/pair_ufm_gpu.cpp index 87354acda9..f950bf11c3 100644 --- a/src/GPU/pair_ufm_gpu.cpp +++ b/src/GPU/pair_ufm_gpu.cpp @@ -43,28 +43,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1, - double **host_uf2, double **host_uf3, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + double **host_uf2, double **host_uf3, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); int ufml_gpu_reinit(const int ntypes, double **cutsq, double **host_uf1, - double **host_uf2, double **host_uf3, - double **offset); + double **host_uf2, double **host_uf3, double **offset); void ufml_gpu_clear(); -int ** ufml_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** ufml_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); void ufml_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const 
bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double ufml_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -166,9 +165,10 @@ void PairUFMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ufml_gpu_init(atom->ntypes+1, cutsq, uf1, uf2, uf3, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_vashishta_gpu.cpp b/src/GPU/pair_vashishta_gpu.cpp index df17b2091a..c5dd722974 100644 --- a/src/GPU/pair_vashishta_gpu.cpp +++ b/src/GPU/pair_vashishta_gpu.cpp @@ -38,34 +38,34 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* cutsq, const double* r0, - const double* gamma, const double* eta, - const double* lam1inv, const double* lam4inv, - const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, - const double* heta, const double* bigh, - const double* bigw, const double* c0, - const double* costheta, const double* bigb, - const double* big2b, const double* bigc); +int vashishta_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const double cell_size, + int &gpu_mode, FILE *screen, int* host_map, + const int nelements, int*** host_elem2param, + const int nparams, const double* cutsq, const double* r0, + const double* gamma, const double* eta, + const double* lam1inv, const double* lam4inv, + const double* zizj, const double* mbigd, + const double* dvrc, const double* big6w, + const double* heta, const double* bigh, + const double* bigw, const double* c0, + const double* costheta, const double* bigb, + const double* big2b, const double* bigc); void vashishta_gpu_clear(); -int ** vashishta_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** vashishta_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success); -void vashishta_gpu_compute(const int ago, const int nloc, const int nall, const int ln, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); +void vashishta_gpu_compute(const int ago, const int nloc, const int nall, + const int ln, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double 
vashishta_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); /* ---------------------------------------------------------------------- */ @@ -214,7 +214,8 @@ void PairVashishtaGPU::init_style() big2b[i] = params[i].big2b; bigc[i] = params[i].bigc; } - int success = vashishta_gpu_init(atom->ntypes+1, atom->nlocal, atom->nlocal+atom->nghost, 500, + int mnf = 5e-2 * neighbor->oneatom; + int success = vashishta_gpu_init(atom->ntypes+1, atom->nlocal, atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, cutsq, r0, gamma, eta, lam1inv, lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, @@ -246,7 +247,6 @@ void PairVashishtaGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser=2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_yukawa_colloid_gpu.cpp b/src/GPU/pair_yukawa_colloid_gpu.cpp index 8da3b48dd5..9322f95f44 100644 --- a/src/GPU/pair_yukawa_colloid_gpu.cpp +++ b/src/GPU/pair_yukawa_colloid_gpu.cpp @@ -41,24 +41,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **host_offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - const double kappa); + double **host_offset, double *special_lj, const int inum, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, const double kappa); void ykcolloid_gpu_clear(); int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_rad); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_rad); void ykcolloid_gpu_compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_rad); + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, + double *host_rad); double ykcolloid_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -167,9 +170,10 @@ void PairYukawaColloidGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ykcolloid_gpu_init(atom->ntypes+1, cutsq, a, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, kappa); 
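// The pattern repeated in these *_gpu init_style() hunks replaces the old
// hard-coded GPU neighbor budget of 300 with "mnf": 5 percent of the per-atom
// neighbor limit (neighbor->oneatom, i.e. the "neigh_modify one" setting).
// A minimal standalone C++ sketch of that heuristic follows; it is only an
// illustration, and the value 2000 is assumed here because it is the
// documented default for "neigh_modify one".
#include <cstdio>

int main() {
  const int oneatom = 2000;        // stands in for neighbor->oneatom
  const int mnf = 5e-2 * oneatom;  // max_nbors budget handed to the *_gpu_init() calls
  std::printf("mnf = %d\n", mnf);  // prints 100 for the default setting
  return 0;
}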
GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_yukawa_gpu.cpp b/src/GPU/pair_yukawa_gpu.cpp index 8c133b068e..81304159a0 100644 --- a/src/GPU/pair_yukawa_gpu.cpp +++ b/src/GPU/pair_yukawa_gpu.cpp @@ -49,10 +49,10 @@ void yukawa_gpu_clear(); int ** yukawa_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success); + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success); void yukawa_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -159,9 +159,10 @@ void PairYukawaGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = yukawa_gpu_init(atom->ntypes+1, cutsq, kappa, a, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_zbl_gpu.cpp b/src/GPU/pair_zbl_gpu.cpp index eda0c26614..93e0588285 100644 --- a/src/GPU/pair_zbl_gpu.cpp +++ b/src/GPU/pair_zbl_gpu.cpp @@ -50,9 +50,9 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void zbl_gpu_clear(); -int ** zbl_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** zbl_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -165,11 +165,12 @@ void PairZBLGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = zbl_gpu_init(atom->ntypes+1, cutsq, sw1, sw2, sw3, sw4, sw5, d1a, d2a, d3a, d4a, zze, cut_globalsq, cut_innersq, cut_inner, atom->nlocal, atom->nlocal+atom->nghost, - 300, maxspecial, cell_size, gpu_mode, screen); + mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index cc7ef8841e..61d0144b73 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -80,9 +80,9 @@ FFT_SCALAR* PPPM_GPU_API(init)(const int nlocal, const int nall, FILE *screen, const bool respa, int &success); void PPPM_GPU_API(clear)(const double poisson_time); int PPPM_GPU_API(spread)(const int ago, const int nlocal, const int nall, - double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, const double delxinv, - const double delyinv, const double delzinv); + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); void PPPM_GPU_API(interp)(const FFT_SCALAR qqrd2e_scale); double PPPM_GPU_API(bytes)(); void 
PPPM_GPU_API(forces)(double **f); @@ -208,9 +208,9 @@ void PPPMGPU::compute(int eflag, int vflag) if (triclinic == 0) { bool success = true; int flag=PPPM_GPU_API(spread)(nago, atom->nlocal, atom->nlocal + - atom->nghost, atom->x, atom->type, success, - atom->q, domain->boxlo, delxinv, delyinv, - delzinv); + atom->nghost, atom->x, atom->type, success, + atom->q, domain->boxlo, delxinv, delyinv, + delzinv); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); if (flag != 0) @@ -402,7 +402,7 @@ void PPPMGPU::poisson_ik() work1[n++] = ZEROF; } - fft1->compute(work1,work1,1); + fft1->compute(work1,work1,FFT3d::FORWARD); // if requested, compute energy and virial contribution @@ -441,7 +441,7 @@ void PPPMGPU::poisson_ik() if (evflag_atom) poisson_peratom(); - // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // compute gradients of V(r) in each of 3 dims by transformimg ik*V(k) // FFT leaves data in 3d brick decomposition // copy it into inner portion of vdx,vdy,vdz arrays @@ -451,12 +451,12 @@ void PPPMGPU::poisson_ik() for (k = nzlo_fft; k <= nzhi_fft; k++) for (j = nylo_fft; j <= nyhi_fft; j++) for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkx[i]*work1[n+1]; - work2[n+1] = -fkx[i]*work1[n]; + work2[n] = -fkx[i]*work1[n+1]; + work2[n+1] = fkx[i]*work1[n]; n += 2; } - fft2->compute(work2,work2,-1); + fft2->compute(work2,work2,FFT3d::BACKWARD); n = 0; int x_hi = nxhi_in * 4 + 3; @@ -473,12 +473,12 @@ void PPPMGPU::poisson_ik() for (k = nzlo_fft; k <= nzhi_fft; k++) for (j = nylo_fft; j <= nyhi_fft; j++) for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fky[j]*work1[n+1]; - work2[n+1] = -fky[j]*work1[n]; + work2[n] = -fky[j]*work1[n+1]; + work2[n+1] = fky[j]*work1[n]; n += 2; } - fft2->compute(work2,work2,-1); + fft2->compute(work2,work2,FFT3d::BACKWARD); n = 0; for (k = nzlo_in; k <= nzhi_in; k++) @@ -494,12 +494,12 @@ void PPPMGPU::poisson_ik() for (k = nzlo_fft; k <= nzhi_fft; k++) for (j = nylo_fft; j <= nyhi_fft; j++) for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkz[k]*work1[n+1]; - work2[n+1] = -fkz[k]*work1[n]; + work2[n] = -fkz[k]*work1[n+1]; + work2[n+1] = fkz[k]*work1[n]; n += 2; } - fft2->compute(work2,work2,-1); + fft2->compute(work2,work2,FFT3d::BACKWARD); n = 0; for (k = nzlo_in; k <= nzhi_in; k++) From d3123dd5c3025c6b173ccc11e5db8ba02d9351da Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 15 Feb 2021 08:37:38 -0800 Subject: [PATCH 029/116] Feb2021 GPU Package Update - Core LAMMPS Files --- src/MAKE/OPTIONS/Makefile.g++_openmpi | 4 +- src/MAKE/OPTIONS/Makefile.g++_serial | 4 +- src/MAKE/OPTIONS/Makefile.oneapi | 122 ++++++++++++++++++++++++++ src/atom.cpp | 33 +++++++ src/lammps.cpp | 4 +- 5 files changed, 161 insertions(+), 6 deletions(-) create mode 100644 src/MAKE/OPTIONS/Makefile.oneapi diff --git a/src/MAKE/OPTIONS/Makefile.g++_openmpi b/src/MAKE/OPTIONS/Makefile.g++_openmpi index 548994f832..75c12f9b38 100644 --- a/src/MAKE/OPTIONS/Makefile.g++_openmpi +++ b/src/MAKE/OPTIONS/Makefile.g++_openmpi @@ -7,12 +7,12 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler export OMPI_CXX = g++ -CC = mpicxx +CC = mpicxx -std=c++11 CCFLAGS = -g -O3 SHFLAGS = -fPIC DEPFLAGS = -M -LINK = mpicxx +LINK = mpicxx -std=c++11 LINKFLAGS = -g -O LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.g++_serial b/src/MAKE/OPTIONS/Makefile.g++_serial index 65de6a2c2c..4f6f0afe22 100644 --- a/src/MAKE/OPTIONS/Makefile.g++_serial +++ b/src/MAKE/OPTIONS/Makefile.g++_serial @@ -6,12 +6,12 @@ SHELL = /bin/sh # 
compiler/linker settings # specify flags and libraries needed for your compiler -CC = g++ +CC = g++ -std=c++11 CCFLAGS = -g -O3 SHFLAGS = -fPIC DEPFLAGS = -M -LINK = g++ +LINK = g++ -std=c++11 LINKFLAGS = -g -O LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.oneapi b/src/MAKE/OPTIONS/Makefile.oneapi new file mode 100644 index 0000000000..2524773a76 --- /dev/null +++ b/src/MAKE/OPTIONS/Makefile.oneapi @@ -0,0 +1,122 @@ +# oneapi = For Intel oneAPI builds with GPU package + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = mpiicpc -std=c++11 +OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits +CCFLAGS = -qopenmp -qopenmp-simd -qno-offload -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) \ + -I$(MKLROOT)/include +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = mpiicpc -std=c++11 +LINKFLAGS = -qopenmp -qopenmp-simd $(OPTFLAGS) -L$(MKLROOT)/lib/intel64/ +LIB = -ltbbmalloc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings, all OPTIONAL +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings +# see possible settings in Section 3.5 of the manual + +LMP_INC = -DLAMMPS_GZIP + +# MPI library +# see discussion in Section 3.4 of the manual +# MPI wrapper compiler/linker can provide this info +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 +MPI_PATH = +MPI_LIB = + +# FFT library +# see discussion in Section 3.5.2 of manual +# can be left blank to use provided KISS FFT library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_MKL -DFFT_SINGLE +FFT_PATH = +FFT_LIB = + +# JPEG and/or PNG library +# see discussion in Section 3.5.4 of manual +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = + +# --------------------------------------------------------------------- +# build rules and dependencies +# do not edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) +EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS) +EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ +# Link target + +$(EXE): main.o $(LMPLIB) $(EXTRA_LINK_DEPENDS) + $(LINK) $(LINKFLAGS) main.o $(EXTRA_PATH) $(LMPLINK) $(EXTRA_LIB) $(LIB) -o $@ + $(SIZE) $@ + +# Library targets + +$(ARLIB): $(OBJ) $(EXTRA_LINK_DEPENDS) + @rm -f ../$(ARLIB) + $(ARCHIVE) $(ARFLAGS) ../$(ARLIB) $(OBJ) + @rm -f $(ARLIB) + @ln -s ../$(ARLIB) $(ARLIB) + +$(SHLIB): $(OBJ) $(EXTRA_LINK_DEPENDS) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o ../$(SHLIB) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + @rm -f $(SHLIB) + @ln -s ../$(SHLIB) $(SHLIB) + +# Compilation rules + +%.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +# Individual dependencies + +depend : fastdep.exe $(SRC) + @./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1 + +fastdep.exe: ../DEPEND/fastdep.c + cc -O -o $@ $< + +sinclude .depend diff --git a/src/atom.cpp b/src/atom.cpp index 3308d07267..e7b1df8240 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -40,6 +40,10 @@ #include "neigh_request.h" #endif +#ifdef LMP_GPU +#include "fix_gpu.h" +#endif + using namespace LAMMPS_NS; using namespace MathConst; @@ -2196,6 +2200,35 @@ void Atom::setup_sort_bins() } #endif + #ifdef LMP_GPU + if (userbinsize == 0.0) { + int ifix = modify->find_fix("package_gpu"); + if (ifix >= 0) { + const double subx = domain->subhi[0] - domain->sublo[0]; + const double suby = domain->subhi[1] - domain->sublo[1]; + const double subz = domain->subhi[2] - domain->sublo[2]; + + FixGPU *fix = static_cast(modify->fix[ifix]); + binsize = fix->binsize(subx, suby, subz, atom->nlocal, + neighbor->cutneighmax); + bininv = 1.0 / binsize; + + nbinx = static_cast (ceil(subx * bininv)); + nbiny = static_cast (ceil(suby * bininv)); + nbinz = static_cast (ceil(subz * bininv)); + if (domain->dimension == 2) nbinz = 1; + + if (nbinx == 0) nbinx = 1; + if (nbiny == 0) nbiny = 1; + if (nbinz == 0) nbinz = 1; + + bininvx = bininv; + bininvy = bininv; + bininvz = bininv; + } + } + #endif + if (1.0*nbinx*nbiny*nbinz > INT_MAX) error->one(FLERR,"Too many atom sorting bins"); diff --git a/src/lammps.cpp b/src/lammps.cpp index 6734fbd209..277ec4414f 100644 --- a/src/lammps.cpp +++ b/src/lammps.cpp @@ -842,12 +842,12 @@ void LAMMPS::post_create() if (strcmp(suffix,"omp") == 0 && !modify->check_package("OMP")) error->all(FLERR,"Using suffix omp without USER-OMP package installed"); - if (strcmp(suffix,"gpu") == 0) input->one("package gpu 1"); + if (strcmp(suffix,"gpu") == 0) input->one("package gpu 0"); if (strcmp(suffix,"intel") == 0) input->one("package intel 1"); if (strcmp(suffix,"omp") == 0) input->one("package omp 0"); if (suffix2) { - if (strcmp(suffix2,"gpu") == 0) input->one("package gpu 1"); + if (strcmp(suffix2,"gpu") == 0) input->one("package gpu 0"); if (strcmp(suffix2,"intel") == 0) input->one("package intel 1"); if (strcmp(suffix2,"omp") == 0) input->one("package omp 0"); } From d256614c9f642e2028e8996903e3b36d56ad6164 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Mon, 15 Feb 2021 13:19:25 -0500 Subject: [PATCH 030/116] Fix docs after PR #2592 --- doc/src/Python_atoms.rst | 2 +- doc/src/Python_module.rst | 12 ++++++------ doc/src/Python_neighbor.rst | 4 ++-- doc/src/Python_objects.rst | 12 ++++++------ python/lammps/core.py | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/doc/src/Python_atoms.rst b/doc/src/Python_atoms.rst index 92b9677d16..be0d4ff800 100644 --- a/doc/src/Python_atoms.rst +++ b/doc/src/Python_atoms.rst @@ -50,7 +50,7 @@ against invalid accesses. 
**Numpy Methods**: - * :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array + * :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array .. tab:: PyLammps/IPyLammps API diff --git a/doc/src/Python_module.rst b/doc/src/Python_module.rst index 59be645cbd..d2564986de 100644 --- a/doc/src/Python_module.rst +++ b/doc/src/Python_module.rst @@ -61,7 +61,7 @@ functions. Below is a detailed documentation of the API. .. autoclass:: lammps.lammps :members: -.. autoclass:: lammps.numpy::numpy_wrapper +.. autoclass:: lammps.numpy_wrapper::numpy_wrapper :members: ---------- @@ -134,8 +134,8 @@ Style Constants to request from computes or fixes. See :cpp:enum:`_LMP_STYLE_CONST` for the equivalent constants in the C library interface. Used in :py:func:`lammps.extract_compute`, :py:func:`lammps.extract_fix`, and their NumPy variants - :py:func:`lammps.numpy.extract_compute() ` and - :py:func:`lammps.numpy.extract_fix() `. + :py:func:`lammps.numpy.extract_compute() ` and + :py:func:`lammps.numpy.extract_fix() `. .. _py_type_constants: @@ -149,8 +149,8 @@ Type Constants to request from computes or fixes. See :cpp:enum:`_LMP_TYPE_CONST` for the equivalent constants in the C library interface. Used in :py:func:`lammps.extract_compute`, :py:func:`lammps.extract_fix`, and their NumPy variants - :py:func:`lammps.numpy.extract_compute() ` and - :py:func:`lammps.numpy.extract_fix() `. + :py:func:`lammps.numpy.extract_compute() ` and + :py:func:`lammps.numpy.extract_fix() `. .. _py_vartype_constants: @@ -170,6 +170,6 @@ Classes representing internal objects :members: :no-undoc-members: -.. autoclass:: lammps.numpy::NumPyNeighList +.. autoclass:: lammps.numpy_wrapper::NumPyNeighList :members: :no-undoc-members: diff --git a/doc/src/Python_neighbor.rst b/doc/src/Python_neighbor.rst index 80651b608f..cba117ad20 100644 --- a/doc/src/Python_neighbor.rst +++ b/doc/src/Python_neighbor.rst @@ -14,5 +14,5 @@ Neighbor list access **NumPy Methods:** -* :py:meth:`lammps.numpy.get_neighlist() `: Get neighbor list for given index, which uses NumPy arrays for its element neighbor arrays -* :py:meth:`lammps.numpy.get_neighlist_element_neighbors() `: Get element in neighbor list and its neighbors (as numpy array) +* :py:meth:`lammps.numpy.get_neighlist() `: Get neighbor list for given index, which uses NumPy arrays for its element neighbor arrays +* :py:meth:`lammps.numpy.get_neighlist_element_neighbors() `: Get element in neighbor list and its neighbors (as numpy array) diff --git a/doc/src/Python_objects.rst b/doc/src/Python_objects.rst index ec29863d38..4c8161b8bd 100644 --- a/doc/src/Python_objects.rst +++ b/doc/src/Python_objects.rst @@ -36,9 +36,9 @@ computes, fixes, or variables in LAMMPS using the :py:mod:`lammps` module. Python subscripting. The values will be zero for atoms not in the specified group. - :py:meth:`lammps.numpy.extract_compute() `, - :py:meth:`lammps.numpy.extract_fix() `, and - :py:meth:`lammps.numpy.extract_variable() ` are + :py:meth:`lammps.numpy.extract_compute() `, + :py:meth:`lammps.numpy.extract_fix() `, and + :py:meth:`lammps.numpy.extract_variable() ` are equivalent NumPy implementations that return NumPy arrays instead of ``ctypes`` pointers. The :py:meth:`lammps.set_variable() ` method sets an @@ -54,9 +54,9 @@ computes, fixes, or variables in LAMMPS using the :py:mod:`lammps` module. 
**NumPy Methods**: - * :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays + * :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays + * :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays + * :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays .. tab:: PyLammps/IPyLammps API diff --git a/python/lammps/core.py b/python/lammps/core.py index d1bc7bc138..1a4650c285 100644 --- a/python/lammps/core.py +++ b/python/lammps/core.py @@ -1612,7 +1612,7 @@ class lammps(object): def get_neighlist(self, idx): """Returns an instance of :class:`NeighList` which wraps access to the neighbor list with the given index - See :py:meth:`lammps.numpy.get_neighlist() ` if you want to use + See :py:meth:`lammps.numpy.get_neighlist() ` if you want to use NumPy arrays instead of ``c_int`` pointers. :param idx: index of neighbor list From e2c32d12a44b53863d4ff2a6e0e27a0798e0b45c Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 15 Feb 2021 11:07:43 -0800 Subject: [PATCH 031/116] Feb2021 GPU Package Update - Documentation Files --- doc/src/Commands_fix.rst | 8 +- doc/src/Commands_pair.rst | 2 +- doc/src/Speed_gpu.rst | 40 ++++++++-- doc/src/Speed_packages.rst | 8 +- doc/src/fix_nh.rst | 6 +- doc/src/fix_nve.rst | 3 +- doc/src/fix_nve_asphere.rst | 3 +- doc/src/package.rst | 155 ++++++++++++++++++------------------ doc/src/pair_charmm.rst | 3 +- 9 files changed, 132 insertions(+), 96 deletions(-) diff --git a/doc/src/Commands_fix.rst b/doc/src/Commands_fix.rst index 26dcc1101c..4793568288 100644 --- a/doc/src/Commands_fix.rst +++ b/doc/src/Commands_fix.rst @@ -114,7 +114,7 @@ OPT. * :doc:`nph/eff ` * :doc:`nph/sphere (o) ` * :doc:`nphug ` - * :doc:`npt (iko) ` + * :doc:`npt (giko) ` * :doc:`npt/asphere (o) ` * :doc:`npt/body ` * :doc:`npt/cauchy ` @@ -122,8 +122,8 @@ OPT. * :doc:`npt/sphere (o) ` * :doc:`npt/uef ` * :doc:`numdiff ` - * :doc:`nve (iko) ` - * :doc:`nve/asphere (i) ` + * :doc:`nve (giko) ` + * :doc:`nve/asphere (gi) ` * :doc:`nve/asphere/noforce ` * :doc:`nve/awpmd ` * :doc:`nve/body ` @@ -138,7 +138,7 @@ OPT. * :doc:`nve/spin ` * :doc:`nve/tri ` * :doc:`nvk ` - * :doc:`nvt (iko) ` + * :doc:`nvt (giko) ` * :doc:`nvt/asphere (o) ` * :doc:`nvt/body ` * :doc:`nvt/eff ` diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index f5b1ef9b38..e7277e2bbb 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -122,7 +122,7 @@ OPT. * :doc:`lebedeva/z ` * :doc:`lennard/mdf ` * :doc:`line/lj ` - * :doc:`lj/charmm/coul/charmm (iko) ` + * :doc:`lj/charmm/coul/charmm (giko) ` * :doc:`lj/charmm/coul/charmm/implicit (ko) ` * :doc:`lj/charmm/coul/long (gikot) ` * :doc:`lj/charmm/coul/long/soft (o) ` diff --git a/doc/src/Speed_gpu.rst b/doc/src/Speed_gpu.rst index 56eb48cd0e..655f2e1958 100644 --- a/doc/src/Speed_gpu.rst +++ b/doc/src/Speed_gpu.rst @@ -45,12 +45,23 @@ to have the OpenCL headers and the (vendor neutral) OpenCL library installed. In OpenCL mode, the acceleration depends on having an `OpenCL Installable Client Driver (ICD) `_ installed. 
There can be multiple of them for the same or different hardware (GPUs, CPUs, Accelerators) installed at the same time. OpenCL refers to those -as 'platforms'. The GPU library will select the **first** suitable platform, -but this can be overridden using the device option of the :doc:`package ` +as 'platforms'. The GPU library will try to auto-select the best suitable platform, +but this can be overridden using the platform option of the :doc:`package ` command. run lammps/lib/gpu/ocl_get_devices to get a list of available platforms and devices with a suitable ICD available. -To compute and use this package in HIP mode, you have to have the AMD ROCm +To compile and use this package for Intel GPUs, OpenCL or the Intel oneAPI +HPC Toolkit can be installed using linux package managers. The latter also +provides optimized C++, MPI, and many other libraries and tools. See: + +* https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit/download.html + +If you do not have a discrete GPU card installed, this package can still provide +significant speedups on some CPUs that include integrated GPUs. Additionally, for +many macs, OpenCL is already included with the OS and Makefiles are available +in the lib/gpu directory. + +To compile and use this package in HIP mode, you have to have the AMD ROCm software installed. Versions of ROCm older than 3.5 are currently deprecated by AMD. @@ -75,10 +86,20 @@ automatically if you create more MPI tasks/node than there are GPUs/mode. E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be shared by 4 MPI tasks. +The GPU package also has limited support for OpenMP for both +multi-threading and vectorization of routines that are run on the CPUs. +This requires that the GPU library and LAMMPS are built with flags to +enable OpenMP support (e.g. -fopenmp -fopenmp-simd). Some styles for +time integration are also available in the GPU package. These run +completely on the CPUs in full double precision, but exploit +multi-threading and vectorization for faster performance. + Use the "-sf gpu" :doc:`command-line switch `, which will automatically append "gpu" to styles that support it. Use the "-pk gpu Ng" :doc:`command-line switch ` to set Ng = # of -GPUs/node to use. +GPUs/node to use. If Ng is 0, the number is selected automatically as +the number of matching GPUs that have the highest number of compute +cores. .. code-block:: bash @@ -87,8 +108,8 @@ GPUs/node to use. mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script # ditto on 4 16-core nodes Note that if the "-sf gpu" switch is used, it also issues a default -:doc:`package gpu 1 ` command, which sets the number of -GPUs/node to 1. +:doc:`package gpu 0 ` command, which will result in +automatic selection of the number of GPUs to use. Using the "-pk" switch explicitly allows for setting of the number of GPUs/node to use and additional options. Its syntax is the same as @@ -138,6 +159,13 @@ Likewise, you should experiment with the precision setting for the GPU library to see if single or mixed precision will give accurate results, since they will typically be faster. +MPI parallelism typically outperforms OpenMP parallelism, but in same cases +using fewer MPI tasks and multiple OpenMP threads with the GPU package +can give better performance. 3-body potentials can often perform better +with multiple OMP threads because the inter-process communication is +higher for these styles with the GPU package in order to allow +deterministic results. 
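To make the OpenMP remarks above concrete, here is a minimal, self-contained
C++ sketch of the kind of loop that the -fopenmp/-fopenmp-simd build flags can
thread and vectorize; it is illustrative only and not taken from the LAMMPS
sources, and without OpenMP support the pragma is ignored and the loop simply
runs serially::

    #include <cstdio>
    #include <vector>

    int main() {
      const int n = 1000000;
      std::vector<double> x(n, 0.5), f(n, 0.0);
      double epot = 0.0;
      // threads and SIMD lanes are used only when built with OpenMP enabled
      #pragma omp parallel for simd reduction(+:epot)
      for (int i = 0; i < n; ++i) {
        f[i] = 2.0 * x[i];     // placeholder "force" kernel
        epot += x[i] * x[i];   // placeholder "energy" accumulation
      }
      std::printf("epot = %g\n", epot);
      return 0;
    }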
+ **Guidelines for best performance:** * Using multiple MPI tasks per GPU will often give the best performance, diff --git a/doc/src/Speed_packages.rst b/doc/src/Speed_packages.rst index 600c4ac2b4..6210242413 100644 --- a/doc/src/Speed_packages.rst +++ b/doc/src/Speed_packages.rst @@ -16,7 +16,7 @@ These are the accelerator packages currently in LAMMPS, either as standard or user packages: +-----------------------------------------+-------------------------------------------------------+ -| :doc:`GPU Package ` | for NVIDIA GPUs as well as OpenCL support | +| :doc:`GPU Package ` | for GPUs via CUDA, OpenCL, or ROCm HIP | +-----------------------------------------+-------------------------------------------------------+ | :doc:`USER-INTEL Package ` | for Intel CPUs and Intel Xeon Phi | +-----------------------------------------+-------------------------------------------------------+ @@ -43,7 +43,7 @@ three kinds of hardware, via the listed packages: +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ | Many-core CPUs | :doc:`USER-INTEL `, :doc:`KOKKOS `, :doc:`USER-OMP `, :doc:`OPT ` packages | +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ -| NVIDIA/AMD GPUs | :doc:`GPU `, :doc:`KOKKOS ` packages | +| GPUs | :doc:`GPU `, :doc:`KOKKOS ` packages | +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ | Intel Phi/AVX | :doc:`USER-INTEL `, :doc:`KOKKOS ` packages | +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ @@ -154,8 +154,8 @@ Here is a brief summary of what the various packages provide. Details are in the individual accelerator sections. * Styles with a "gpu" suffix are part of the GPU package and can be run - on NVIDIA or AMD GPUs. The speed-up on a GPU depends on a variety of - factors, discussed in the accelerator sections. + on Intel, NVIDIA, or AMD GPUs. The speed-up on a GPU depends on a + variety of factors, discussed in the accelerator sections. * Styles with an "intel" suffix are part of the USER-INTEL package. These styles support vectorized single and mixed precision calculations, in addition to full double precision. In extreme cases, diff --git a/doc/src/fix_nh.rst b/doc/src/fix_nh.rst index 590211eda7..f40ce0c463 100644 --- a/doc/src/fix_nh.rst +++ b/doc/src/fix_nh.rst @@ -1,8 +1,10 @@ .. index:: fix nvt +.. index:: fix nvt/gpu .. index:: fix nvt/intel .. index:: fix nvt/kk .. index:: fix nvt/omp .. index:: fix npt +.. index:: fix npt/gpu .. index:: fix npt/intel .. index:: fix npt/kk .. index:: fix npt/omp @@ -13,12 +15,12 @@ fix nvt command =============== -Accelerator Variants: *nvt/intel*, *nvt/kk*, *nvt/omp* +Accelerator Variants: *nvt/gpu*, *nvt/intel*, *nvt/kk*, *nvt/omp* fix npt command =============== -Accelerator Variants: *npt/intel*, *npt/kk*, *npt/omp* +Accelerator Variants: *npt/gpu*, *npt/intel*, *npt/kk*, *npt/omp* fix nph command =============== diff --git a/doc/src/fix_nve.rst b/doc/src/fix_nve.rst index 71f8ec300f..ae472b1a38 100644 --- a/doc/src/fix_nve.rst +++ b/doc/src/fix_nve.rst @@ -1,4 +1,5 @@ .. index:: fix nve +.. index:: fix nve/gpu .. index:: fix nve/intel .. index:: fix nve/kk .. 
index:: fix nve/omp @@ -6,7 +7,7 @@ fix nve command =============== -Accelerator Variants: *nve/intel*, *nve/kk*, *nve/omp* +Accelerator Variants: *nve/gpu*, *nve/intel*, *nve/kk*, *nve/omp* Syntax """""" diff --git a/doc/src/fix_nve_asphere.rst b/doc/src/fix_nve_asphere.rst index af80460b32..c49de34d0b 100644 --- a/doc/src/fix_nve_asphere.rst +++ b/doc/src/fix_nve_asphere.rst @@ -1,10 +1,11 @@ .. index:: fix nve/asphere +.. index:: fix nve/asphere/gpu .. index:: fix nve/asphere/intel fix nve/asphere command ======================= -Accelerator Variants: *nve/asphere/intel* +Accelerator Variants: *nve/asphere/gpu*, *nve/asphere/intel* Syntax """""" diff --git a/doc/src/package.rst b/doc/src/package.rst index 6a5ff44077..a091759214 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -18,7 +18,7 @@ Syntax *gpu* args = Ngpu keyword value ... Ngpu = # of GPUs per node zero or more keyword/value pairs may be appended - keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *device* or *blocksize* + keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *blocksize* or *platform* or *device_type* or *ocl_args* *neigh* value = *yes* or *no* yes = neighbor list build on GPU (default) no = neighbor list build on CPU @@ -32,17 +32,18 @@ Syntax size = bin size for neighbor list construction (distance units) *split* = fraction fraction = fraction of atoms assigned to GPU (default = 1.0) - *gpuID* values = first last - first = ID of first GPU to be used on each node - last = ID of last GPU to be used on each node *tpa* value = Nthreads - Nthreads = # of GPU threads used per atom - *device* value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13 - platform_id = numerical OpenCL platform id (default: -1) - device_type = *kepler* or *fermi* or *cypress* or *intel* or *phi* or *generic* or *custom* - val1,val2,... = custom OpenCL tune parameters (see below for details) + Nthreads = # of GPU vector lanes used per atom *blocksize* value = size size = thread block size for pair force computation + *platform* value = id + id = For OpenCL, platform ID for the GPU or accelerator + *gpuID* values = id + id = ID of first GPU to be used on each node + *device_type* value = *intelgpu* or *nvidiagpu* or *amdgpu* or *applegpu* or *generic* or *custom,val1,val2,...* + val1,val2,... = custom OpenCL accelerator configuration parameters (see below for details) + *ocl_args* value = args + args = List of additional OpenCL compiler arguments delimited by colons *intel* args = NPhi keyword value ... Nphi = # of co-processors per node zero or more keyword/value pairs may be appended @@ -112,12 +113,10 @@ Examples .. code-block:: LAMMPS - package gpu 1 + package gpu 0 package gpu 1 split 0.75 package gpu 2 split -1.0 - package gpu 1 device kepler - package gpu 1 device 2:generic - package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128 + package gpu 0 device_type intelgpu package kokkos neigh half comm device package omp 0 neigh no package omp 4 @@ -174,10 +173,18 @@ simulations. The *gpu* style invokes settings associated with the use of the GPU package. -The *Ngpu* argument sets the number of GPUs per node. There must be -at least as many MPI tasks per node as GPUs, as set by the mpirun or -mpiexec command. If there are more MPI tasks (per node) -than GPUs, multiple MPI tasks will share each GPU. +The *Ngpu* argument sets the number of GPUs per node. 
If *Ngpu* is 0 +and no other keywords are specified, GPU or accelerator devices are +autoselected. In this process, all platforms are searched for +accelerator devices and GPUs are chosen if available. The device with +the highest number of compute cores is selected. The number of devices +is increased to be the number of matching accelerators with the same +number of compute cores. If there are more devices than MPI tasks, +the additional devices will be unused. The auto-selection of GPUs/ +accelerator devices and platforms can be restricted by specifying +a non-zero value for *Ngpu* and / or using the *gpuID*, *platform*, +and *device_type* keywords as described below. If there are more MPI +tasks (per node) than GPUs, multiple MPI tasks will share each GPU. Optional keyword/value pairs can also be specified. Each has a default value as listed below. @@ -212,18 +219,8 @@ overlapped with all other computations on the CPU. The *binsize* keyword sets the size of bins used to bin atoms in neighbor list builds performed on the GPU, if *neigh* = *yes* is set. -If *binsize* is set to 0.0 (the default), then bins = the size of the -pairwise cutoff + neighbor skin distance. This is 2x larger than the -LAMMPS default used for neighbor list building on the CPU. This will -be close to optimal for the GPU, so you do not normally need to use -this keyword. Note that if you use a longer-than-usual pairwise -cutoff, e.g. to allow for a smaller fraction of KSpace work with a -:doc:`long-range Coulombic solver ` because the GPU is -faster at performing pairwise interactions, then it may be optimal to -make the *binsize* smaller than the default. For example, with a -cutoff of 20\*sigma in LJ :doc:`units ` and a neighbor skin -distance of sigma, a *binsize* = 5.25\*sigma can be more efficient than -the default. +If *binsize* is set to 0.0 (the default), then the binsize is set +automatically using heuristics in the GPU package. The *split* keyword can be used for load balancing force calculations between CPU and GPU cores in GPU-enabled pair styles. If 0 < *split* < @@ -257,63 +254,69 @@ cores would perform force calculations for some fraction of the particles at the same time the GPUs performed force calculation for the other particles. -The *gpuID* keyword allows selection of which GPUs on each node will -be used for a simulation. The *first* and *last* values specify the -GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last = -Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number -of physical GPUs. If you only wish to use a subset, set Ngpu to a -smaller number and first/last to a sub-range of the available GPUs. +The *gpuID* keyword is used to specify the first ID for the GPU or +other accelerator that LAMMPS will use. For example, if the ID is +1 and *Ngpu* is 3, GPUs 1-3 will be used. Device IDs should be +determined from the output of nvc_get_devices or ocl_get_devices +as provided in the lib/gpu directory. When using OpenCL with +accelerators that have main memory NUMA, the accelerators can be +split into smaller virtual accelerators for more efficient use +with MPI. -The *tpa* keyword sets the number of GPU thread per atom used to +The *tpa* keyword sets the number of GPU vector lanes per atom used to perform force calculations. With a default value of 1, the number of threads will be chosen based on the pair style, however, the value can be set explicitly with this keyword to fine-tune performance. 
For large cutoffs or with a small number of particles per GPU, increasing the value can improve performance. The number of threads per atom must -be a power of 2 and currently cannot be greater than 32. - -The *device* keyword can be used to tune parameters optimized for a -specific accelerator and platform when using OpenCL. OpenCL supports -the concept of a **platform**\ , which represents one or more devices that -share the same driver (e.g. there would be a different platform for -GPUs from different vendors or for CPU based accelerator support). -In LAMMPS only one platform can be active at a time and by default -the first platform with an accelerator is selected. This is equivalent -to using a platform ID of -1. The platform ID is a number corresponding -to the output of the ocl_get_devices tool. The platform ID is passed -to the GPU library, by prefixing the *device* keyword with that number -separated by a colon. For CUDA, the *device* keyword is ignored. -Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA -Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device. -More devices may be added later. The default device type can be -specified when building LAMMPS with the GPU library, via setting a -variable in the lib/gpu/Makefile that is used. - -In addition, a device type *custom* is available, which is followed by -13 comma separated numbers, which allows to set those tweakable parameters -from the package command. It can be combined with the (colon separated) -platform id. The individual settings are: - -* MEM_THREADS -* THREADS_PER_ATOM -* THREADS_PER_CHARGE -* BLOCK_PAIR -* MAX_SHARED_TYPES -* BLOCK_NBOR_BUILD -* BLOCK_BIO_PAIR -* BLOCK_ELLIPSE -* WARP_SIZE -* PPPM_BLOCK_1D -* BLOCK_CELL_2D -* BLOCK_CELL_ID -* MAX_BIO_SHARED_TYPES +be a power of 2 and currently cannot be greater than the SIMD width +for the GPU / accelerator. In the case it exceeds the SIMD width, it +will automatically be decreased to meet the restriction. The *blocksize* keyword allows you to tweak the number of threads used per thread block. This number should be a multiple of 32 (for GPUs) and its maximum depends on the specific GPU hardware. Typical choices are 64, 128, or 256. A larger block size increases occupancy of individual GPU cores, but reduces the total number of thread blocks, -thus may lead to load imbalance. +thus may lead to load imbalance. On modern hardware, the sensitivity +to the blocksize is typically low. + +The *platform* keyword is only used with OpenCL to specify the ID for +an OpenCL platform. See the output from ocl_get_devices in the lib/gpu +directory. In LAMMPS only one platform can be active at a time and by +default (id=-1) the platform is auto-selected to find the GPU with the +most compute cores. When *Ngpu* or other keywords are specified, the +auto-selection is appropriately restricted. For example, if *Ngpu* is +3, only platforms with at least 3 accelerators are considered. Similar +restrictions can be enforced by the *gpuID* and *device_type* keywords. + +The *device_type* keyword can be used for OpenCL to specify the type of +GPU to use or specify a custom configuration for an accelerator. In most +cases this selection will be automatic and there is no need to use the +keyword. The *applegpu* type is not specific to a particular GPU vendor, +but is separate due to the more restrictive Apple OpenCL implementation. 
+For expert users, to specify a custom configuration, the *custom* keyword +followed by the next parameters can be specified: + +CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH, +THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR, +BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD, +BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES, +PPPM_MAX_SPLINE. + +CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX +(NVIDIA) or OpenCL extensions (Intel) should be used for horizontal +vector operataions. FAST_MATH in {0,1} indicates that OpenCL fast math +optimizations are used during the build and HW-accelerated +transcendentals are used when available. THREADS_PER_* give the default +*tpa* values for ellipsoidal models, styles using charge, and any other +styles. The BLOCK_* parameters specify the block sizes for various +kernal calls and the MAX_*SHARED*_ parameters are used to determine the +amount of local shared memory to use for storing model parameters. + +For OpenCL, the routines are compiled at runtime for the specified GPU +or accelerator architecture. The *ocl_args* keyword can be used to +specify additional flags for the runtime build. ---------- @@ -658,9 +661,9 @@ Related commands Default """"""" -For the GPU package, the default is Ngpu = 1 and the option defaults +For the GPU package, the default is Ngpu = 0 and the option defaults are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0 -to Ngpu-1, tpa = 1, and device = not used. These settings are made +to Ngpu-1, tpa = 1, and platform=-1. These settings are made automatically if the "-sf gpu" :doc:`command-line switch ` is used. If it is not used, you must invoke the package gpu command in your input script or via the "-pk gpu" :doc:`command-line switch `. diff --git a/doc/src/pair_charmm.rst b/doc/src/pair_charmm.rst index 6d81266a35..b3d2a2b878 100644 --- a/doc/src/pair_charmm.rst +++ b/doc/src/pair_charmm.rst @@ -1,4 +1,5 @@ .. index:: pair_style lj/charmm/coul/charmm +.. index:: pair_style lj/charmm/coul/charmm/gpu .. index:: pair_style lj/charmm/coul/charmm/intel .. index:: pair_style lj/charmm/coul/charmm/kk .. index:: pair_style lj/charmm/coul/charmm/omp @@ -19,7 +20,7 @@ pair_style lj/charmm/coul/charmm command ======================================== -Accelerator Variants: *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp* +Accelerator Variants: *lj/charmm/coul/charmm/gpu*, *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp* pair_style lj/charmm/coul/charmm/implicit command ================================================= From 515da322155891ce412e9c37684a221a6ef81783 Mon Sep 17 00:00:00 2001 From: "Ryan S. 
Elliott" Date: Mon, 15 Feb 2021 16:00:47 -0600 Subject: [PATCH 032/116] Fixup errors/issues in cmake/Modules/Packages/KIM.cmake --- cmake/Modules/Packages/KIM.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/Packages/KIM.cmake b/cmake/Modules/Packages/KIM.cmake index 83a96d02b8..5482d3071c 100644 --- a/cmake/Modules/Packages/KIM.cmake +++ b/cmake/Modules/Packages/KIM.cmake @@ -69,14 +69,14 @@ if(DOWNLOAD_KIM) BUILD_RPATH "${_rpath_prefix}/kim_build-prefix/lib" ) else() - if(KIM-API_FOUND AND KIM_API_VERSION VERSION_GREATER_EQUAL 2.2.0) + if(KIM-API_FOUND AND KIM-API_VERSION VERSION_GREATER_EQUAL 2.2.0) # For kim-api >= 2.2.0 - find_package(KIM-API ${KIM-API_MIN_VERSION} CONFIG REQUIRED) + find_package(KIM-API 2.2.0 CONFIG REQUIRED) target_link_libraries(lammps PRIVATE KIM-API::kim-api) else() # For kim-api 2.1.3 (consistent with previous version of this file) find_package(PkgConfig REQUIRED) - pkg_check_modules(KIM-API REQUIRED IMPORTED_TARGET libkim-api>=KIM-API_MIN_VERSION) + pkg_check_modules(KIM-API REQUIRED IMPORTED_TARGET libkim-api>=${KIM-API_MIN_VERSION}) target_link_libraries(lammps PRIVATE PkgConfig::KIM-API) endif() endif() From 44ab383917ef92e01a7be68662bf4c5aeb3d0c43 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Tue, 16 Feb 2021 11:14:22 -0500 Subject: [PATCH 033/116] Remove duplicate line in GPU/Install.sh Otherwise, after running $ make yes-all $ make no-lib the generated Makefile.package would still contain the LMP_GPU define --- src/GPU/Install.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 1767623314..49b7eeda57 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -171,7 +171,6 @@ if (test $1 = 1) then sed -i -e 's|^PKG_SYSINC =[ \t]*|&$(gpu_SYSINC) |' ../Makefile.package sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(gpu_SYSPATH) |' ../Makefile.package - sed -i -e 's|^PKG_INC =[ \t]*|&-DLMP_GPU |' ../Makefile.package fi if (test -e ../Makefile.package.settings) then From 224da33b228fb680f7059d0204803c4cc3bb3117 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Tue, 16 Feb 2021 12:29:50 -0500 Subject: [PATCH 034/116] Add missing fix_nh_gpu files to CMake build --- cmake/Modules/Packages/GPU.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 4c52eee68b..b11c34b034 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -1,7 +1,9 @@ set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU) set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h ${GPU_SOURCES_DIR}/fix_gpu.h - ${GPU_SOURCES_DIR}/fix_gpu.cpp) + ${GPU_SOURCES_DIR}/fix_gpu.cpp + ${GPU_SOURCES_DIR}/fix_nh_gpu.h + ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp) target_compile_definitions(lammps PRIVATE -DLMP_GPU) set(GPU_API "opencl" CACHE STRING "API used by GPU package") From d85a5e3290deee10b48ad3206979080346cb6d25 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Tue, 16 Feb 2021 14:57:10 -0500 Subject: [PATCH 035/116] Remove OCL_TUNE option in CMake The GPU package now auto-detects these settings. 
--- cmake/Modules/Packages/GPU.cmake | 7 +------ doc/src/Build_extras.rst | 2 -- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index b11c34b034..8557cc7178 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -157,11 +157,6 @@ elseif(GPU_API STREQUAL "OPENCL") else() find_package(OpenCL REQUIRED) endif() - set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning") - set(OCL_TUNE_VALUES intel fermi kepler cypress generic) - set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES}) - validate_option(OCL_TUNE OCL_TUNE_VALUES) - string(TOUPPER ${OCL_TUNE} OCL_TUNE) include(OpenCLUtils) set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h) @@ -205,7 +200,7 @@ elseif(GPU_API STREQUAL "OPENCL") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu PRIVATE OpenCL::OpenCL) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT) + target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT) target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) target_link_libraries(lammps PRIVATE gpu) diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 8f1154a167..cf15de74bd 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -120,8 +120,6 @@ CMake build -D GPU_API=value # value = opencl (default) or cuda or hip -D GPU_PREC=value # precision setting # value = double or mixed (default) or single - -D OCL_TUNE=value # hardware choice for GPU_API=opencl - # generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA) -D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda # value = sm_XX, see below # default is sm_50 From 775446b60f85ddac4f2ecd8747064a95cd6eb972 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Tue, 16 Feb 2021 15:01:22 -0500 Subject: [PATCH 036/116] Add GERYON_NUMA_FISSION define in CMake --- cmake/Modules/Packages/GPU.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 8557cc7178..76ad4190cf 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -200,7 +200,7 @@ elseif(GPU_API STREQUAL "OPENCL") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu PRIVATE OpenCL::OpenCL) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT) + target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) target_link_libraries(lammps PRIVATE gpu) From 721c6d96ccb5eade289c25b6e7967ba1b24ca8bf Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Tue, 16 Feb 2021 00:36:37 -0800 Subject: [PATCH 037/116] Removing Makefile.opencl from lib/gpu --- lib/gpu/Makefile.opencl | 92 ----------------------------------------- 1 file changed, 92 deletions(-) delete mode 100644 lib/gpu/Makefile.opencl diff --git a/lib/gpu/Makefile.opencl b/lib/gpu/Makefile.opencl deleted file mode 100644 index aa7806b542..0000000000 --- a/lib/gpu/Makefile.opencl +++ /dev/null @@ -1,92 +0,0 @@ -# /* ---------------------------------------------------------------------- -# Generic Linux Makefile 
for OpenCL -# ------------------------------------------------------------------------- */ - -# which file will be copied to Makefile.lammps - -EXTRAMAKE = Makefile.lammps.opencl - -# this setting should match LAMMPS Makefile -# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL - -LMP_INC = -DLAMMPS_SMALLBIG - -# precision for GPU calculations -# -D_SINGLE_SINGLE # Single precision for all calculations -# -D_DOUBLE_DOUBLE # Double precision for all calculations -# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double - -OCL_PREC = -D_SINGLE_DOUBLE - -BIN_DIR = ./ -OBJ_DIR = ./ -LIB_DIR = ./ -AR = ar -BSH = /bin/sh - -# Compiler and linker settings - -# OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi -# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler -# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress -OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device - -OCL_INC = -I/usr/local/cuda/include # Path to CL directory -OCL_CPP = mpic++ $(DEFAULT_DEVICE) -g -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) -OCL_LINK = -lOpenCL -OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL - -# Headers for Geryon -UCL_H = $(wildcard ./geryon/ucl*.h) -OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h -PRE1_H = lal_preprocessor.h lal_aux_fun1.h -ALL_H = $(OCL_H) $(wildcard ./lal_*.h) - -# Source files -SRCS := $(wildcard ./lal_*.cpp) -OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o)) -CUS := $(wildcard lal_*.cu) -KERS := $(subst ./,$(OBJ_DIR)/,$(CUS:lal_%.cu=%_cl.h)) -KERS := $(addprefix $(OBJ_DIR)/, $(KERS)) - -# targets - -GPU_LIB = $(LIB_DIR)/libgpu.a - -EXECS = $(BIN_DIR)/ocl_get_devices - -all: $(OBJ_DIR) $(KERS) $(GPU_LIB) $(EXECS) - -$(OBJ_DIR): - mkdir -p $@ - -# device code compilation - -$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) - $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; - -# host code compilation - -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS) - $(OCL) -o $@ -c $< -I$(OBJ_DIR) - -# build libgpu.a - -$(GPU_LIB): $(OBJS) - $(AR) -crusv $(GPU_LIB) $(OBJS) - @cp $(EXTRAMAKE) Makefile.lammps - -# test app for querying device info - -$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H) - $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) - -clean: - -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo - -veryclean: clean - -rm -rf *~ *.linkinfo - -cleanlib: - -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo - From 7b943948eafa4039465599f24c5a626877fb0146 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 16 Feb 2021 06:51:50 -0500 Subject: [PATCH 038/116] Point users to the LAMMPS GitHub Releases page for downloading archives --- doc/src/Install_tarball.rst | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/doc/src/Install_tarball.rst b/doc/src/Install_tarball.rst index 7c9e834104..6f87df8a21 100644 --- a/doc/src/Install_tarball.rst +++ b/doc/src/Install_tarball.rst @@ -33,22 +33,19 @@ in its name, e.g. lammps-23Jun18. ---------- -You can also download a zip file via the "Clone or download" button on -the `LAMMPS GitHub site `_. The file name will be lammps-master.zip -which can be unzipped with the following command, to create -a lammps-master dir: +You can also download a compressed tar or zip archives from the +"Assets" sections of the `LAMMPS GitHub releases site `_. +The file name will be lammps-.zip which can be unzipped +with the following command, to create a lammps- dir: .. 
code-block:: bash $ unzip lammps*.zip -This version is the most up-to-date LAMMPS development version. It -will have the date of the most recent patch release (see the file -src/version.h). But it will also include any new bug-fixes or -features added since the last patch release. They will be included in -the next patch release tarball. +This version corresponds to the selected LAMMPS patch or stable +release. -.. _git: https://github.com/lammps/lammps +.. _git: https://github.com/lammps/lammps/releases ---------- From e7a37877c0debded037ae8dde9508ccd03f10307 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 16 Feb 2021 11:40:25 -0500 Subject: [PATCH 039/116] apply changes to doc Makefile to limit the impact of SNL network config changes --- doc/Makefile | 8 +++++--- doc/utils/requirements.txt | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/Makefile b/doc/Makefile index 6032aff45f..7deaaf2a2e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -47,6 +47,8 @@ HAS_PDFLATEX = YES endif endif +# override settings for PIP commands +# PIP_OPTIONS = --cert /etc/pki/ca-trust/extracted/openssl/ca-bundle.trust.crt --proxy http://proxy.mydomain.org #SPHINXEXTRA = -j $(shell $(PYTHON) -c 'import multiprocessing;print(multiprocessing.cpu_count())') $(shell test -f $(BUILDDIR)/doxygen/xml/run.stamp && printf -- "-E") @@ -228,13 +230,13 @@ $(VENV): @( \ $(VIRTUALENV) -p $(PYTHON) $(VENV); \ . $(VENV)/bin/activate; \ - pip install --upgrade pip; \ - pip install -r $(BUILDDIR)/utils/requirements.txt; \ + pip $(PIP_OPTIONS) install --upgrade pip; \ + pip $(PIP_OPTIONS) install -r $(BUILDDIR)/utils/requirements.txt; \ deactivate;\ ) $(MATHJAX): - @git clone --depth 1 https://github.com/mathjax/MathJax.git $@ + @git clone --depth 1 git://github.com/mathjax/MathJax.git $@ $(TXT2RST) $(ANCHORCHECK): $(VENV) @( \ diff --git a/doc/utils/requirements.txt b/doc/utils/requirements.txt index e025e23b09..00fa6ecfaf 100644 --- a/doc/utils/requirements.txt +++ b/doc/utils/requirements.txt @@ -1,6 +1,6 @@ Sphinx sphinxcontrib-spelling -git+https://github.com/akohlmey/sphinx-fortran@parallel-read +git+git://github.com/akohlmey/sphinx-fortran@parallel-read sphinx_tabs breathe Pygments From b37ae4aea6560c9a61d2b9b935f356c865aabfd0 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 16 Feb 2021 12:26:59 -0500 Subject: [PATCH 040/116] propagate PIP_OPTIONS change to CMake doc build module --- cmake/Modules/Documentation.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/Documentation.cmake b/cmake/Modules/Documentation.cmake index 189c32e301..5a42244b9e 100644 --- a/cmake/Modules/Documentation.cmake +++ b/cmake/Modules/Documentation.cmake @@ -50,9 +50,9 @@ if(BUILD_DOC) OUTPUT ${DOC_BUILD_DIR}/requirements.txt DEPENDS docenv ${DOCENV_REQUIREMENTS_FILE} COMMAND ${CMAKE_COMMAND} -E copy ${DOCENV_REQUIREMENTS_FILE} ${DOC_BUILD_DIR}/requirements.txt - COMMAND ${DOCENV_BINARY_DIR}/pip install --upgrade pip - COMMAND ${DOCENV_BINARY_DIR}/pip install --upgrade ${LAMMPS_DOC_DIR}/utils/converters - COMMAND ${DOCENV_BINARY_DIR}/pip install --use-feature=2020-resolver -r ${DOC_BUILD_DIR}/requirements.txt --upgrade + COMMAND ${DOCENV_BINARY_DIR}/pip $ENV{PIP_OPTIONS} install --upgrade pip + COMMAND ${DOCENV_BINARY_DIR}/pip $ENV{PIP_OPTIONS} install --upgrade ${LAMMPS_DOC_DIR}/utils/converters + COMMAND ${DOCENV_BINARY_DIR}/pip $ENV{PIP_OPTIONS} install -r ${DOC_BUILD_DIR}/requirements.txt --upgrade ) # download mathjax distribution and unpack to folder "mathjax" 
From 57b630acbbf509baf9d2c5c6a6c2025728fa8274 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 16 Feb 2021 12:32:37 -0500 Subject: [PATCH 041/116] update documentation for building the manual with PIP_OPTIONS settings --- doc/src/Build_manual.rst | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/src/Build_manual.rst b/doc/src/Build_manual.rst index 59e4e3235b..3bf0337b31 100644 --- a/doc/src/Build_manual.rst +++ b/doc/src/Build_manual.rst @@ -74,7 +74,11 @@ For the documentation build a python virtual environment is set up in the folder ``doc/docenv`` and various python packages are installed into that virtual environment via the ``pip`` tool. For rendering embedded LaTeX code also the `MathJax `_ JavaScript -engine needs to be downloaded. +engine needs to be downloaded. If you need to pass additional options +to the pip commands to work (e.g. to use a web proxy or to point to +additional SSL certificates) you can set them via the ``PIP_OPTIONS`` +environment variable or uncomment and edit the ``PIP_OPTIONS`` setting +at beginning of the makefile. The actual translation is then done via ``make`` commands in the doc folder. The following ``make`` commands are available: @@ -108,7 +112,10 @@ installation of the HTML manual pages into the "install" step when installing LAMMPS after the CMake build via ``cmake --build . --target install``. The documentation build is included in the default build target, but can also be requested independently with -``cmake --build . --target doc``. +``cmake --build . --target doc``. If you need to pass additional options +to the pip commands to work (e.g. to use a web proxy or to point to +additional SSL certificates) you can set them via the ``PIP_OPTIONS`` +environment variable. .. code-block:: bash From f929e57261da982e140526a7a41aa17baa825e8f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 16 Feb 2021 14:38:03 -0500 Subject: [PATCH 042/116] avoid loading mpi4py if the LAMMPS executable has been built without MPI --- python/lammps/core.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/python/lammps/core.py b/python/lammps/core.py index 1a4650c285..8639743a75 100644 --- a/python/lammps/core.py +++ b/python/lammps/core.py @@ -286,15 +286,16 @@ class lammps(object): self.lib.lammps_fix_external_set_energy_global = [c_void_p, c_char_p, c_double] self.lib.lammps_fix_external_set_virial_global = [c_void_p, c_char_p, POINTER(c_double)] - # detect if Python is using version of mpi4py that can pass a communicator - + # detect if Python is using a version of mpi4py that can pass communicators + # only needed if LAMMPS has been compiled with MPI support. 
self.has_mpi4py = False - try: - from mpi4py import __version__ as mpi4py_version - # tested to work with mpi4py versions 2 and 3 - self.has_mpi4py = mpi4py_version.split('.')[0] in ['2','3'] - except: - pass + if self.has_mpi_support: + try: + from mpi4py import __version__ as mpi4py_version + # tested to work with mpi4py versions 2 and 3 + self.has_mpi4py = mpi4py_version.split('.')[0] in ['2','3'] + except: + pass # if no ptr provided, create an instance of LAMMPS # don't know how to pass an MPI communicator from PyPar @@ -307,18 +308,18 @@ class lammps(object): if not ptr: - # with mpi4py v2, can pass MPI communicator to LAMMPS + # with mpi4py v2+, we can pass MPI communicators to LAMMPS # need to adjust for type of MPI communicator object # allow for int (like MPICH) or void* (like OpenMPI) - if self.has_mpi4py and self.has_mpi_support: + if self.has_mpi_support and self.has_mpi4py: from mpi4py import MPI self.MPI = MPI if comm: - if not self.has_mpi4py: - raise Exception('Python mpi4py version is not 2 or 3') if not self.has_mpi_support: raise Exception('LAMMPS not compiled with real MPI library') + if not self.has_mpi4py: + raise Exception('Python mpi4py version is not 2 or 3') if self.MPI._sizeof(self.MPI.Comm) == sizeof(c_int): MPI_Comm = c_int else: From 742eebec2d533ef113b595f656dcf6ce2073a181 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 16 Feb 2021 20:22:18 -0500 Subject: [PATCH 043/116] support checking the size of MPI communicators and fail if LAMMPS and mpi4py have a mismatch --- python/lammps/core.py | 4 ++++ src/library.cpp | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/lammps/core.py b/python/lammps/core.py index 8639743a75..e13bf9585b 100644 --- a/python/lammps/core.py +++ b/python/lammps/core.py @@ -325,6 +325,10 @@ class lammps(object): else: MPI_Comm = c_void_p + # Detect whether LAMMPS and mpi4py definitely use different MPI libs + if sizeof(MPI_Comm) != self.lib.lammps_config_has_mpi_support(): + raise Exception('Inconsistent MPI library in LAMMPS and mpi4py') + narg = 0 cargs = None if cmdargs: diff --git a/src/library.cpp b/src/library.cpp index 71bf205d90..2a7bbf07b3 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -4128,16 +4128,18 @@ void lammps_get_os_info(char *buffer, int buf_size) /* ---------------------------------------------------------------------- */ /** This function is used to query whether LAMMPS was compiled with - * a real MPI library or in serial. + * a real MPI library or in serial. For the real MPI library it + * reports the size of the MPI communicator in bytes (4 or 8), + * which allows to check for compatibility with a hosting code. 
* - * \return 0 when compiled with MPI STUBS, otherwise 1 */ + * \return 0 when compiled with MPI STUBS, otherwise the MPI_Comm size in bytes */ int lammps_config_has_mpi_support() { #ifdef MPI_STUBS return 0; #else - return 1; + return sizeof(MPI_Comm); #endif } From 61585b1eb6a0e9c1124a2aa718cc970f23607c27 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 17 Feb 2021 15:02:02 -0500 Subject: [PATCH 044/116] make MPI STUBS a C++ library so its symbols won't collide when loading a real MPI library --- cmake/CMakeLists.txt | 3 +-- cmake/Modules/Packages/MESSAGE.cmake | 5 ++--- src/STUBS/Makefile | 6 +++--- src/STUBS/Makefile.mingw32-cross | 6 +++--- src/STUBS/Makefile.mingw64-cross | 6 +++--- src/STUBS/{mpi.c => mpi.cpp} | 0 src/STUBS/mpi.h | 15 ++++++++------- 7 files changed, 20 insertions(+), 21 deletions(-) rename src/STUBS/{mpi.c => mpi.cpp} (100%) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 2d259791f2..aefa9cd597 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -156,8 +156,7 @@ if(BUILD_MPI) endif() endif() else() - enable_language(C) - file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.c) + file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.cpp) add_library(mpi_stubs STATIC ${MPI_SOURCES}) set_target_properties(mpi_stubs PROPERTIES OUTPUT_NAME lammps_mpi_stubs${LAMMPS_MACHINE}) target_include_directories(mpi_stubs PUBLIC $) diff --git a/cmake/Modules/Packages/MESSAGE.cmake b/cmake/Modules/Packages/MESSAGE.cmake index fb62763828..6ff4e322aa 100644 --- a/cmake/Modules/Packages/MESSAGE.cmake +++ b/cmake/Modules/Packages/MESSAGE.cmake @@ -2,9 +2,8 @@ if(LAMMPS_SIZES STREQUAL BIGBIG) message(FATAL_ERROR "The MESSAGE Package is not compatible with -DLAMMPS_BIGBIG") endif() option(MESSAGE_ZMQ "Use ZeroMQ in MESSAGE package" OFF) -file(GLOB_RECURSE cslib_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.F - ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.c - ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.cpp) +file(GLOB_RECURSE cslib_SOURCES + ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.cpp) add_library(cslib STATIC ${cslib_SOURCES}) target_compile_definitions(cslib PRIVATE -DLAMMPS_${LAMMPS_SIZES}) diff --git a/src/STUBS/Makefile b/src/STUBS/Makefile index 3c3c3b46d9..c9b6fdb65a 100644 --- a/src/STUBS/Makefile +++ b/src/STUBS/Makefile @@ -11,13 +11,13 @@ SHELL = /bin/sh # Files -SRC = mpi.c +SRC = mpi.cpp INC = mpi.h # Definitions EXE = libmpi_stubs.a -OBJ = $(SRC:.c=.o) +OBJ = $(SRC:.cpp=.o) # System-specific settings @@ -36,7 +36,7 @@ clean: # Compilation rules -.c.o: +.cpp.o: $(CC) $(CCFLAGS) -c $< # Individual dependencies diff --git a/src/STUBS/Makefile.mingw32-cross b/src/STUBS/Makefile.mingw32-cross index 4144954ec7..2934bbd468 100644 --- a/src/STUBS/Makefile.mingw32-cross +++ b/src/STUBS/Makefile.mingw32-cross @@ -5,17 +5,17 @@ SHELL = /bin/sh # Files -SRC = mpi.c +SRC = mpi.cpp INC = mpi.h # Definitions EXE = libmpi_mingw32.a -OBJ = $(SRC:%.c=%_mingw32.o) +OBJ = $(SRC:%.cpp=%_mingw32.o) # System-specific settings -CC = i686-w64-mingw32-gcc +CC = i686-w64-mingw32-g++ CCFLAGS = -O2 -Wall -march=i686 -mtune=generic -mfpmath=387 -mpc64 -I. 
ARCHIVE = i686-w64-mingw32-ar ARCHFLAG = rs diff --git a/src/STUBS/Makefile.mingw64-cross b/src/STUBS/Makefile.mingw64-cross index 70b971f262..e62d5dcbe1 100644 --- a/src/STUBS/Makefile.mingw64-cross +++ b/src/STUBS/Makefile.mingw64-cross @@ -5,17 +5,17 @@ SHELL = /bin/sh # Files -SRC = mpi.c +SRC = mpi.cpp INC = mpi.h # Definitions EXE = libmpi_mingw64.a -OBJ = $(SRC:%.c=%_mingw64.o) +OBJ = $(SRC:%.cpp=%_mingw64.o) # System-specific settings -CC = x86_64-w64-mingw32-gcc +CC = x86_64-w64-mingw32-g++ CCFLAGS = -O2 -Wall -march=core2 -mtune=core2 -msse2 -mpc64 -I. ARCHIVE = x86_64-w64-mingw32-ar ARCHFLAG = rs diff --git a/src/STUBS/mpi.c b/src/STUBS/mpi.cpp similarity index 100% rename from src/STUBS/mpi.c rename to src/STUBS/mpi.cpp diff --git a/src/STUBS/mpi.h b/src/STUBS/mpi.h index 063dc542be..28e897960d 100644 --- a/src/STUBS/mpi.h +++ b/src/STUBS/mpi.h @@ -16,12 +16,17 @@ #include -/* use C bindings for MPI interface */ +/* We compile STUBS with C++ so the symbols embedded + * the serial shared library will not collide with any + * corresponding symbols from a real MPI library (which + * uses C bindings). As a consequence the header *must* + * enforce compiling with C++ only. */ -#ifdef __cplusplus -extern "C" { +#ifndef __cplusplus +#error "MPI STUBS must be compiled with a C++ compiler" #endif + /* Dummy defs for MPI stubs */ #define MPI_COMM_WORLD 0 @@ -176,8 +181,4 @@ int MPI_Alltoallv(void *sendbuf, int *sendcounts, int *sdispls, MPI_Datatype recvtype, MPI_Comm comm); /* ---------------------------------------------------------------------- */ -#ifdef __cplusplus -} -#endif - #endif From 1552b0d1d66f1a305456a5a920d1a52204898e2f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 17 Feb 2021 15:19:24 -0500 Subject: [PATCH 045/116] update/correct documentation for changes to the STUBS library and its implications --- doc/src/Build_basics.rst | 2 +- doc/src/Build_link.rst | 16 ++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/doc/src/Build_basics.rst b/doc/src/Build_basics.rst index cb6bd9f6aa..c7baa21e62 100644 --- a/doc/src/Build_basics.rst +++ b/doc/src/Build_basics.rst @@ -95,7 +95,7 @@ standard. A more detailed discussion of that is below. .. note:: - The file ``src/STUBS/mpi.c`` provides a CPU timer function + The file ``src/STUBS/mpi.cpp`` provides a CPU timer function called ``MPI_Wtime()`` that calls ``gettimeofday()``. If your operating system does not support ``gettimeofday()``, you will need to insert code to call another timer. Note that the diff --git a/doc/src/Build_link.rst b/doc/src/Build_link.rst index 3d66371304..5255620231 100644 --- a/doc/src/Build_link.rst +++ b/doc/src/Build_link.rst @@ -20,16 +20,8 @@ the suffix ``.so.0`` (or some other number). .. note:: Care should be taken to use the same MPI library for the calling code - and the LAMMPS library. The ``library.h`` file includes ``mpi.h`` - and uses definitions from it so those need to be available and - consistent. When LAMMPS is compiled with the included STUBS MPI - library, then its ``mpi.h`` file needs to be included. While it is - technically possible to use a full MPI library in the calling code - and link to a serial LAMMPS library compiled with MPI STUBS, it is - recommended to use the *same* MPI library for both, and then use - ``MPI_Comm_split()`` in the calling code to pass a suitable - communicator with a subset of MPI ranks to the function creating the - LAMMPS instance. 
+ and the LAMMPS library unless LAMMPS is to be compiled without (real) + MPI support using the include STUBS MPI library. Link with LAMMPS as a static library ------------------------------------ @@ -110,7 +102,7 @@ executable, that are also required to link the LAMMPS executable. .. code-block:: bash - gcc -c -O -I${HOME}/lammps/src/STUBS -I${HOME}/lammps/src -caller.c + gcc -c -O -I${HOME}/lammps/src -caller.c g++ -o caller caller.o -L${HOME}/lammps/lib/poems \ -L${HOME}/lammps/src/STUBS -L${HOME}/lammps/src \ -llammps_serial -lpoems -lmpi_stubs @@ -174,7 +166,7 @@ the POEMS package installed becomes: .. code-block:: bash - gcc -c -O -I${HOME}/lammps/src/STUBS -I${HOME}/lammps/src -caller.c + gcc -c -O -I${HOME}/lammps/src -caller.c g++ -o caller caller.o -L${HOME}/lammps/src -llammps_serial Locating liblammps.so at runtime From db841dd41278cd3665bc7c8b4fd7baec7db423d2 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 17 Feb 2021 15:28:44 -0500 Subject: [PATCH 046/116] correct return value when no packages are installed --- unittest/c-library/test_library_config.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unittest/c-library/test_library_config.cpp b/unittest/c-library/test_library_config.cpp index f196f800da..8a3d08cb33 100644 --- a/unittest/c-library/test_library_config.cpp +++ b/unittest/c-library/test_library_config.cpp @@ -74,7 +74,7 @@ TEST(LAMMPSConfig, package_name) EXPECT_EQ(lammps_config_package_name(numpkgs + 10, buf, 128), 0); EXPECT_THAT(buf, StrEq("")); } else { - EXPECT_EQ(lammps_config_package_name(0, buf, 128), 1); + EXPECT_EQ(lammps_config_package_name(0, buf, 128), 0); EXPECT_THAT(buf, StrEq("")); } }; From 0c348105181f85e823aee7cbf1e88ea1577ebb4b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 17 Feb 2021 15:29:33 -0500 Subject: [PATCH 047/116] lmp.mpi4py will always be false if LAMMPS has been compiled without MPI support --- unittest/python/python-open.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unittest/python/python-open.py b/unittest/python/python-open.py index 67500ea6fa..5140ce9185 100644 --- a/unittest/python/python-open.py +++ b/unittest/python/python-open.py @@ -37,7 +37,7 @@ class PythonOpen(unittest.TestCase): lmp=lammps(name=self.machine) self.assertIsNot(lmp.lmp,None) self.assertEqual(lmp.opened,1) - self.assertEqual(has_mpi4py,lmp.has_mpi4py) + self.assertEqual(has_mpi and has_mpi4py,lmp.has_mpi4py) self.assertEqual(has_mpi,lmp.has_mpi_support) lmp.close() self.assertIsNone(lmp.lmp,None) From 1f109b0db29339b05e3199bae3fce7d42acfeb0c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 17 Feb 2021 15:50:45 -0500 Subject: [PATCH 048/116] update unittest for lammps_config_has_mpi() change --- unittest/c-library/test_library_config.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unittest/c-library/test_library_config.cpp b/unittest/c-library/test_library_config.cpp index 8a3d08cb33..e5eb044d31 100644 --- a/unittest/c-library/test_library_config.cpp +++ b/unittest/c-library/test_library_config.cpp @@ -200,7 +200,10 @@ TEST(LAMMPSConfig, exceptions) TEST(LAMMPSConfig, mpi_support) { - EXPECT_EQ(lammps_config_has_mpi_support(), LAMMPS_HAS_MPI); + if (LAMMPS_HAS_MPI) + EXPECT_GT(lammps_config_has_mpi_support(), 0); + else + EXPECT_EQ(lammps_config_has_mpi_support(), 0); }; TEST(LAMMPSConfig, png_support) From 1e5a73c468cd274e734adad25e36652b7de57edd Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 17 Feb 2021 17:36:35 -0500 Subject: [PATCH 049/116] silence 
warnings when using default OpenCL headers. Pick OpenCL v2.1 as default. --- lib/gpu/geryon/ocl_device.h | 4 ++++ lib/gpu/geryon/ocl_macros.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index b0a3e3d583..435ee24dd3 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -28,6 +28,10 @@ #include #include +#ifndef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 210 +#endif + #ifdef __APPLE__ #include #include diff --git a/lib/gpu/geryon/ocl_macros.h b/lib/gpu/geryon/ocl_macros.h index 5fb7665817..0e9ce78389 100644 --- a/lib/gpu/geryon/ocl_macros.h +++ b/lib/gpu/geryon/ocl_macros.h @@ -4,6 +4,10 @@ #include #include +#ifndef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 210 +#endif + #ifdef __APPLE__ #include #else From e575c5fa29f79bf5336bacc7cb0b86dd36b2a86d Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 17 Feb 2021 18:28:37 -0500 Subject: [PATCH 050/116] -fopenmp implies -fopenmp-simd --- lib/gpu/Makefile.cuda_mps | 2 +- lib/gpu/Makefile.hip | 2 +- lib/gpu/Makefile.linux_opencl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/gpu/Makefile.cuda_mps b/lib/gpu/Makefile.cuda_mps index baffe99b47..21aac89151 100644 --- a/lib/gpu/Makefile.cuda_mps +++ b/lib/gpu/Makefile.cuda_mps @@ -51,7 +51,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c # host code compiler and settings -CUDR_CPP = mpicxx -fopenmp -fopenmp-simd -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC +CUDR_CPP = mpicxx -fopenmp -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC CUDR_OPTS = -O2 $(LMP_INC) CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \ $(CUDPP_OPT) diff --git a/lib/gpu/Makefile.hip b/lib/gpu/Makefile.hip index c34823d471..dbdef433ec 100644 --- a/lib/gpu/Makefile.hip +++ b/lib/gpu/Makefile.hip @@ -17,7 +17,7 @@ LMP_INC = -DLAMMPS_SMALLBIG HIP_PRECISION = -D_SINGLE_DOUBLE HIP_OPTS = -O3 -HIP_HOST_OPTS = -Wno-deprecated-declarations -fopenmp -fopenmp-sim +HIP_HOST_OPTS = -Wno-deprecated-declarations -fopenmp HIP_HOST_INCLUDE = # use device sort diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index c20e26b1f3..43d012dc4a 100644 --- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -15,7 +15,7 @@ OCL_INC = OCL_CPP = mpic++ -std=c++11 -O3 -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_DOUBLE -OCL_TUNE = -fopenmp -fopenmp-simd -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT +OCL_TUNE = -fopenmp -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT BIN_DIR = ./ OBJ_DIR = ./ From f367e66abafe2cd4bd7bc4d63e25118259612419 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 17 Feb 2021 18:47:35 -0500 Subject: [PATCH 051/116] documentation corrections, spelling fixes and updates --- doc/src/Speed_gpu.rst | 42 +++++++++++++-------- doc/src/package.rst | 20 +++++----- doc/utils/sphinx-config/false_positives.txt | 2 + lib/gpu/README | 4 +- 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/doc/src/Speed_gpu.rst b/doc/src/Speed_gpu.rst index 655f2e1958..709a3ad3bb 100644 --- a/doc/src/Speed_gpu.rst +++ b/doc/src/Speed_gpu.rst @@ -1,11 +1,14 @@ GPU package =========== -The GPU package was developed by Mike Brown while at SNL and ORNL -and his collaborators, particularly Trung Nguyen (now at Northwestern). 
-It provides GPU versions of many pair styles and for parts of the -:doc:`kspace_style pppm ` for long-range Coulombics. -It has the following general features: +The GPU package was developed by Mike Brown while at SNL and ORNL (now +at Intel Corp.) and his collaborators, particularly Trung Nguyen (now at +Northwestern). Support for AMD GPUs via HIP was added by Vsevolod Nikolskiy +and coworkers at HSE University. + +The GPU package provides GPU versions of many pair styles and for +parts of the :doc:`kspace_style pppm ` for long-range +Coulombics. It has the following general features: * It is designed to exploit common GPU hardware configurations where one or more GPUs are coupled to many cores of one or more multi-core CPUs, @@ -24,8 +27,9 @@ It has the following general features: force vectors. * LAMMPS-specific code is in the GPU package. It makes calls to a generic GPU library in the lib/gpu directory. This library provides - NVIDIA support as well as more general OpenCL support, so that the - same functionality is supported on a variety of hardware. + either Nvidia support, AMD support, or more general OpenCL support + (for Nvidia GPUs, AMD GPUs, Intel GPUs, and multi-core CPUs). + so that the same functionality is supported on a variety of hardware. **Required hardware/software:** @@ -89,10 +93,10 @@ shared by 4 MPI tasks. The GPU package also has limited support for OpenMP for both multi-threading and vectorization of routines that are run on the CPUs. This requires that the GPU library and LAMMPS are built with flags to -enable OpenMP support (e.g. -fopenmp -fopenmp-simd). Some styles for -time integration are also available in the GPU package. These run -completely on the CPUs in full double precision, but exploit -multi-threading and vectorization for faster performance. +enable OpenMP support (e.g. -fopenmp). Some styles for time integration +are also available in the GPU package. These run completely on the CPUs +in full double precision, but exploit multi-threading and vectorization +for faster performance. Use the "-sf gpu" :doc:`command-line switch `, which will automatically append "gpu" to styles that support it. Use the "-pk @@ -159,11 +163,11 @@ Likewise, you should experiment with the precision setting for the GPU library to see if single or mixed precision will give accurate results, since they will typically be faster. -MPI parallelism typically outperforms OpenMP parallelism, but in same cases -using fewer MPI tasks and multiple OpenMP threads with the GPU package -can give better performance. 3-body potentials can often perform better -with multiple OMP threads because the inter-process communication is -higher for these styles with the GPU package in order to allow +MPI parallelism typically outperforms OpenMP parallelism, but in some +cases using fewer MPI tasks and multiple OpenMP threads with the GPU +package can give better performance. 3-body potentials can often perform +better with multiple OMP threads because the inter-process communication +is higher for these styles with the GPU package in order to allow deterministic results. **Guidelines for best performance:** @@ -189,6 +193,12 @@ deterministic results. :doc:`angle `, :doc:`dihedral `, :doc:`improper `, and :doc:`long-range ` calculations will not be included in the "Pair" time. +* Since only part of the pppm kspace style is GPU accelerated, it + may be faster to only use GPU acceleration for Pair styles with + long-range electrostatics. 
See the "pair/only" keyword of the + package command for a shortcut to do that. The work between kspace + on the CPU and non-bonded interactions on the GPU can be balanced + through adjusting the coulomb cutoff without loss of accuracy. * When the *mode* setting for the package gpu command is force/neigh, the time for neighbor list calculations on the GPU will be added into the "Pair" time, not the "Neigh" time. An additional breakdown of the diff --git a/doc/src/package.rst b/doc/src/package.rst index a091759214..aea4ba657f 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -175,7 +175,7 @@ package. The *Ngpu* argument sets the number of GPUs per node. If *Ngpu* is 0 and no other keywords are specified, GPU or accelerator devices are -autoselected. In this process, all platforms are searched for +auto-selected. In this process, all platforms are searched for accelerator devices and GPUs are chosen if available. The device with the highest number of compute cores is selected. The number of devices is increased to be the number of matching accelerators with the same @@ -257,7 +257,8 @@ the other particles. The *gpuID* keyword is used to specify the first ID for the GPU or other accelerator that LAMMPS will use. For example, if the ID is 1 and *Ngpu* is 3, GPUs 1-3 will be used. Device IDs should be -determined from the output of nvc_get_devices or ocl_get_devices +determined from the output of nvc_get_devices, ocl_get_devices, +or hip_get_devices as provided in the lib/gpu directory. When using OpenCL with accelerators that have main memory NUMA, the accelerators can be split into smaller virtual accelerators for more efficient use @@ -306,13 +307,14 @@ PPPM_MAX_SPLINE. CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX (NVIDIA) or OpenCL extensions (Intel) should be used for horizontal -vector operataions. FAST_MATH in {0,1} indicates that OpenCL fast math -optimizations are used during the build and HW-accelerated -transcendentals are used when available. THREADS_PER_* give the default -*tpa* values for ellipsoidal models, styles using charge, and any other -styles. The BLOCK_* parameters specify the block sizes for various -kernal calls and the MAX_*SHARED*_ parameters are used to determine the -amount of local shared memory to use for storing model parameters. +vector operations. FAST_MATH in {0,1} indicates that OpenCL fast math +optimizations are used during the build and hardware-accelerated +transcendental functions are used when available. THREADS_PER_* give the +default *tpa* values for ellipsoidal models, styles using charge, and +any other styles. The BLOCK_* parameters specify the block sizes for +various kernel calls and the MAX_*SHARED*_ parameters are used to +determine the amount of local shared memory to use for storing model +parameters. For OpenCL, the routines are compiled at runtime for the specified GPU or accelerator architecture. The *ocl_args* keyword can be used to diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index 9937a98850..982e1fde2a 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -2297,6 +2297,7 @@ omegaz Omelyan omp OMP +oneAPI onelevel oneway onn @@ -2528,6 +2529,7 @@ ptm PTM ptol ptr +PTX pu purdue Purohit diff --git a/lib/gpu/README b/lib/gpu/README index 28655836f4..dfffe11b81 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -45,8 +45,10 @@ efficient use with MPI. 
After building the GPU library, for OpenCL: ./ocl_get_devices -and for CUDA +for CUDA: ./nvc_get_devices +and for ROCm HIP: + ./hip_get_devices ------------------------------------------------------------------------------ QUICK START From 45f6e9ec2ef4a9af5278a2537cf123ac330ba9d7 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 17 Feb 2021 18:47:41 -0500 Subject: [PATCH 052/116] whitespace --- src/atom.cpp | 10 +++++----- src/reset_atom_ids.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/atom.cpp b/src/atom.cpp index e7b1df8240..75b1b07fbf 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2153,7 +2153,7 @@ void Atom::setup_sort_bins() bininvy = nbiny / (bboxhi[1]-bboxlo[1]); bininvz = nbinz / (bboxhi[2]-bboxlo[2]); - #ifdef LMP_USER_INTEL +#ifdef LMP_USER_INTEL int intel_neigh = 0; if (neighbor->nrequest) { if (neighbor->requests[0]->intel) intel_neigh = 1; @@ -2198,9 +2198,9 @@ void Atom::setup_sort_bins() bboxhi[1] = bboxlo[1] + static_cast(nbiny) / bininvy; bboxhi[2] = bboxlo[2] + static_cast(nbinz) / bininvz; } - #endif +#endif - #ifdef LMP_GPU +#ifdef LMP_GPU if (userbinsize == 0.0) { int ifix = modify->find_fix("package_gpu"); if (ifix >= 0) { @@ -2212,7 +2212,7 @@ void Atom::setup_sort_bins() binsize = fix->binsize(subx, suby, subz, atom->nlocal, neighbor->cutneighmax); bininv = 1.0 / binsize; - + nbinx = static_cast (ceil(subx * bininv)); nbiny = static_cast (ceil(suby * bininv)); nbinz = static_cast (ceil(subz * bininv)); @@ -2227,7 +2227,7 @@ void Atom::setup_sort_bins() bininvz = bininv; } } - #endif +#endif if (1.0*nbinx*nbiny*nbinz > INT_MAX) error->one(FLERR,"Too many atom sorting bins"); diff --git a/src/reset_atom_ids.h b/src/reset_atom_ids.h index 7c5c53e2ba..02a7f77e8d 100644 --- a/src/reset_atom_ids.h +++ b/src/reset_atom_ids.h @@ -37,7 +37,7 @@ class ResetIDs : protected Pointers { int ilocal; }; - #if defined(LMP_QSORT) +#if defined(LMP_QSORT) // static variable across all ResetID objects, for qsort callback static AtomRvous *sortrvous; #endif From 45c782308c0937260049db940c121934fa0b2ffc Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Thu, 18 Feb 2021 21:08:18 -0800 Subject: [PATCH 053/116] Fixing issue from recent GPU package update with OMP_NUM_THREADS env being overridden in GPU library. Fixing race condition with OpenMP for GPU styles using torque (missed in regression tests due to the first fix) Documenting GPU package option for setting the number of threads (consistent with USER-INTEL and USER-OMP). 
--- doc/src/package.rst | 85 +++++++++++++++------------------- lib/gpu/lal_answer.cpp | 8 ++-- lib/gpu/lal_base_ellipsoid.cpp | 2 +- lib/gpu/lal_device.cpp | 28 +++++------ lib/gpu/lal_device.h | 12 ++--- src/GPU/fix_gpu.cpp | 27 +++++++---- 6 files changed, 77 insertions(+), 85 deletions(-) diff --git a/doc/src/package.rst b/doc/src/package.rst index aea4ba657f..842fc8bc1c 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -32,10 +32,12 @@ Syntax size = bin size for neighbor list construction (distance units) *split* = fraction fraction = fraction of atoms assigned to GPU (default = 1.0) - *tpa* value = Nthreads - Nthreads = # of GPU vector lanes used per atom + *tpa* value = Nlanes + Nlanes = # of GPU vector lanes (CUDA threads) used per atom *blocksize* value = size size = thread block size for pair force computation + *omp* value = Nthreads + Nthreads = number of OpenMP threads to use on CPU (default = 0) *platform* value = id id = For OpenCL, platform ID for the GPU or accelerator *gpuID* values = id @@ -101,7 +103,7 @@ Syntax off = use device acceleration (e.g. GPU) for all available styles in the KOKKOS package (default) on = use device acceleration only for pair styles (and host acceleration for others) *omp* args = Nthreads keyword value ... - Nthread = # of OpenMP threads to associate with each MPI process + Nthreads = # of OpenMP threads to associate with each MPI process zero or more keyword/value pairs may be appended keywords = *neigh* *neigh* value = *yes* or *no* @@ -116,7 +118,7 @@ Examples package gpu 0 package gpu 1 split 0.75 package gpu 2 split -1.0 - package gpu 0 device_type intelgpu + package gpu 0 omp 2 device_type intelgpu package kokkos neigh half comm device package omp 0 neigh no package omp 4 @@ -266,10 +268,10 @@ with MPI. The *tpa* keyword sets the number of GPU vector lanes per atom used to perform force calculations. With a default value of 1, the number of -threads will be chosen based on the pair style, however, the value can +lanes will be chosen based on the pair style, however, the value can be set explicitly with this keyword to fine-tune performance. For large cutoffs or with a small number of particles per GPU, increasing -the value can improve performance. The number of threads per atom must +the value can improve performance. The number of lanes per atom must be a power of 2 and currently cannot be greater than the SIMD width for the GPU / accelerator. In the case it exceeds the SIMD width, it will automatically be decreased to meet the restriction. @@ -282,6 +284,14 @@ individual GPU cores, but reduces the total number of thread blocks, thus may lead to load imbalance. On modern hardware, the sensitivity to the blocksize is typically low. +The *Nthreads* value for the *omp* keyword sets the number of OpenMP +threads allocated for each MPI task. This setting controls OpenMP +parallelism only for routines run on the CPUs. For more details on +setting the number of OpenMP threads, see the discussion of the +*Nthreads* setting on this doc page for the "package omp" command. +The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL, +and GPU packages. + The *platform* keyword is only used with OpenCL to specify the ID for an OpenCL platform. See the output from ocl_get_devices in the lib/gpu directory. In LAMMPS only one platform can be active at a time and by @@ -336,44 +346,13 @@ built with co-processor support. Optional keyword/value pairs can also be specified. Each has a default value as listed below. 
-The *omp* keyword determines the number of OpenMP threads allocated -for each MPI task when any portion of the interactions computed by a -USER-INTEL pair style are run on the CPU. This can be the case even -if LAMMPS was built with co-processor support; see the *balance* -keyword discussion below. If you are running with less MPI tasks/node -than there are CPUs, it can be advantageous to use OpenMP threading on -the CPUs. - -.. note:: - - The *omp* keyword has nothing to do with co-processor threads on - the Xeon Phi; see the *tpc* and *tptask* keywords below for a - discussion of co-processor threads. - -The *Nthread* value for the *omp* keyword sets the number of OpenMP -threads allocated for each MPI task. Setting *Nthread* = 0 (the -default) instructs LAMMPS to use whatever value is the default for the -given OpenMP environment. This is usually determined via the -*OMP_NUM_THREADS* environment variable or the compiler runtime, which -is usually a value of 1. - -For more details, including examples of how to set the OMP_NUM_THREADS -environment variable, see the discussion of the *Nthreads* setting on -this doc page for the "package omp" command. Nthreads is a required -argument for the USER-OMP package. Its meaning is exactly the same -for the USER-INTEL package. - -.. note:: - - If you build LAMMPS with both the USER-INTEL and USER-OMP - packages, be aware that both packages allow setting of the *Nthreads* - value via their package commands, but there is only a single global - *Nthreads* value used by OpenMP. Thus if both package commands are - invoked, you should insure the two values are consistent. If they are - not, the last one invoked will take precedence, for both packages. - Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel" - command, followed by a "package omp" command, both with a setting of - *Nthreads* = 0. +The *Nthreads* value for the *omp* keyword sets the number of OpenMP +threads allocated for each MPI task. This setting controls OpenMP +parallelism only for routines run on the CPUs. For more details on +setting the number of OpenMP threads, see the discussion of the +*Nthreads* setting on this doc page for the "package omp" command. +The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL, +and GPU packages. The *mode* keyword determines the precision mode to use for computing pair style forces, either on the CPU or on the co-processor, @@ -579,7 +558,7 @@ result in better performance for certain configurations and system sizes. The *omp* style invokes settings associated with the use of the USER-OMP package. -The *Nthread* argument sets the number of OpenMP threads allocated for +The *Nthreads* argument sets the number of OpenMP threads allocated for each MPI task. For example, if your system has nodes with dual quad-core processors, it has a total of 8 cores per node. You could use two MPI tasks per node (e.g. using the -ppn option of the mpirun @@ -588,7 +567,7 @@ This would use all 8 cores on each node. Note that the product of MPI tasks \* threads/task should not exceed the physical number of cores (on a node), otherwise performance will suffer. -Setting *Nthread* = 0 instructs LAMMPS to use whatever value is the +Setting *Nthreads* = 0 instructs LAMMPS to use whatever value is the default for the given OpenMP environment. This is usually determined via the *OMP_NUM_THREADS* environment variable or the compiler runtime. 
Note that in most cases the default for OpenMP capable @@ -619,6 +598,18 @@ input. Not all features of LAMMPS support OpenMP threading via the USER-OMP package and the parallel efficiency can be very different, too. +.. note:: + + If you build LAMMPS with the GPU, USER-INTEL, and / or USER-OMP + packages, be aware these packages all allow setting of the *Nthreads* + value via their package commands, but there is only a single global + *Nthreads* value used by OpenMP. Thus if multiple package commands are + invoked, you should insure the values are consistent. If they are + not, the last one invoked will take precedence, for all packages. + Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel" command, followed by a + "package omp" command, both with a setting of *Nthreads* = 0. Likewise + for a hybrid suffix for gpu and omp. + Optional keyword/value pairs can also be specified. Each has a default value as listed below. @@ -665,7 +656,7 @@ Default For the GPU package, the default is Ngpu = 0 and the option defaults are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0 -to Ngpu-1, tpa = 1, and platform=-1. These settings are made +to Ngpu-1, tpa = 1, omp = 0, and platform=-1. These settings are made automatically if the "-sf gpu" :doc:`command-line switch ` is used. If it is not used, you must invoke the package gpu command in your input script or via the "-pk gpu" :doc:`command-line switch `. diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index e2478a64e5..4a68466d05 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -331,11 +331,11 @@ void AnswerT::get_answers(double **f, double **tor) { } if (_rot) { vec3d *torp=reinterpret_cast(&(tor[0][0])); - forcep=reinterpret_cast(&(force[_inum*4])); + vec4d_t *torquep=reinterpret_cast(&(force[_inum*4])); for (int i=ifrom; i0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Lanes / atom: %d.\n",_threads_per_atom); fprintf(screen,"Vector width: %d.\n", device->simd_size()); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); if (nbor->gpu_nbor()==2) diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 5ba9185e6f..a65c3d8810 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -53,14 +53,10 @@ DeviceT::~Device() { template int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, const int first_gpu_id, const int gpu_mode, - const double p_split, const int nthreads, - const int t_per_atom, const double user_cell_size, - char *ocl_args, const int ocl_platform, - char *device_type_flags, const int block_pair) { - _nthreads=nthreads; - #if (LAL_USE_OMP == 1) - omp_set_num_threads(nthreads); - #endif + const double p_split, const int t_per_atom, + const double user_cell_size, char *ocl_args, + const int ocl_platform, char *device_type_flags, + const int block_pair) { _threads_per_atom=t_per_atom; _threads_per_charge=t_per_atom; _threads_per_three=t_per_atom; @@ -583,7 +579,7 @@ void DeviceT::init_message(FILE *screen, const char *name, fprintf(screen,"- Using acceleration for %s:\n",name); fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); #if (LAL_USE_OMP == 1) - fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); + fprintf(screen,"- with %d thread(s) per proc.\n", omp_get_max_threads()); #endif #ifdef USE_OPENCL fprintf(screen,"- with 
OpenCL Parameters for: %s (%d)\n", @@ -803,7 +799,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, if (times[5]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); + fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); fprintf(screen,"Vector width: %d.\n", simd_size()); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); if (nbor.gpu_nbor()==2) @@ -1031,13 +1027,13 @@ Device global_device; using namespace LAMMPS_AL; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, const int first_gpu_id, const int gpu_mode, - const double particle_split, const int nthreads, - const int t_per_atom, const double user_cell_size, - char *opencl_config, const int ocl_platform, - char *device_type_flags, const int block_pair) { + const double particle_split, const int t_per_atom, + const double user_cell_size, char *opencl_config, + const int ocl_platform, char *device_type_flags, + const int block_pair) { return global_device.init_device(world,replica,ngpu,first_gpu_id,gpu_mode, - particle_split,nthreads,t_per_atom, - user_cell_size,opencl_config,ocl_platform, + particle_split,t_per_atom,user_cell_size, + opencl_config,ocl_platform, device_type_flags,block_pair); } diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index bd5b81558c..1db6ae3127 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -49,10 +49,10 @@ class Device { * - -11 if config_string has the wrong number of parameters **/ int init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, const int first_gpu_id, const int gpu_mode, - const double particle_split, const int nthreads, - const int t_per_atom, const double user_cell_size, - char *config_string, const int ocl_platform, - char *device_type_flags, const int block_pair); + const double particle_split, const int t_per_atom, + const double user_cell_size, char *config_string, + const int ocl_platform, char *device_type_flags, + const int block_pair); /// Initialize the device for Atom storage /** \param charge True if charges need to be stored @@ -201,8 +201,6 @@ class Device { /// Return the number of procs sharing a device (size of device communicator) inline int procs_per_gpu() const { return _procs_per_gpu; } - /// Return the number of threads per proc - inline int num_threads() const { return _nthreads; } /// My rank within all processes inline int world_me() const { return _world_me; } /// Total number of processes @@ -331,7 +329,7 @@ class Device { MPI_Comm _comm_world, _comm_replica, _comm_gpu; int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, _replica_size; - int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads; + int _gpu_mode, _first_device, _last_device, _platform_id; double _particle_split; double _cpu_full; double _ptx_arch; diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index efbaa6e1f8..8297c338a5 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -32,16 +32,18 @@ #include "citeme.h" #include "error.h" +#if (LAL_USE_OMP == 1) +#include +#endif using namespace LAMMPS_NS; using namespace FixConst; enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH}; -extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, - const int ngpu, const int first_gpu_id, - const int gpu_mode, const double particle_split, - const int nthreads, const int t_per_atom, +extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, + const int first_gpu_id, 
const int gpu_mode, + const double particle_split, const int t_per_atom, const double cell_size, char *opencl_args, const int ocl_platform, char *device_type_flags, const int block_pair); @@ -123,7 +125,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : _gpu_mode = GPU_NEIGH; _particle_split = 1.0; - int nthreads = 1; + int nthreads = 0; int newtonflag = 0; int threads_per_atom = -1; double binsize = 0.0; @@ -167,10 +169,10 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); threads_per_atom = utils::inumeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; - } else if (strcmp(arg[iarg],"nthreads") == 0) { + } else if (strcmp(arg[iarg],"omp") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); nthreads = utils::inumeric(FLERR,arg[iarg+1],false,lmp); - if (nthreads < 1) error->all(FLERR,"Illegal fix GPU command"); + if (nthreads < 0) error->all(FLERR,"Illegal fix GPU command"); iarg += 2; } else if (strcmp(arg[iarg],"platform") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); @@ -200,6 +202,11 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : #if (LAL_USE_OMP == 0) if (nthreads > 1) error->all(FLERR,"No OpenMP support compiled in"); + #else + if (nthreads > 0) { + omp_set_num_threads(nthreads); + comm->nthreads = nthreads; + } #endif // set newton pair flag @@ -227,9 +234,9 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (binsize == 0.0) binsize = -1.0; _binsize = binsize; int gpu_flag = lmp_init_device(universe->uworld, world, ngpu, first_gpu_id, - _gpu_mode, _particle_split, nthreads, - threads_per_atom, binsize, opencl_args, - ocl_platform, device_type_flags, block_pair); + _gpu_mode, _particle_split, threads_per_atom, + binsize, opencl_args, ocl_platform, + device_type_flags, block_pair); GPU_EXTRA::check_flag(gpu_flag,error,world); } From ab9552b63a8d6a35d2c80bbf243b91971511cbbc Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Fri, 19 Feb 2021 05:55:37 -0800 Subject: [PATCH 054/116] Adding some notes about KOKKOS thread settings to the package doc. --- doc/src/package.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/src/package.rst b/doc/src/package.rst index 842fc8bc1c..1613ff2fae 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -608,7 +608,13 @@ too. not, the last one invoked will take precedence, for all packages. Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel" command, followed by a "package omp" command, both with a setting of *Nthreads* = 0. Likewise - for a hybrid suffix for gpu and omp. + for a hybrid suffix for gpu and omp. Note that KOKKOS also supports + setting the number of OpenMP threads from the command line using the + "-k on" :doc:`command-line switch `. The default for + KOKKOS is 1 thread per MPI task, so any other number of threads should + be explicitly set using the "-k on" command-line switch (and this + setting should be consistent with settings from any other packages + used). Optional keyword/value pairs can also be specified. Each has a default value as listed below. From a40db8ddf1a468415c372a04e16fc92d87534a0a Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Fri, 19 Feb 2021 05:59:25 -0800 Subject: [PATCH 055/116] Fix for hybrid pair style with certain combinations of USER-INTEL styles. 
Specifically, fixes issue where memory was not zeroed correctly with a hybrid pair style including an intel variant and a non-intel variant combined with intel variant(s) of non-pair styles. --- src/USER-INTEL/fix_intel.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 31bd63160f..6c7e108ca6 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -318,8 +318,7 @@ void FixIntel::init() _zero_master = 0; if (_pair_hybrid_flag && _hybrid_nonpair) - if (_pair_hybrid_flag > 1 || force->newton_pair == 0) - _pair_hybrid_zero = 1; + _pair_hybrid_zero = 1; _hybrid_nonpair = 0; _pair_intel_count = 0; From fd67f83bb7595db7ecff20fc9fc1fef4fe69c364 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Feb 2021 10:27:31 -0500 Subject: [PATCH 056/116] replace atoi() with utils::inumeric() --- src/fix_addforce.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fix_addforce.cpp b/src/fix_addforce.cpp index a06544e268..07031a40a4 100644 --- a/src/fix_addforce.cpp +++ b/src/fix_addforce.cpp @@ -83,7 +83,7 @@ FixAddForce::FixAddForce(LAMMPS *lmp, int narg, char **arg) : while (iarg < narg) { if (strcmp(arg[iarg],"every") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal fix addforce command"); - nevery = atoi(arg[iarg+1]); + nevery = utils::inumeric(FLERR,arg[iarg+1],false,lmp); if (nevery <= 0) error->all(FLERR,"Illegal fix addforce command"); iarg += 2; } else if (strcmp(arg[iarg],"region") == 0) { From d36df19a2d798837a8a630d72b4dda9727e88eb7 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Fri, 19 Feb 2021 13:22:35 -0500 Subject: [PATCH 057/116] Use mallinfo2 with glibc >= 2.33 --- src/info.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/info.cpp b/src/info.cpp index bf6f14a48a..f1dc96645b 100644 --- a/src/info.cpp +++ b/src/info.cpp @@ -1449,8 +1449,13 @@ void Info::get_memory_info(double *meminfo) meminfo[2] = (double)pmc.PeakWorkingSetSize/1048576.0; #else #if defined(__linux__) +#if defined(__GLIBC__) && __GLIBC_PREREQ(2, 33) + struct mallinfo2 mi; + mi = mallinfo2(); +#else struct mallinfo mi; mi = mallinfo(); +#endif meminfo[1] = (double)mi.uordblks/1048576.0+(double)mi.hblkhd/1048576.0; #endif struct rusage ru; From 99ff0bb4d25f17a6a18c038b8489e485272e71d8 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Feb 2021 16:57:49 -0500 Subject: [PATCH 058/116] fix cut-n-paste bug --- src/citeme.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/citeme.cpp b/src/citeme.cpp index fdd1ee867d..41ac87f5bb 100644 --- a/src/citeme.cpp +++ b/src/citeme.cpp @@ -118,7 +118,7 @@ void CiteMe::flush() if (!citefile.empty()) logbuffer += fmt::format(cite_file,"file",citefile); if (screen_flag == VERBOSE) - scrbuffer += fmt::format(cite_file,"screen","output"); + logbuffer += fmt::format(cite_file,"screen","output"); logbuffer += cite_separator; if (logfile) fputs(logbuffer.c_str(),logfile); logbuffer.clear(); From 0a355c019451c641d2b22c9aa47cba63aa22d40f Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 19 Feb 2021 15:20:09 -0700 Subject: [PATCH 059/116] Fix bug in dump image ssao depth shading --- src/image.cpp | 218 +++++++++++++++++++++++++++++--------------------- src/image.h | 4 + 2 files changed, 131 insertions(+), 91 deletions(-) diff --git a/src/image.cpp b/src/image.cpp index 4b181ee8b0..0acef0bceb 100644 --- a/src/image.cpp +++ b/src/image.cpp @@ -113,6 +113,11 @@ Image::Image(LAMMPS *lmp, int nmap_caller) : 
Pointers(lmp) backLightColor[2] = 0.9; random = nullptr; + + // MPI_Gatherv vectors + + recvcounts = nullptr; + displs = nullptr; } /* ---------------------------------------------------------------------- */ @@ -134,6 +139,9 @@ Image::~Image() memory->destroy(rgbcopy); if (random) delete random; + + memory->destroy(recvcounts); + memory->destroy(displs); } /* ---------------------------------------------------------------------- @@ -334,16 +342,37 @@ void Image::merge() // extra SSAO enhancement // bcast full image to all procs // each works on subset of pixels - // gather result back to proc 0 + // MPI_Gather() result back to proc 0 + // use Gatherv() if subset of pixels is not the same size on every proc if (ssao) { MPI_Bcast(imageBuffer,npixels*3,MPI_BYTE,0,world); MPI_Bcast(surfaceBuffer,npixels*2,MPI_DOUBLE,0,world); MPI_Bcast(depthBuffer,npixels,MPI_DOUBLE,0,world); compute_SSAO(); - int pixelPart = height/nprocs * width*3; - MPI_Gather(imageBuffer+me*pixelPart,pixelPart,MPI_BYTE, - rgbcopy,pixelPart,MPI_BYTE,0,world); + + int pixelstart = 3 * static_cast (1.0*me/nprocs * npixels); + int pixelstop = 3 * static_cast (1.0*(me+1)/nprocs * npixels); + int mypixels = pixelstop - pixelstart; + + if (npixels % nprocs == 0) { + MPI_Gather(imageBuffer+pixelstart,mypixels,MPI_BYTE, + rgbcopy,mypixels,MPI_BYTE,0,world); + + } else { + if (recvcounts == nullptr) { + memory->create(recvcounts,nprocs,"image:recvcounts"); + memory->create(displs,nprocs,"image:displs"); + MPI_Allgather(&mypixels,1,MPI_INT,recvcounts,1,MPI_INT,world); + displs[0] = 0; + for (int i = 1; i < nprocs; i++) + displs[i] = displs[i-1] + recvcounts[i-1]; + } + + MPI_Gatherv(imageBuffer+pixelstart,mypixels,MPI_BYTE, + rgbcopy,recvcounts,displs,MPI_BYTE,0,world); + } + writeBuffer = rgbcopy; } else { writeBuffer = imageBuffer; @@ -880,110 +909,117 @@ void Image::compute_SSAO() -tanPerPixel / zoom; int pixelRadius = (int) trunc (SSAORadius / pixelWidth + 0.5); - int x,y,s; - int hPart = height / nprocs; - int index = me * hPart * width; - for (y = me * hPart; y < (me + 1) * hPart; y ++) { - for (x = 0; x < width; x ++, index ++) { - double cdepth = depthBuffer[index]; - if (cdepth < 0) { continue; } + // each proc is assigned a subset of contiguous pixels from the full image + // pixels are contiguous in x (columns within a row), then by row + // index = pixels from 0 to npixel-1 + // x = column # from 0 to width-1 + // y = row # from 0 to height-1 - double sx = surfaceBuffer[index * 2 + 0]; - double sy = surfaceBuffer[index * 2 + 1]; - double sin_t = -sqrt(sx*sx + sy*sy); + int pixelstart = static_cast (1.0*me/nprocs * npixels); + int pixelstop = static_cast (1.0*(me+1)/nprocs * npixels); - double mytheta = random->uniform() * SSAOJitter; - double ao = 0.0; + for (int index = pixelstart; index < pixelstop; index++) { + int x = index % width; + int y = index / width; - for (s = 0; s < SSAOSamples; s ++) { - double hx = cos(mytheta); - double hy = sin(mytheta); - mytheta += delTheta; + double cdepth = depthBuffer[index]; + if (cdepth < 0) { continue; } - // multiply by z cross surface tangent - // so that dot (aka cos) works here + double sx = surfaceBuffer[index * 2 + 0]; + double sy = surfaceBuffer[index * 2 + 1]; + double sin_t = -sqrt(sx*sx + sy*sy); - double scaled_sin_t = sin_t * (hx*sy + hy*sx); + double mytheta = random->uniform() * SSAOJitter; + double ao = 0.0; - // Bresenham's line algorithm to march over depthBuffer + for (int s = 0; s < SSAOSamples; s ++) { + double hx = cos(mytheta); + double hy = sin(mytheta); + 
mytheta += delTheta; - int dx = static_cast (hx * pixelRadius); - int dy = static_cast (hy * pixelRadius); - int ex = x + dx; - if (ex < 0) { ex = 0; } if (ex >= width) { ex = width - 1; } - int ey = y + dy; - if (ey < 0) { ey = 0; } if (ey >= height) { ey = height - 1; } - double delta; - int small, large; - double lenIncr; - if (fabs(hx) > fabs(hy)) { - small = (hx > 0) ? 1 : -1; - large = (hy > 0) ? width : -width; - delta = fabs(hy / hx); - } else { - small = (hy > 0) ? width : -width; - large = (hx > 0) ? 1 : -1; - delta = fabs(hx / hy); + // multiply by z cross surface tangent + // so that dot (aka cos) works here + + double scaled_sin_t = sin_t * (hx*sy + hy*sx); + + // Bresenham's line algorithm to march over depthBuffer + + int dx = static_cast (hx * pixelRadius); + int dy = static_cast (hy * pixelRadius); + int ex = x + dx; + if (ex < 0) { ex = 0; } if (ex >= width) { ex = width - 1; } + int ey = y + dy; + if (ey < 0) { ey = 0; } if (ey >= height) { ey = height - 1; } + double delta; + int small, large; + double lenIncr; + if (fabs(hx) > fabs(hy)) { + small = (hx > 0) ? 1 : -1; + large = (hy > 0) ? width : -width; + delta = fabs(hy / hx); + } else { + small = (hy > 0) ? width : -width; + large = (hx > 0) ? 1 : -1; + delta = fabs(hx / hy); + } + lenIncr = sqrt (1 + delta * delta) * pixelWidth; + + // initialize with one step + // because the center point doesn't need testing + + int end = ex + ey * width; + int ind = index + small; + double len = lenIncr; + double err = delta; + if (err >= 1.0) { + ind += large; + err -= 1.0; + } + + double minPeak = -1; + double peakLen = 0.0; + int stepsTaken = 1; + while ((small > 0 && ind <= end) || (small < 0 && ind >= end)) { + if (ind < 0 || ind >= (width*height)) { + break; } - lenIncr = sqrt (1 + delta * delta) * pixelWidth; - // initialize with one step - // because the center point doesn't need testing + // cdepth - depthBuffer B/C we want it in the negative z direction - int end = ex + ey * width; - int ind = index + small; - double len = lenIncr; - double err = delta; + if (minPeak < 0 || (depthBuffer[ind] >= 0 && + depthBuffer[ind] < minPeak)) { + minPeak = depthBuffer[ind]; + peakLen = len; + } + ind += small; + len += lenIncr; + err += delta; if (err >= 1.0) { ind += large; err -= 1.0; } - - double minPeak = -1; - double peakLen = 0.0; - int stepsTaken = 1; - while ((small > 0 && ind <= end) || (small < 0 && ind >= end)) { - if (ind < 0 || ind >= (width*height)) { - break; - } - - // cdepth - depthBuffer B/C we want it in the negative z direction - - if (minPeak < 0 || (depthBuffer[ind] >= 0 && - depthBuffer[ind] < minPeak)) { - minPeak = depthBuffer[ind]; - peakLen = len; - } - ind += small; - len += lenIncr; - err += delta; - if (err >= 1.0) { - ind += large; - err -= 1.0; - } - stepsTaken ++; - } - - if (peakLen > 0) { - double h = atan ((cdepth - minPeak) / peakLen); - ao += saturate(sin (h) - scaled_sin_t); - } else { - ao += saturate(-scaled_sin_t); - } + stepsTaken ++; } - ao /= (double)SSAOSamples; - double c[3]; - c[0] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 0]); - c[1] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 1]); - c[2] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 2]); - c[0] *= (1.0 - ao); - c[1] *= (1.0 - ao); - c[2] *= (1.0 - ao); - imageBuffer[index * 3 + 0] = (int) c[0]; - imageBuffer[index * 3 + 1] = (int) c[1]; - imageBuffer[index * 3 + 2] = (int) c[2]; + if (peakLen > 0) { + double h = atan ((cdepth - minPeak) / peakLen); + ao += saturate(sin (h) - scaled_sin_t); + } else 
{ + ao += saturate(-scaled_sin_t); + } } + ao /= (double)SSAOSamples; + + double c[3]; + c[0] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 0]); + c[1] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 1]); + c[2] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 2]); + c[0] *= (1.0 - ao); + c[1] *= (1.0 - ao); + c[2] *= (1.0 - ao); + imageBuffer[index * 3 + 0] = (int) c[0]; + imageBuffer[index * 3 + 1] = (int) c[1]; + imageBuffer[index * 3 + 2] = (int) c[2]; } } diff --git a/src/image.h b/src/image.h index 7df81425d9..1de455d4bd 100644 --- a/src/image.h +++ b/src/image.h @@ -73,6 +73,10 @@ class Image : protected Pointers { double *depthcopy,*surfacecopy; unsigned char *imageBuffer,*rgbcopy,*writeBuffer; + // MPI_Gatherv + + int *recvcounts,*displs; + // constant view params double FOV; From a98177c366ca53b548556e784e2931288e3469bc Mon Sep 17 00:00:00 2001 From: jrgissing Date: Sat, 20 Feb 2021 15:07:50 -0500 Subject: [PATCH 060/116] ring_check refactor --- src/USER-REACTION/fix_bond_react.cpp | 32 +++++++++++----------------- src/USER-REACTION/fix_bond_react.h | 2 +- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/src/USER-REACTION/fix_bond_react.cpp b/src/USER-REACTION/fix_bond_react.cpp index 1ec29efacd..40cf2748e2 100644 --- a/src/USER-REACTION/fix_bond_react.cpp +++ b/src/USER-REACTION/fix_bond_react.cpp @@ -1627,8 +1627,8 @@ void FixBondReact::check_a_neighbor() glove_counter++; if (glove_counter == onemol->natoms) { - status = ACCEPT; - ring_check(); + if (ring_check()) status = ACCEPT; + else status = GUESSFAIL; return; } // status should still == PROCEED @@ -1679,8 +1679,8 @@ void FixBondReact::check_a_neighbor() glove_counter++; if (glove_counter == onemol->natoms) { - status = ACCEPT; - ring_check(); + if (ring_check()) status = ACCEPT; + else status = GUESSFAIL; return; // will never complete here when there are edge atoms // ...actually that could be wrong if people get creative...shouldn't affect anything @@ -1791,8 +1791,8 @@ void FixBondReact::inner_crosscheck_loop() } glove_counter++; if (glove_counter == onemol->natoms) { - status = ACCEPT; - ring_check(); + if (ring_check()) status = ACCEPT; + else status = GUESSFAIL; return; } status = CONTINUE; @@ -1803,21 +1803,17 @@ void FixBondReact::inner_crosscheck_loop() Necessary for certain ringed structures ------------------------------------------------------------------------- */ -void FixBondReact::ring_check() +int FixBondReact::ring_check() { // ring_check can be made more efficient by re-introducing 'frozen' atoms // 'frozen' atoms have been assigned and also are no longer pioneers // double check the number of neighbors match for all non-edge atoms // otherwise, atoms at 'end' of symmetric ring can behave like edge atoms - for (int i = 0; i < onemol->natoms; i++) { - if (edge[i][rxnID] == 0) { - if (onemol_nxspecial[i][0] != nxspecial[atom->map(glove[i][1])][0]) { - status = GUESSFAIL; - return; - } - } - } + for (int i = 0; i < onemol->natoms; i++) + if (edge[i][rxnID] == 0 && + onemol_nxspecial[i][0] != nxspecial[atom->map(glove[i][1])][0]) + return 0; for (int i = 0; i < onemol->natoms; i++) { for (int j = 0; j < onemol_nxspecial[i][0]; j++) { @@ -1829,12 +1825,10 @@ void FixBondReact::ring_check() break; } } - if (ring_fail == 1) { - status = GUESSFAIL; - return; - } + if (ring_fail == 1) return 0; } } + return 1; } /* ---------------------------------------------------------------------- diff --git a/src/USER-REACTION/fix_bond_react.h b/src/USER-REACTION/fix_bond_react.h 
index 87a5945d45..153bdd7a6d 100644 --- a/src/USER-REACTION/fix_bond_react.h +++ b/src/USER-REACTION/fix_bond_react.h @@ -171,7 +171,7 @@ class FixBondReact : public Fix { void check_a_neighbor(); void crosscheck_the_neighbor(); void inner_crosscheck_loop(); - void ring_check(); + int ring_check(); int check_constraints(); void get_IDcoords(int, int, double *); double get_temperature(tagint **, int, int); From 80ae5ba7acbd946df15732456ad6682ad2db4321 Mon Sep 17 00:00:00 2001 From: jrgissing Date: Sat, 20 Feb 2021 15:14:42 -0500 Subject: [PATCH 061/116] refactor constraints check --- src/USER-REACTION/fix_bond_react.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/USER-REACTION/fix_bond_react.cpp b/src/USER-REACTION/fix_bond_react.cpp index 40cf2748e2..0f98e74a5e 100644 --- a/src/USER-REACTION/fix_bond_react.cpp +++ b/src/USER-REACTION/fix_bond_react.cpp @@ -1385,9 +1385,9 @@ void FixBondReact::superimpose_algorithm() } } - if (status == ACCEPT && check_constraints()) { // reaction site found successfully! - glove_ghostcheck(); - } + // reaction site found successfully! + if (status == ACCEPT) glove_ghostcheck(); + hang_catch++; // let's go ahead and catch the simplest of hangs //if (hang_catch > onemol->natoms*4) @@ -1627,7 +1627,7 @@ void FixBondReact::check_a_neighbor() glove_counter++; if (glove_counter == onemol->natoms) { - if (ring_check()) status = ACCEPT; + if (ring_check() && check_constraints()) status = ACCEPT; else status = GUESSFAIL; return; } @@ -1679,7 +1679,7 @@ void FixBondReact::check_a_neighbor() glove_counter++; if (glove_counter == onemol->natoms) { - if (ring_check()) status = ACCEPT; + if (ring_check() && check_constraints()) status = ACCEPT; else status = GUESSFAIL; return; // will never complete here when there are edge atoms @@ -1791,7 +1791,7 @@ void FixBondReact::inner_crosscheck_loop() } glove_counter++; if (glove_counter == onemol->natoms) { - if (ring_check()) status = ACCEPT; + if (ring_check() && check_constraints()) status = ACCEPT; else status = GUESSFAIL; return; } From 7d9187cff8364fbf5872946996b1a62c67d6053b Mon Sep 17 00:00:00 2001 From: jrgissing Date: Sat, 20 Feb 2021 16:24:24 -0500 Subject: [PATCH 062/116] eval reaction prob after constraints check --- src/USER-REACTION/fix_bond_react.cpp | 94 +++++++++------------------- src/USER-REACTION/fix_bond_react.h | 2 +- 2 files changed, 32 insertions(+), 64 deletions(-) diff --git a/src/USER-REACTION/fix_bond_react.cpp b/src/USER-REACTION/fix_bond_react.cpp index 0f98e74a5e..173a92b7fb 100644 --- a/src/USER-REACTION/fix_bond_react.cpp +++ b/src/USER-REACTION/fix_bond_react.cpp @@ -537,7 +537,6 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) : nmax = 0; partner = finalpartner = nullptr; distsq = nullptr; - probability = nullptr; maxattempt = 0; attempt = nullptr; nattempt = nullptr; @@ -585,7 +584,6 @@ FixBondReact::~FixBondReact() memory->destroy(finalpartner); memory->destroy(nattempt); memory->destroy(distsq); - memory->destroy(probability); memory->destroy(attempt); memory->destroy(edge); memory->destroy(equivalences); @@ -877,6 +875,10 @@ void FixBondReact::post_integrate() return; } + // update reaction probability + if (var_flag[PROB][rxnID]) + fraction[rxnID] = input->variable->compute_equal(var_id[PROB][rxnID]); + // acquire updated ghost atom positions // necessary b/c are calling this after integrate, but before Verlet comm @@ -890,16 +892,14 @@ void FixBondReact::post_integrate() memory->destroy(finalpartner); memory->destroy(distsq); 
memory->destroy(nattempt); - memory->destroy(probability); nmax = atom->nmax; memory->create(partner,nmax,"bond/react:partner"); memory->create(finalpartner,nmax,"bond/react:finalpartner"); memory->create(distsq,nmax,2,"bond/react:distsq"); memory->create(nattempt,nreacts,"bond/react:nattempt"); - memory->create(probability,nmax,"bond/react:probability"); } - // reset create counts + // reset 'attempt' counts for (int i = 0; i < nreacts; i++) { nattempt[i] = 0; } @@ -962,25 +962,14 @@ void FixBondReact::post_integrate() comm->reverse_comm_fix(this); } - // update reaction probability - if (var_flag[PROB][rxnID]) - fraction[rxnID] = input->variable->compute_equal(var_id[PROB][rxnID]); - // each atom now knows its winning partner - // for prob check, generate random value for each atom with a bond partner - // forward comm of partner and random value, so ghosts have it - - if (fraction[rxnID] < 1.0) { - for (int i = 0; i < nlocal; i++) - if (partner[i]) probability[i] = random[rxnID]->uniform(); - } + // forward comm of partner, so ghosts have it commflag = 2; comm->forward_comm_fix(this,2); // consider for reaction: // only if both atoms list each other as winning bond partner - // and probability constraint is satisfied // if other atom is owned by another proc, it should do same thing int temp_nattempt = 0; @@ -994,16 +983,6 @@ void FixBondReact::post_integrate() continue; } - // apply probability constraint using RN for atom with smallest ID - - if (fraction[rxnID] < 1.0) { - if (tag[i] < tag[j]) { - if (probability[i] >= fraction[rxnID]) continue; - } else { - if (probability[j] >= fraction[rxnID]) continue; - } - } - // store final bond partners and count the rxn possibility once finalpartner[i] = tag[j]; @@ -1345,10 +1324,14 @@ void FixBondReact::superimpose_algorithm() (nxspecial[local_atom1][0] == 0 || xspecial[local_atom1][0] == atom->tag[local_atom2]) && check_constraints()) { - status = ACCEPT; - glove_ghostcheck(); - } else - status = REJECT; + if (fraction[rxnID] < 1.0 && + random[rxnID]->uniform() >= fraction[rxnID]) { + status = REJECT; + } else { + status = ACCEPT; + glove_ghostcheck(); + } + } else status = REJECT; } avail_guesses = 0; @@ -1386,7 +1369,10 @@ void FixBondReact::superimpose_algorithm() } // reaction site found successfully! 
- if (status == ACCEPT) glove_ghostcheck(); + if (status == ACCEPT) + if (fraction[rxnID] < 1.0 && + random[rxnID]->uniform() >= fraction[rxnID]) status = REJECT; + else glove_ghostcheck(); hang_catch++; // let's go ahead and catch the simplest of hangs @@ -3946,20 +3932,10 @@ int FixBondReact::pack_forward_comm(int n, int *list, double *buf, m = 0; - if (commflag == 1) { - for (i = 0; i < n; i++) { - j = list[i]; - printf("hello you shouldn't be here\n"); - //buf[m++] = ubuf(bondcount[j]).d; - } - return m; - } - if (commflag == 2) { for (i = 0; i < n; i++) { j = list[i]; buf[m++] = ubuf(partner[j]).d; - buf[m++] = probability[j]; } return m; } @@ -3985,15 +3961,9 @@ void FixBondReact::unpack_forward_comm(int n, int first, double *buf) m = 0; last = first + n; - if (commflag == 1) { + if (commflag == 2) { for (i = first; i < last; i++) - printf("hello you shouldn't be here\n"); - // bondcount[i] = (int) ubuf(buf[m++]).i; - } else if (commflag == 2) { - for (i = first; i < last; i++) { partner[i] = (tagint) ubuf(buf[m++]).i; - probability[i] = buf[m++]; - } } else { m = 0; last = first + n; @@ -4034,20 +4004,18 @@ void FixBondReact::unpack_reverse_comm(int n, int *list, double *buf) m = 0; - if (commflag != 1) { - for (i = 0; i < n; i++) { - j = list[i]; - if (closeneigh[rxnID] != 0) { - if (buf[m+1] < distsq[j][1]) { - partner[j] = (tagint) ubuf(buf[m++]).i; - distsq[j][1] = buf[m++]; - } else m += 2; - } else { - if (buf[m+1] > distsq[j][0]) { - partner[j] = (tagint) ubuf(buf[m++]).i; - distsq[j][0] = buf[m++]; - } else m += 2; - } + for (i = 0; i < n; i++) { + j = list[i]; + if (closeneigh[rxnID] != 0) { + if (buf[m+1] < distsq[j][1]) { + partner[j] = (tagint) ubuf(buf[m++]).i; + distsq[j][1] = buf[m++]; + } else m += 2; + } else { + if (buf[m+1] > distsq[j][0]) { + partner[j] = (tagint) ubuf(buf[m++]).i; + distsq[j][0] = buf[m++]; + } else m += 2; } } } diff --git a/src/USER-REACTION/fix_bond_react.h b/src/USER-REACTION/fix_bond_react.h index 153bdd7a6d..67788df217 100644 --- a/src/USER-REACTION/fix_bond_react.h +++ b/src/USER-REACTION/fix_bond_react.h @@ -86,7 +86,7 @@ class FixBondReact : public Fix { int nmax; // max num local atoms int max_natoms; // max natoms in a molecule template tagint *partner,*finalpartner; - double **distsq,*probability; + double **distsq; int *nattempt; int maxattempt; int allnattempt; From 196b6b92730cd2a9949f158a8981258e52003eb4 Mon Sep 17 00:00:00 2001 From: jrgissing Date: Sat, 20 Feb 2021 20:22:53 -0500 Subject: [PATCH 063/116] variable probability fix --- src/USER-REACTION/fix_bond_react.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/USER-REACTION/fix_bond_react.cpp b/src/USER-REACTION/fix_bond_react.cpp index 173a92b7fb..93c9fe525b 100644 --- a/src/USER-REACTION/fix_bond_react.cpp +++ b/src/USER-REACTION/fix_bond_react.cpp @@ -868,6 +868,9 @@ void FixBondReact::post_integrate() ghostly_rxn_count[i] = 0; nlocalskips[i] = 0; nghostlyskips[i] = 0; + // update reaction probability + if (var_flag[PROB][i]) + fraction[i] = input->variable->compute_equal(var_id[PROB][i]); } if (nevery_check) { @@ -875,10 +878,6 @@ void FixBondReact::post_integrate() return; } - // update reaction probability - if (var_flag[PROB][rxnID]) - fraction[rxnID] = input->variable->compute_equal(var_id[PROB][rxnID]); - // acquire updated ghost atom positions // necessary b/c are calling this after integrate, but before Verlet comm From d9941b1648157d32bf812f9cb537cf8b5e093b25 Mon Sep 17 00:00:00 2001 From: jrgissing Date: Sat, 20 Feb 2021 20:29:39 
-0500 Subject: [PATCH 064/116] Update in.tiny_nylon.stabilized_variable_probability --- .../tiny_nylon/in.tiny_nylon.stabilized_variable_probability | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability b/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability index 2c101ac77c..e81fedc34a 100644 --- a/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability +++ b/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability @@ -22,7 +22,7 @@ improper_style class2 read_data tiny_nylon.data variable runsteps equal 1000 -variable prob1 equal step/v_runsteps*2 +variable prob1 equal step/v_runsteps*2+0.1 variable prob2 equal (step/v_runsteps)>0.5 velocity all create 300.0 4928459 dist gaussian From d5917652d49cd248c3aa9edab463ea2240045ec3 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 20 Feb 2021 23:50:55 -0500 Subject: [PATCH 065/116] remove output that is no longer necessary. settings are adapted automatically --- cmake/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index aefa9cd597..f67699c54d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -777,9 +777,7 @@ if(PKG_GPU) message(STATUS "<<< GPU package settings >>> -- GPU API: ${GPU_API}") if(GPU_API STREQUAL "CUDA") - message(STATUS "GPU architecture: ${GPU_ARCH}") - elseif(GPU_API STREQUAL "OPENCL") - message(STATUS "OpenCL tuning: ${OCL_TUNE}") + message(STATUS "GPU default architecture: ${GPU_ARCH}") elseif(GPU_API STREQUAL "HIP") message(STATUS "HIP platform: ${HIP_PLATFORM}") message(STATUS "HIP architecture: ${HIP_ARCH}") From d025b281cf17b593bee604af787b1e5481f8e96e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 01:28:44 -0500 Subject: [PATCH 066/116] Build and link a static OpenCL loader library for all platforms --- cmake/Modules/OpenCLLoader.cmake | 54 ++++++++++++++++++++++++++++++++ cmake/Modules/Packages/GPU.cmake | 18 +++-------- 2 files changed, 59 insertions(+), 13 deletions(-) create mode 100644 cmake/Modules/OpenCLLoader.cmake diff --git a/cmake/Modules/OpenCLLoader.cmake b/cmake/Modules/OpenCLLoader.cmake new file mode 100644 index 0000000000..0460f686ef --- /dev/null +++ b/cmake/Modules/OpenCLLoader.cmake @@ -0,0 +1,54 @@ +message(STATUS "Downloading and building OpenCL loader library") + +if(CMAKE_BUILD_TYPE STREQUAL Debug) + set(OPENCL_LOADER_LIB_POSTFIX d) +else() + set(OPENCL_LOADER_LIB_POSTFIX) +endif() + +include(ExternalProject) +set(OPENCL_LOADER_URL "https://download.lammps.org/thirdparty/opencl-loader-2020.12.18.tar.gz" CACHE STRING "URL for OpenCL loader tarball") +mark_as_advanced(OPENCL_LOADER_URL) +ExternalProject_Add(opencl_loader + URL ${OPENCL_LOADER_URL} + URL_MD5 f1e6a084d4950382588207133965ec89 + SOURCE_DIR "${CMAKE_BINARY_DIR}/opencl_loader-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/opencl_loader-build" + CMAKE_ARGS ${CMAKE_REQUEST_PIC} ${CMAKE_EXTRA_OPENCL_LOADER_OPTS} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + BUILD_BYPRODUCTS /lib/${CMAKE_FIND_LIBRARY_PREFIXES}OpenCL${OPENCL_LOADER_LIB_POSTFIX}.a + LOG_DOWNLOAD ON + LOG_CONFIGURE ON + LOG_BUILD ON + INSTALL_COMMAND "" + TEST_COMMAND "") + +ExternalProject_Get_Property(opencl_loader SOURCE_DIR) 
+set(OPENCL_LOADER_INCLUDE_DIR ${SOURCE_DIR}/inc) + +# workaround for CMake 3.10 on ubuntu 18.04 +file(MAKE_DIRECTORY ${OPENCL_LOADER_INCLUDE_DIR}) + +ExternalProject_Get_Property(opencl_loader BINARY_DIR) + set(OPENCL_LOADER_LIBRARY_PATH "${BINARY_DIR}/libOpenCL${OPENCL_LOADER_LIB_POSTFIX}.a") + +find_package(Threads QUIET) +if(NOT WIN32) + set(OPENCL_LOADER_DEP_LIBS "Threads::Threads;${CMAKE_DL_LIBS}") +else() + set(OPENCL_LOADER_DEP_LIBS "cfgmgr32;runtimeobject") +endif() + +add_library(OpenCL::OpenCL UNKNOWN IMPORTED) +add_dependencies(OpenCL::OpenCL opencl_loader) + +set_target_properties(OpenCL::OpenCL PROPERTIES + IMPORTED_LOCATION ${OPENCL_LOADER_LIBRARY_PATH} + INTERFACE_INCLUDE_DIRECTORIES ${OPENCL_LOADER_INCLUDE_DIR} + INTERFACE_LINK_LIBRARIES "${OPENCL_LOADER_DEP_LIBS}") + + diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 76ad4190cf..1b543eba8c 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -141,19 +141,10 @@ if(GPU_API STREQUAL "CUDA") target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS}) elseif(GPU_API STREQUAL "OPENCL") - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - # download and unpack support binaries for compilation of windows binaries. - set(LAMMPS_THIRDPARTY_URL "https://download.lammps.org/thirdparty") - file(DOWNLOAD "${LAMMPS_THIRDPARTY_URL}/opencl-win-devel.tar.gz" "${CMAKE_CURRENT_BINARY_DIR}/opencl-win-devel.tar.gz" - EXPECTED_MD5 2c00364888d5671195598b44c2e0d44d) - execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf opencl-win-devel.tar.gz WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - add_library(OpenCL::OpenCL UNKNOWN IMPORTED) - if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86") - set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win32/libOpenCL.dll") - elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") - set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win64/libOpenCL.dll") - endif() - set_target_properties(OpenCL::OpenCL PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/include") + option(USE_STATIC_OPENCL_LOADER "Download and include a static OpenCL ICD loader" ON) + mark_as_advanced(USE_STATIC_OPENCL_LOADER) + if (USE_STATIC_OPENCL_LOADER) + include(OpenCLLoader) else() find_package(OpenCL REQUIRED) endif() @@ -208,6 +199,7 @@ elseif(GPU_API STREQUAL "OPENCL") add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp) target_compile_definitions(ocl_get_devices PRIVATE -DUCL_OPENCL) target_link_libraries(ocl_get_devices PRIVATE OpenCL::OpenCL) + add_dependencies(ocl_get_devices OpenCL::OpenCL) elseif(GPU_API STREQUAL "HIP") if(NOT DEFINED HIP_PATH) if(NOT DEFINED ENV{HIP_PATH}) From 70327861b29dbc568254d18ce07cbbfcafb5e8ac Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 09:43:56 -0500 Subject: [PATCH 067/116] update for improved OpenCL stub driver with tests --- cmake/Modules/OpenCLLoader.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/OpenCLLoader.cmake b/cmake/Modules/OpenCLLoader.cmake index 0460f686ef..a6aac1c603 100644 --- a/cmake/Modules/OpenCLLoader.cmake +++ b/cmake/Modules/OpenCLLoader.cmake @@ -11,7 +11,7 @@ set(OPENCL_LOADER_URL "https://download.lammps.org/thirdparty/opencl-loader-2020 mark_as_advanced(OPENCL_LOADER_URL) ExternalProject_Add(opencl_loader URL ${OPENCL_LOADER_URL} - URL_MD5 f1e6a084d4950382588207133965ec89 + 
URL_MD5 d89ab1dc1121b96c9c37526b9db46df1 SOURCE_DIR "${CMAKE_BINARY_DIR}/opencl_loader-src" BINARY_DIR "${CMAKE_BINARY_DIR}/opencl_loader-build" CMAKE_ARGS ${CMAKE_REQUEST_PIC} ${CMAKE_EXTRA_OPENCL_LOADER_OPTS} From 24079e9302c42fc8e559e24d3f4eefe696070253 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 10:45:01 -0500 Subject: [PATCH 068/116] update docs --- doc/src/Build_extras.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index cf15de74bd..5e3356478d 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -133,6 +133,8 @@ CMake build # value = yes (default) or no -D CUDA_MPS_SUPPORT=value # enables some tweaks required to run with active nvidia-cuda-mps daemon # value = yes or no (default) + -D USE_STATIC_OPENCL_LOADER=value # downloads/includes OpenCL ICD loader library, no local OpenCL headers/libs needed + # value = yes (default) or no :code:`GPU_ARCH` settings for different GPU hardware is as follows: @@ -159,6 +161,12 @@ When building with CMake, you **must NOT** build the GPU library in ``lib/gpu`` using the traditional build procedure. CMake will detect files generated by that process and will terminate with an error and a suggestion for how to remove them. +If you are compiling for OpenCL, the default setting is to download, build, and +link with a static OpenCL ICD loader library and standard OpenCL headers. This +way no local OpenCL development headers or library needs to be present and only +OpenCL compatible drivers need to be installed to use OpenCL. If this is not +desired, you can set :code:`USE_STATIC_OPENCL_LOADER` to :code:`no`. + If you are compiling with HIP, note that before running CMake you will have to set appropriate environment variables. 
Some variables such as :code:`HCC_AMDGPU_TARGET` or :code:`CUDA_PATH` are necessary for :code:`hipcc` From 0c6671ad64354be8197d1f784df8b35b7a67b41c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 11:06:58 -0500 Subject: [PATCH 069/116] do not always add styles that depend on other packages --- cmake/Modules/Packages/GPU.cmake | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 1b543eba8c..70014c8782 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -382,12 +382,8 @@ elseif(GPU_API STREQUAL "HIP") target_link_libraries(lammps PRIVATE gpu) endif() -# GPU package -FindStyleHeaders(${GPU_SOURCES_DIR} FIX_CLASS fix_ FIX) - set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}") - -# detects styles which have GPU version +# detect styles which have a GPU version RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES) get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES) From 826c618aa9e3a69eee32abded0686b0191d5211a Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 15:09:50 -0500 Subject: [PATCH 070/116] replace a few more cases of atoi()/atof() with utils::*numeric() functions --- src/atom.cpp | 2 +- src/compute_reduce.cpp | 4 ++-- src/fix_property_atom.cpp | 20 +++++++++++++------- src/kspace.cpp | 12 ++++++------ 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/atom.cpp b/src/atom.cpp index 75b1b07fbf..fe260309e2 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -1752,7 +1752,7 @@ void Atom::set_mass(const char *file, int line, int /*narg*/, char **arg) if (lo < 1 || hi > ntypes) error->all(file,line,"Invalid type for mass set"); for (int itype = lo; itype <= hi; itype++) { - mass[itype] = atof(arg[1]); + mass[itype] = utils::numeric(FLERR,arg[1],false,lmp); mass_setflag[itype] = 1; if (mass[itype] <= 0.0) error->all(file,line,"Invalid mass value"); diff --git a/src/compute_reduce.cpp b/src/compute_reduce.cpp index 82d3dff458..bc9aeefe7b 100644 --- a/src/compute_reduce.cpp +++ b/src/compute_reduce.cpp @@ -148,8 +148,8 @@ ComputeReduce::ComputeReduce(LAMMPS *lmp, int narg, char **arg) : if (iarg+3 > narg) error->all(FLERR,"Illegal compute reduce command"); if (mode != MINN && mode != MAXX) error->all(FLERR,"Compute reduce replace requires min or max mode"); - int col1 = atoi(arg[iarg+1]) - 1; - int col2 = atoi(arg[iarg+2]) - 1; + int col1 = utils::inumeric(FLERR,arg[iarg+1],false,lmp) - 1; + int col2 = utils::inumeric(FLERR,arg[iarg+2],false,lmp) - 1; if (col1 < 0 || col1 >= nvalues || col2 < 0 || col2 >= nvalues) error->all(FLERR,"Illegal compute reduce command"); if (col1 == col2) error->all(FLERR,"Illegal compute reduce command"); diff --git a/src/fix_property_atom.cpp b/src/fix_property_atom.cpp index c1c52a3f8c..f18888bbfc 100644 --- a/src/fix_property_atom.cpp +++ b/src/fix_property_atom.cpp @@ -254,13 +254,19 @@ void FixPropertyAtom::read_data_section(char *keyword, int n, char *buf, if ((m = atom->map(itag)) >= 0) { for (j = 0; j < nvalue; j++) { - if (style[j] == MOLECULE) atom->molecule[m] = ATOTAGINT(values[j+1]); - else if (style[j] == CHARGE) atom->q[m] = atof(values[j+1]); - else if (style[j] == RMASS) atom->rmass[m] = atof(values[j+1]); - else if (style[j] == INTEGER) - atom->ivector[index[j]][m] = atoi(values[j+1]); - else if (style[j] == DOUBLE) - atom->dvector[index[j]][m] = atof(values[j+1]); + if (style[j] == MOLECULE) { + atom->molecule[m] = utils::tnumeric(FLERR,values[j+1],false,lmp); 
+ } else if (style[j] == CHARGE) { + atom->q[m] = utils::numeric(FLERR,values[j+1],false,lmp); + } else if (style[j] == RMASS) { + atom->rmass[m] = utils::numeric(FLERR,values[j+1],false,lmp); + } else if (style[j] == INTEGER) { + atom->ivector[index[j]][m] = utils::inumeric(FLERR,values[j+1], + false,lmp); + } else if (style[j] == DOUBLE) { + atom->dvector[index[j]][m] = utils::numeric(FLERR,values[j+1], + true,lmp); + } } } diff --git a/src/kspace.cpp b/src/kspace.cpp index 5556a5e8d0..f44cc42aaf 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -564,9 +564,9 @@ void KSpace::modify_params(int narg, char **arg) iarg += 2; } else if (strcmp(arg[iarg],"kmax/ewald") == 0) { if (iarg+4 > narg) error->all(FLERR,"Illegal kspace_modify command"); - kx_ewald = atoi(arg[iarg+1]); - ky_ewald = atoi(arg[iarg+2]); - kz_ewald = atoi(arg[iarg+3]); + kx_ewald = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + ky_ewald = utils::inumeric(FLERR,arg[iarg+2],false,lmp); + kz_ewald = utils::inumeric(FLERR,arg[iarg+3],false,lmp); if (kx_ewald < 0 || ky_ewald < 0 || kz_ewald < 0) error->all(FLERR,"Bad kspace_modify kmax/ewald parameter"); if (kx_ewald > 0 && ky_ewald > 0 && kz_ewald > 0) @@ -583,15 +583,15 @@ void KSpace::modify_params(int narg, char **arg) iarg += 2; } else if (strcmp(arg[iarg],"force/disp/real") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - accuracy_real_6 = atof(arg[iarg+1]); + accuracy_real_6 = utils::numeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg],"force/disp/kspace") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - accuracy_kspace_6 = atof(arg[iarg+1]); + accuracy_kspace_6 = utils::numeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg],"eigtol") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - splittol = atof(arg[iarg+1]); + splittol = utils::numeric(FLERR,arg[iarg+1],false,lmp); if (splittol >= 1.0) error->all(FLERR,"Kspace_modify eigtol must be smaller than one"); iarg += 2; From 06f6766ed6dab0ac18607ea9ee9d237a09569716 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 16:20:36 -0500 Subject: [PATCH 071/116] CMAKE_FIND_LIBRARY_PREFIXES is a path. 
must use plain "lib" instead, but there is a variable for the suffix --- cmake/Modules/GTest.cmake | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cmake/Modules/GTest.cmake b/cmake/Modules/GTest.cmake index 060a7e42f9..0c62291d5e 100644 --- a/cmake/Modules/GTest.cmake +++ b/cmake/Modules/GTest.cmake @@ -20,10 +20,10 @@ ExternalProject_Add(googletest -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} - BUILD_BYPRODUCTS /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest${GTEST_LIB_POSTFIX}.a - /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock${GTEST_LIB_POSTFIX}.a - /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest_main${GTEST_LIB_POSTFIX}.a - /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock_main${GTEST_LIB_POSTFIX}.a + BUILD_BYPRODUCTS /lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} LOG_DOWNLOAD ON LOG_CONFIGURE ON LOG_BUILD ON @@ -39,10 +39,10 @@ file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIR}) file(MAKE_DIRECTORY ${GMOCK_INCLUDE_DIR}) ExternalProject_Get_Property(googletest BINARY_DIR) -set(GTEST_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest${GTEST_LIB_POSTFIX}.a) -set(GMOCK_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock${GTEST_LIB_POSTFIX}.a) -set(GTEST_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest_main${GTEST_LIB_POSTFIX}.a) -set(GMOCK_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock_main${GTEST_LIB_POSTFIX}.a) +set(GTEST_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GMOCK_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GTEST_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GMOCK_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) # Prevent GoogleTest from overriding our compiler/linker options # when building with Visual Studio From 1a68d761a3060cfd7c8ead56b495df070a847a23 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 16:21:45 -0500 Subject: [PATCH 072/116] correct how to construct the path to the generated OpenCL lib --- cmake/Modules/OpenCLLoader.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/OpenCLLoader.cmake b/cmake/Modules/OpenCLLoader.cmake index a6aac1c603..290f15415a 100644 --- a/cmake/Modules/OpenCLLoader.cmake +++ b/cmake/Modules/OpenCLLoader.cmake @@ -20,7 +20,7 @@ ExternalProject_Add(opencl_loader -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} - BUILD_BYPRODUCTS /lib/${CMAKE_FIND_LIBRARY_PREFIXES}OpenCL${OPENCL_LOADER_LIB_POSTFIX}.a + BUILD_BYPRODUCTS /libOpenCL${OPENCL_LOADER_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} LOG_DOWNLOAD ON LOG_CONFIGURE ON LOG_BUILD ON @@ -34,7 +34,7 @@ set(OPENCL_LOADER_INCLUDE_DIR ${SOURCE_DIR}/inc) file(MAKE_DIRECTORY ${OPENCL_LOADER_INCLUDE_DIR}) ExternalProject_Get_Property(opencl_loader BINARY_DIR) - set(OPENCL_LOADER_LIBRARY_PATH "${BINARY_DIR}/libOpenCL${OPENCL_LOADER_LIB_POSTFIX}.a") +set(OPENCL_LOADER_LIBRARY_PATH "${BINARY_DIR}/libOpenCL${OPENCL_LOADER_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}") 
find_package(Threads QUIET) if(NOT WIN32) From db95552f2b4c28c630c5c9a35b3a83b7892e990e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 16:42:30 -0500 Subject: [PATCH 073/116] update md5sum for updated archive (again) --- cmake/Modules/OpenCLLoader.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/OpenCLLoader.cmake b/cmake/Modules/OpenCLLoader.cmake index 290f15415a..ecd9204d24 100644 --- a/cmake/Modules/OpenCLLoader.cmake +++ b/cmake/Modules/OpenCLLoader.cmake @@ -11,7 +11,7 @@ set(OPENCL_LOADER_URL "https://download.lammps.org/thirdparty/opencl-loader-2020 mark_as_advanced(OPENCL_LOADER_URL) ExternalProject_Add(opencl_loader URL ${OPENCL_LOADER_URL} - URL_MD5 d89ab1dc1121b96c9c37526b9db46df1 + URL_MD5 011cdcbd41030be94f3fced6d763a52a SOURCE_DIR "${CMAKE_BINARY_DIR}/opencl_loader-src" BINARY_DIR "${CMAKE_BINARY_DIR}/opencl_loader-build" CMAKE_ARGS ${CMAKE_REQUEST_PIC} ${CMAKE_EXTRA_OPENCL_LOADER_OPTS} From e0e89c588ba0c907f60824806dcf5ea84e8090d5 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 17:11:38 -0500 Subject: [PATCH 074/116] correct library prefix name use also for building libyaml --- cmake/Modules/YAML.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/YAML.cmake b/cmake/Modules/YAML.cmake index a080b566be..f2ba34e1b6 100644 --- a/cmake/Modules/YAML.cmake +++ b/cmake/Modules/YAML.cmake @@ -12,7 +12,7 @@ ExternalProject_Add(libyaml CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} --prefix= --disable-shared - BUILD_BYPRODUCTS /lib/${CMAKE_FIND_LIBRARY_PREFIXES}yaml.a + BUILD_BYPRODUCTS /lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX} TEST_COMMAND "") ExternalProject_Get_Property(libyaml INSTALL_DIR) @@ -23,7 +23,7 @@ set(YAML_LIBRARY_DIR ${INSTALL_DIR}/lib) file(MAKE_DIRECTORY ${YAML_INCLUDE_DIR}) file(MAKE_DIRECTORY ${YAML_LIBRARY_DIR}) -set(YAML_LIBRARY_PATH ${INSTALL_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}yaml.a) +set(YAML_LIBRARY_PATH ${INSTALL_DIR}/lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX}) add_library(Yaml::Yaml UNKNOWN IMPORTED) set_target_properties(Yaml::Yaml PROPERTIES From 4786391fad023faaa676de5957cd3ea7d3ead24f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 21 Feb 2021 20:40:11 -0500 Subject: [PATCH 075/116] must explicitly register fix gpu --- cmake/Modules/Packages/GPU.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 70014c8782..e2586881ef 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -385,6 +385,7 @@ endif() set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}") # detect styles which have a GPU version RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES) +RegisterFixStyle(${GPU_SOURCES_DIR}/fix_gpu.h) get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES) From f467832e0f48d6aeb1c14fcbd3a30b27af3d537f Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Mon, 22 Feb 2021 01:29:50 -0500 Subject: [PATCH 076/116] Make PyLammps command history feature optional PyLammps so far has been saving a history for every executed command. This was originally added to allow writing out the commands of interactive PyLammps sessions as regular input scripts. This commit disables this history by default, which avoids the small, but rising memory consumption over time. It can be enabled and disabled with the enable_cmd_history property. There is also now a method to clear the history at any time. 
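
A minimal usage sketch (the commands and the file name are only
illustrative):

  from lammps import PyLammps

  L = PyLammps()
  L.enable_cmd_history = True     # recording is now opt-in
  L.command("units lj")
  L.command("atom_style atomic")
  L.write_script("session.lmp")   # write the recorded commands to a file
  L.clear_cmd_history()           # discard the history again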
--- python/lammps/pylammps.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/python/lammps/pylammps.py b/python/lammps/pylammps.py index 47a2a5a6ab..4bba9f5e94 100644 --- a/python/lammps/pylammps.py +++ b/python/lammps/pylammps.py @@ -400,6 +400,7 @@ class PyLammps(object): self.lmp = lammps(name=name,cmdargs=cmdargs,ptr=None,comm=comm) print("LAMMPS output is captured by PyLammps wrapper") self._cmd_history = [] + self._enable_cmd_history = False self.runs = [] def __del__(self): @@ -434,6 +435,24 @@ class PyLammps(object): """ self.lmp.file(file) + @property + def enable_cmd_history(self): + """ + :getter: Return whether command history is saved + :setter: Set if command history should be saved + :type: bool + """ + return self._enable_cmd_history + + @enable_cmd_history.setter + def enable_cmd_history(self, value): + """ + :getter: Return whether command history is saved + :setter: Set if command history should be saved + :type: bool + """ + self._enable_cmd_history = (value == True) + def write_script(self, filepath): """ Write LAMMPS script file containing all commands executed up until now @@ -445,18 +464,28 @@ class PyLammps(object): for cmd in self._cmd_history: print(cmd, file=f) + def clear_cmd_history(self): + """ + Clear LAMMPS command history up to this point + """ + self._cmd_history = [] + def command(self, cmd): """ Execute LAMMPS command - All commands executed will be stored in a command history which can be - written to a file using :py:meth:`PyLammps.write_script()` + If :py:attr:`PyLammps.enable_cmd_history` is set to ``True``, commands executed + will be recorded. The entire command history can be written to a file using + :py:meth:`PyLammps.write_script()`. To clear the command history, use + :py:meth:`PyLammps.clear_cmd_history()`. 
:param cmd: command string that should be executed :type: cmd: string """ self.lmp.command(cmd) - self._cmd_history.append(cmd) + + if self.enable_cmd_history: + self._cmd_history.append(cmd) def run(self, *args, **kwargs): """ From f3ee948450bbc63eef80c3bad1a2de382dda7b75 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 22 Feb 2021 07:05:20 -0500 Subject: [PATCH 077/116] need to use column 1 in fix ave/time example --- doc/src/compute_temp_chunk.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/compute_temp_chunk.rst b/doc/src/compute_temp_chunk.rst index 77e2568fce..f1c34b42fa 100644 --- a/doc/src/compute_temp_chunk.rst +++ b/doc/src/compute_temp_chunk.rst @@ -153,7 +153,7 @@ temp/chunk calculation to a file is to use the :doc:`fix ave/time compute cc1 all chunk/atom molecule compute myChunk all temp/chunk cc1 temp - fix 1 all ave/time 100 1 100 c_myChunk file tmp.out mode vector + fix 1 all ave/time 100 1 100 c_myChunk[1] file tmp.out mode vector ---------- From ab05e9f5c1db913e32e1e2c866c1548524c0da61 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Tue, 23 Feb 2021 06:11:54 -0600 Subject: [PATCH 078/116] update the log files for the kim command examples --- .../kim/log.10Feb21.in.kim-ex.melt.clang.1 | 107 +++ .../kim/log.10Feb21.in.kim-ex.melt.clang.4 | 107 +++ .../log.10Feb21.in.kim-pm-property.clang.1 | 223 ++++++ .../log.10Feb21.in.kim-pm-property.clang.4 | 223 ++++++ .../log.10Feb21.in.kim-pm-query.melt.clang.1 | 210 ++++++ .../log.10Feb21.in.kim-pm-query.melt.clang.4 | 210 ++++++ .../kim/log.10Feb21.in.kim-pm.melt.clang.1 | 204 ++++++ .../kim/log.10Feb21.in.kim-pm.melt.clang.4 | 204 ++++++ examples/kim/log.10Feb21.in.kim-query.clang.1 | 655 ++++++++++++++++++ .../kim/log.10Feb21.in.kim-sm.melt.clang.1 | 208 ++++++ .../kim/log.10Feb21.in.kim-sm.melt.clang.4 | 208 ++++++ .../kim/log.10Feb21.in.lammps.melt.clang.1 | 88 +++ .../kim/log.10Feb21.in.lammps.melt.clang.4 | 88 +++ 13 files changed, 2735 insertions(+) create mode 100644 examples/kim/log.10Feb21.in.kim-ex.melt.clang.1 create mode 100644 examples/kim/log.10Feb21.in.kim-ex.melt.clang.4 create mode 100644 examples/kim/log.10Feb21.in.kim-pm-property.clang.1 create mode 100644 examples/kim/log.10Feb21.in.kim-pm-property.clang.4 create mode 100644 examples/kim/log.10Feb21.in.kim-pm-query.melt.clang.1 create mode 100644 examples/kim/log.10Feb21.in.kim-pm-query.melt.clang.4 create mode 100644 examples/kim/log.10Feb21.in.kim-pm.melt.clang.1 create mode 100644 examples/kim/log.10Feb21.in.kim-pm.melt.clang.4 create mode 100644 examples/kim/log.10Feb21.in.kim-query.clang.1 create mode 100644 examples/kim/log.10Feb21.in.kim-sm.melt.clang.1 create mode 100644 examples/kim/log.10Feb21.in.kim-sm.melt.clang.4 create mode 100644 examples/kim/log.10Feb21.in.lammps.melt.clang.1 create mode 100644 examples/kim/log.10Feb21.in.lammps.melt.clang.4 diff --git a/examples/kim/log.10Feb21.in.kim-ex.melt.clang.1 b/examples/kim/log.10Feb21.in.kim-ex.melt.clang.1 new file mode 100644 index 0000000000..dcf8727fc0 --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-ex.melt.clang.1 @@ -0,0 +1,107 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt +# +# This example requires that the example models provided with +# the kim-api package are installed. see the `./lib/kim/README` or +# `./lib/kim/Install.py` files for details on how to install these +# example models. 
+# + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +kim init LennardJones_Ar real +#=== BEGIN kim init ========================================== +units real +neighbor 2.0 bin # Angstroms +timestep 1.0 # femtoseconds + +This model has No mutable parameters. +#=== END kim init ============================================ + + +lattice fcc 4.4300 +Lattice spacing in x,y,z = 4.4300000 4.4300000 4.4300000 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (88.600000 88.600000 88.600000) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.003 seconds + +kim interactions Ar +#=== BEGIN kim interactions ================================== +pair_style kim LennardJones_Ar +WARNING: KIM Model does not provide 'partialParticleEnergy'; energy per atom will be zero (src/KIM/pair_kim.cpp:1139) +WARNING: KIM Model does not provide 'partialParticleVirial'; virial per atom will be zero (src/KIM/pair_kim.cpp:1145) +pair_coeff * * Ar +#=== END kim interactions ==================================== + + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 +Neighbor list info ... + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 8.45 + ghost atom cutoff = 8.45 + binsize = 4.225, bins = 21 21 21 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair kim, perpetual + attributes: full, newton off, cut 8.450000000000001 + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 28.12 | 28.12 | 28.12 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 145069.63 0 164146.22 128015.94 + 100 95.179703 154939.42 0 164017.94 131602.75 +Loop time of 2.8463 on 1 procs for 100 steps with 32000 atoms + +Performance: 3.036 ns/day, 7.906 hours/ns, 35.133 timesteps/s +99.9% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 2.5046 | 2.5046 | 2.5046 | 0.0 | 88.00 +Neigh | 0.29437 | 0.29437 | 0.29437 | 0.0 | 10.34 +Comm | 0.01182 | 0.01182 | 0.01182 | 0.0 | 0.42 +Output | 7e-05 | 7e-05 | 7e-05 | 0.0 | 0.00 +Modify | 0.024522 | 0.024522 | 0.024522 | 0.0 | 0.86 +Other | | 0.01091 | | | 0.38 + +Nlocal: 32000.0 ave 32000 max 32000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 19911.0 ave 19911 max 19911 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 4.25375e+06 ave 4.25375e+06 max 4.25375e+06 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 4253750 +Ave neighs/atom = 132.92969 +Neighbor list builds = 3 +Dangerous builds = 0 +Total wall time: 0:00:02 diff --git a/examples/kim/log.10Feb21.in.kim-ex.melt.clang.4 b/examples/kim/log.10Feb21.in.kim-ex.melt.clang.4 new file mode 100644 index 0000000000..476b66753b --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-ex.melt.clang.4 @@ -0,0 +1,107 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt +# +# 
This example requires that the example models provided with +# the kim-api package are installed. see the `./lib/kim/README` or +# `./lib/kim/Install.py` files for details on how to install these +# example models. +# + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +kim init LennardJones_Ar real +#=== BEGIN kim init ========================================== +units real +neighbor 2.0 bin # Angstroms +timestep 1.0 # femtoseconds + +This model has No mutable parameters. +#=== END kim init ============================================ + + +lattice fcc 4.4300 +Lattice spacing in x,y,z = 4.4300000 4.4300000 4.4300000 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (88.600000 88.600000 88.600000) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.001 seconds + +kim interactions Ar +#=== BEGIN kim interactions ================================== +pair_style kim LennardJones_Ar +WARNING: KIM Model does not provide 'partialParticleEnergy'; energy per atom will be zero (src/KIM/pair_kim.cpp:1139) +WARNING: KIM Model does not provide 'partialParticleVirial'; virial per atom will be zero (src/KIM/pair_kim.cpp:1145) +pair_coeff * * Ar +#=== END kim interactions ==================================== + + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 +Neighbor list info ... 
+ update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 8.45 + ghost atom cutoff = 8.45 + binsize = 4.225, bins = 21 21 21 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair kim, perpetual + attributes: full, newton off, cut 8.450000000000001 + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 9.791 | 9.791 | 9.791 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 145069.63 0 164146.22 128015.94 + 100 95.179703 154939.42 0 164017.94 131602.75 +Loop time of 0.857614 on 4 procs for 100 steps with 32000 atoms + +Performance: 10.074 ns/day, 2.382 hours/ns, 116.603 timesteps/s +99.6% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.73048 | 0.73398 | 0.73855 | 0.3 | 85.58 +Neigh | 0.083739 | 0.083964 | 0.084335 | 0.1 | 9.79 +Comm | 0.017996 | 0.022912 | 0.026515 | 2.1 | 2.67 +Output | 2.7e-05 | 3.5e-05 | 4.5e-05 | 0.0 | 0.00 +Modify | 0.010073 | 0.010158 | 0.010271 | 0.1 | 1.18 +Other | | 0.006571 | | | 0.77 + +Nlocal: 8000.00 ave 8018 max 7967 min +Histogram: 1 0 0 0 0 0 1 0 0 2 +Nghost: 9131.00 ave 9164 max 9113 min +Histogram: 2 0 0 1 0 0 0 0 0 1 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +FullNghs: 1.06344e+06 ave 1.06594e+06 max 1.05881e+06 min +Histogram: 1 0 0 0 0 0 1 0 0 2 + +Total # of neighbors = 4253750 +Ave neighs/atom = 132.92969 +Neighbor list builds = 3 +Dangerous builds = 0 +Total wall time: 0:00:00 diff --git a/examples/kim/log.10Feb21.in.kim-pm-property.clang.1 b/examples/kim/log.10Feb21.in.kim-pm-property.clang.1 new file mode 100644 index 0000000000..a00085a486 --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-pm-property.clang.1 @@ -0,0 +1,223 @@ +LAMMPS (10 Feb 2021) +# kim property example +# +# For detailed information of this example please refer to: +# `https://openkim.org/doc/evaluation/tutorial-lammps/` +# +# Description: +# +# This example is designed to calculate the cohesive energy corresponding to +# the equilibrium FCC lattice constant for +# `LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004` model for +# argon. The material properties computed in LAMMPS are represented as a +# standard KIM property instance format. (See +# `https://openkim.org/doc/schema/properties-framework/` and +# `https://lammps.sandia.gov/doc/kim_commands.html` for further details). +# Then the created property instance is written to a file named `results.edn` +# using the `kim property dump` command. +# +# Requirement: +# +# This example requires LAMMPS built with the Python 3.6 or later package +# installed. See the `https://lammps.sandia.gov/doc/python.html` doc page for +# more info on building LAMMPS with the version of Python on your system. +# After successfully building LAMMPS with Python, you need to install the +# kim-property Python package, See the +# `https://lammps.sandia.gov/doc/Build_extras.html#kim` doc page for +# further details. +# +# This example requires that the KIM Portable Model (PM) +# `LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004` +# is installed. This can be done with the command +# kim-api-collections-management install user LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004 +# If this command does not work, you may need to setup your PATH to find the utility. 
+# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should relplace X.Y.Z with the appropriate kim-api version number). +# +# Or, see `https://openkim.org/doc/obtaining-models` for alternative options. +# + +# Initialize interatomic potential (KIM model) and units +atom_style atomic + +# Set the OpenKIM model that will be used +kim init LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004 metal +#=== BEGIN kim init ========================================== +units metal +neighbor 2.0 bin # Angstroms +timestep 1.0e-3 # picoseconds + +This model has 3 mutable parameters. + No. | Parameter name | data type | extent +----------------------------------------------------- + 1 | cutoff | "Double" | 1 + 2 | epsilon | "Double" | 1 + 3 | sigma | "Double" | 1 +#=== END kim init ============================================ + + +# the equilibrium lattice constant for the fcc structure +variable lattice_constant equal 5.248509056866169 + +# Periodic boundary conditions along all three dimensions +boundary p p p + +# Create an FCC lattice with the lattice spacing +# using a single conventional (orthogonal) unit cell +lattice fcc ${lattice_constant} +lattice fcc 5.24850905686617 +Lattice spacing in x,y,z = 5.2485091 5.2485091 5.2485091 +region box block 0 1 0 1 0 1 units lattice +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (5.2485091 5.2485091 5.2485091) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 4 atoms + create_atoms CPU = 0.000 seconds +mass 1 39.948 + +# Specify the KIM interactions +kim interactions Ar +#=== BEGIN kim interactions ================================== +pair_style kim LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004 +pair_coeff * * Ar +#=== END kim interactions ==================================== + + +# Compute energy +run 0 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Model originally published in \cite{MO_126566794224_004a} is archived in OpenKIM~\cite{MO_126566794224_004, MD_498634107543_004, tadmor:elliott:2011, elliott:tadmor:2011}. +\bibliographystyle{vancouver} +\bibliography{kimcite-MO_126566794224_004.bib} +\end{document} +} + +@Misc{MO_126566794224_004, + author = {Ellad Tadmor}, + title = {{L}ennard-{J}ones model (shifted) for {A}r with parameters from {B}ernardes (1958) (medium precision cutoff) v004}, + doi = {10.25950/9f98b989}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/9f98b989}}, + keywords = {OpenKIM, Model, MO_126566794224_004}, + publisher = {OpenKIM}, + year = 2020, +} + +@Misc{MD_498634107543_004, + author = {Ellad Tadmor}, + title = {{D}river for the {L}ennard-{J}ones model uniformly shifted to have zero energy at the cutoff radius v004}, + doi = {10.25950/bdffd6a6}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/9f98b989}}, + keywords = {OpenKIM, Model Driver, MD_498634107543_004}, + publisher = {OpenKIM}, + year = 2020, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. 
Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{MO_126566794224_004a, + author = {Newton Bernardes}, + doi = {10.1103/PhysRev.112.1534}, + issue = {5}, + journal = {Physical Review}, + pages = {1534--1539}, + publisher = {American Physical Society}, + title = {Theory of Solid {N}e, {A}, {K}r, and {X}e at 0{K}}, + volume = {112}, + year = {1958}, +} +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Neighbor list info ... + update every 1 steps, delay 10 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 15.5 + ghost atom cutoff = 15.5 + binsize = 7.75, bins = 1 1 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair kim, perpetual + attributes: full, newton off, cut 15.5 + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.119 | 3.119 | 3.119 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 0 -0.34602203 0 -0.34602203 0.00061471244 +Loop time of 0 on 1 procs for 0 steps with 4 atoms + +0.0% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0 | 0 | 0 | 0.0 | 0.00 +Output | 0 | 0 | 0 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 0 | | | 0.00 + +Nlocal: 4.00000 ave 4 max 4 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 1094.00 ave 1094 max 1094 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 1712.00 ave 1712 max 1712 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 1712 +Ave neighs/atom = 428.00000 +Neighbor list builds = 0 +Dangerous builds = 0 + +# Get cohesive energy +variable natoms equal "count(all)" +variable ecohesive equal "-pe/v_natoms" + +# Create a property instance +kim property create 1 cohesive-potential-energy-cubic-crystal +#=== kim property =========================================== + +# Set all the key-value pairs for this property instance +kim property modify 1 key short-name source-value 1 fcc key species source-value 1 Ar key a source-value ${lattice_constant} source-unit angstrom key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 source-value 2 1:3 0.0 0.5 0.5 source-value 3 1:3 0.5 0.0 0.5 source-value 4 1:3 0.5 0.5 0.0 key space-group source-value Fm-3m key cohesive-potential-energy source-value ${ecohesive} source-unit eV +kim property modify 1 key short-name source-value 1 fcc key species source-value 1 Ar key a source-value 5.24850905686617 source-unit angstrom key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 source-value 2 1:3 0.0 0.5 0.5 source-value 3 1:3 0.5 0.0 0.5 source-value 4 1:3 0.5 0.5 0.0 key space-group source-value Fm-3m key cohesive-potential-energy source-value ${ecohesive} source-unit eV +kim property modify 1 key short-name source-value 
1 fcc key species source-value 1 Ar key a source-value 5.24850905686617 source-unit angstrom key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 source-value 2 1:3 0.0 0.5 0.5 source-value 3 1:3 0.5 0.0 0.5 source-value 4 1:3 0.5 0.5 0.0 key space-group source-value Fm-3m key cohesive-potential-energy source-value 0.0865055084950546 source-unit eV +#=== kim property =========================================== + +# Dump the results in a file +kim property dump "results.edn" +#=== kim property =========================================== +Total wall time: 0:00:00 diff --git a/examples/kim/log.10Feb21.in.kim-pm-property.clang.4 b/examples/kim/log.10Feb21.in.kim-pm-property.clang.4 new file mode 100644 index 0000000000..c3dd234af2 --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-pm-property.clang.4 @@ -0,0 +1,223 @@ +LAMMPS (10 Feb 2021) +# kim property example +# +# For detailed information of this example please refer to: +# `https://openkim.org/doc/evaluation/tutorial-lammps/` +# +# Description: +# +# This example is designed to calculate the cohesive energy corresponding to +# the equilibrium FCC lattice constant for +# `LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004` model for +# argon. The material properties computed in LAMMPS are represented as a +# standard KIM property instance format. (See +# `https://openkim.org/doc/schema/properties-framework/` and +# `https://lammps.sandia.gov/doc/kim_commands.html` for further details). +# Then the created property instance is written to a file named `results.edn` +# using the `kim property dump` command. +# +# Requirement: +# +# This example requires LAMMPS built with the Python 3.6 or later package +# installed. See the `https://lammps.sandia.gov/doc/python.html` doc page for +# more info on building LAMMPS with the version of Python on your system. +# After successfully building LAMMPS with Python, you need to install the +# kim-property Python package, See the +# `https://lammps.sandia.gov/doc/Build_extras.html#kim` doc page for +# further details. +# +# This example requires that the KIM Portable Model (PM) +# `LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004` +# is installed. This can be done with the command +# kim-api-collections-management install user LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004 +# If this command does not work, you may need to setup your PATH to find the utility. +# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should relplace X.Y.Z with the appropriate kim-api version number). +# +# Or, see `https://openkim.org/doc/obtaining-models` for alternative options. +# + +# Initialize interatomic potential (KIM model) and units +atom_style atomic + +# Set the OpenKIM model that will be used +kim init LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004 metal +#=== BEGIN kim init ========================================== +units metal +neighbor 2.0 bin # Angstroms +timestep 1.0e-3 # picoseconds + +This model has 3 mutable parameters. + No. 
| Parameter name | data type | extent +----------------------------------------------------- + 1 | cutoff | "Double" | 1 + 2 | epsilon | "Double" | 1 + 3 | sigma | "Double" | 1 +#=== END kim init ============================================ + + +# the equilibrium lattice constant for the fcc structure +variable lattice_constant equal 5.248509056866169 + +# Periodic boundary conditions along all three dimensions +boundary p p p + +# Create an FCC lattice with the lattice spacing +# using a single conventional (orthogonal) unit cell +lattice fcc ${lattice_constant} +lattice fcc 5.24850905686617 +Lattice spacing in x,y,z = 5.2485091 5.2485091 5.2485091 +region box block 0 1 0 1 0 1 units lattice +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (5.2485091 5.2485091 5.2485091) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box +Created 4 atoms + create_atoms CPU = 0.000 seconds +mass 1 39.948 + +# Specify the KIM interactions +kim interactions Ar +#=== BEGIN kim interactions ================================== +pair_style kim LJ_Shifted_Bernardes_1958MedCutoff_Ar__MO_126566794224_004 +pair_coeff * * Ar +#=== END kim interactions ==================================== + + +# Compute energy +run 0 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Model originally published in \cite{MO_126566794224_004a} is archived in OpenKIM~\cite{MO_126566794224_004, MD_498634107543_004, tadmor:elliott:2011, elliott:tadmor:2011}. +\bibliographystyle{vancouver} +\bibliography{kimcite-MO_126566794224_004.bib} +\end{document} +} + +@Misc{MO_126566794224_004, + author = {Ellad Tadmor}, + title = {{L}ennard-{J}ones model (shifted) for {A}r with parameters from {B}ernardes (1958) (medium precision cutoff) v004}, + doi = {10.25950/9f98b989}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/9f98b989}}, + keywords = {OpenKIM, Model, MO_126566794224_004}, + publisher = {OpenKIM}, + year = 2020, +} + +@Misc{MD_498634107543_004, + author = {Ellad Tadmor}, + title = {{D}river for the {L}ennard-{J}ones model uniformly shifted to have zero energy at the cutoff radius v004}, + doi = {10.25950/bdffd6a6}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/9f98b989}}, + keywords = {OpenKIM, Model Driver, MD_498634107543_004}, + publisher = {OpenKIM}, + year = 2020, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{MO_126566794224_004a, + author = {Newton Bernardes}, + doi = {10.1103/PhysRev.112.1534}, + issue = {5}, + journal = {Physical Review}, + pages = {1534--1539}, + publisher = {American Physical Society}, + title = {Theory of Solid {N}e, {A}, {K}r, and {X}e at 0{K}}, + volume = {112}, + year = {1958}, +} +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Neighbor list info ... 
+ update every 1 steps, delay 10 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 15.5 + ghost atom cutoff = 15.5 + binsize = 7.75, bins = 1 1 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair kim, perpetual + attributes: full, newton off, cut 15.5 + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.165 | 3.165 | 3.165 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 0 -0.34602203 0 -0.34602203 0.00061471244 +Loop time of 1.5e-06 on 4 procs for 0 steps with 4 atoms + +100.0% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0 | 0 | 0 | 0.0 | 0.00 +Output | 0 | 0 | 0 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 1.5e-06 | | |100.00 + +Nlocal: 1.00000 ave 1 max 1 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Nghost: 935.000 ave 935 max 935 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +FullNghs: 428.000 ave 428 max 428 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 1712 +Ave neighs/atom = 428.00000 +Neighbor list builds = 0 +Dangerous builds = 0 + +# Get cohesive energy +variable natoms equal "count(all)" +variable ecohesive equal "-pe/v_natoms" + +# Create a property instance +kim property create 1 cohesive-potential-energy-cubic-crystal +#=== kim property =========================================== + +# Set all the key-value pairs for this property instance +kim property modify 1 key short-name source-value 1 fcc key species source-value 1 Ar key a source-value ${lattice_constant} source-unit angstrom key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 source-value 2 1:3 0.0 0.5 0.5 source-value 3 1:3 0.5 0.0 0.5 source-value 4 1:3 0.5 0.5 0.0 key space-group source-value Fm-3m key cohesive-potential-energy source-value ${ecohesive} source-unit eV +kim property modify 1 key short-name source-value 1 fcc key species source-value 1 Ar key a source-value 5.24850905686617 source-unit angstrom key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 source-value 2 1:3 0.0 0.5 0.5 source-value 3 1:3 0.5 0.0 0.5 source-value 4 1:3 0.5 0.5 0.0 key space-group source-value Fm-3m key cohesive-potential-energy source-value ${ecohesive} source-unit eV +kim property modify 1 key short-name source-value 1 fcc key species source-value 1 Ar key a source-value 5.24850905686617 source-unit angstrom key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 source-value 2 1:3 0.0 0.5 0.5 source-value 3 1:3 0.5 0.0 0.5 source-value 4 1:3 0.5 0.5 0.0 key space-group source-value Fm-3m key cohesive-potential-energy source-value 0.0865055084950538 source-unit eV +#=== kim property =========================================== + +# Dump the results in a file +kim property dump "results.edn" +#=== kim property =========================================== +Total wall time: 0:00:00 diff --git a/examples/kim/log.10Feb21.in.kim-pm-query.melt.clang.1 b/examples/kim/log.10Feb21.in.kim-pm-query.melt.clang.1 new file mode 100644 index 0000000000..be12cda3da --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-pm-query.melt.clang.1 @@ -0,0 +1,210 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt +# +# This example requires that the KIM Portable Model (PM) +# 
`SW_StillingerWeber_1985_Si__MO_405512056662_005` +# is installed. This can be done with the command +# kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 +# If this command does not work, you may need to setup your PATH to find the utility. +# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should relplace X.Y.Z with the appropriate kim-api version number). +# +# Or, see `https://openkim.org/doc/obtaining-models` for alternative options. +# + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 real +#=== BEGIN kim init ========================================== +units real +neighbor 2.0 bin # Angstroms +timestep 1.0 # femtoseconds + +This model has 9 mutable parameters. + No. | Parameter name | data type | extent +----------------------------------------------------- + 1 | A | "Double" | 1 + 2 | B | "Double" | 1 + 3 | p | "Double" | 1 + 4 | q | "Double" | 1 + 5 | sigma | "Double" | 1 + 6 | gamma | "Double" | 1 + 7 | cutoff | "Double" | 1 + 8 | lambda | "Double" | 1 + 9 | costheta0 | "Double" | 1 +#=== END kim init ============================================ + +kim query a0 get_lattice_constant_cubic crystal=["fcc"] species=["Si"] units=["angstrom"] +#=== BEGIN kim-query ========================================= +variable a0 string "4.146581932902336" +#=== END kim-query =========================================== + + +lattice fcc ${a0} +lattice fcc 4.146581932902336 +Lattice spacing in x,y,z = 4.1465819 4.1465819 4.1465819 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (82.931639 82.931639 82.931639) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.004 seconds + +kim interactions Si +#=== BEGIN kim interactions ================================== +pair_style kim SW_StillingerWeber_1985_Si__MO_405512056662_005 +pair_coeff * * Si +#=== END kim interactions ==================================== + + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Model originally published in \cite{MO_405512056662_005a, MO_405512056662_005b} is archived in OpenKIM~\cite{MO_405512056662_005, MD_335816936951_004, tadmor:elliott:2011, elliott:tadmor:2011}. 
+\bibliographystyle{vancouver} +\bibliography{kimcite-MO_405512056662_005.bib} +\end{document} +} + +@Misc{MO_405512056662_005, + author = {Amit K Singh}, + title = {{S}tillinger-{W}eber potential for {S}i due to {S}tillinger and {W}eber (1985) v005}, + doi = {10.25950/c74b293f}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/c74b293f}}, + keywords = {OpenKIM, Model, MO_405512056662_005}, + publisher = {OpenKIM}, + year = 2018, +} + +@Misc{MD_335816936951_004, + author = {Mingjian Wen}, + title = {{S}tillinger-{W}eber ({SW}) {M}odel {D}river v004}, + doi = {10.25950/f3abd2d6}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/c74b293f}}, + keywords = {OpenKIM, Model Driver, MD_335816936951_004}, + publisher = {OpenKIM}, + year = 2018, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{MO_405512056662_005a, + author = {Stillinger, Frank H. and Weber, Thomas A.}, + doi = {10.1103/PhysRevB.31.5262}, + issue = {8}, + journal = {Physical Review B}, + month = {Apr}, + pages = {5262--5271}, + publisher = {American Physical Society}, + title = {Computer simulation of local order in condensed phases of silicon}, + volume = {31}, + year = {1985}, +} + +@Book{MO_405512056662_005b, + author = {Tadmor, Ellad B. and Miller, Ronald E.}, + doi = {10.1017/CBO9781139003582}, + publisher = {Cambridge University Press}, + title = {Modeling Materials: {C}ontinuum, Atomistic and Multiscale Techniques}, + year = {2011}, +} +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Neighbor list info ... 
+ update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 4.07118 + ghost atom cutoff = 4.07118 + binsize = 2.03559, bins = 41 41 41 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair kim, perpetual + attributes: full, newton off, cut 4.07118 + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 10.36 | 10.36 | 10.36 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 -126084.25 0 -107007.66 1528.8768 + 100 94.450495 -116016.03 0 -107007.07 2282.2685 +Loop time of 18.2886 on 1 procs for 100 steps with 32000 atoms + +Performance: 0.472 ns/day, 50.802 hours/ns, 5.468 timesteps/s +99.9% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 18.155 | 18.155 | 18.155 | 0.0 | 99.27 +Neigh | 0.087194 | 0.087194 | 0.087194 | 0.0 | 0.48 +Comm | 0.009477 | 0.009477 | 0.009477 | 0.0 | 0.05 +Output | 6.7e-05 | 6.7e-05 | 6.7e-05 | 0.0 | 0.00 +Modify | 0.02616 | 0.02616 | 0.02616 | 0.0 | 0.14 +Other | | 0.0111 | | | 0.06 + +Nlocal: 32000.0 ave 32000 max 32000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 9667.00 ave 9667 max 9667 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 450192.0 ave 450192 max 450192 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 450192 +Ave neighs/atom = 14.068500 +Neighbor list builds = 3 +Dangerous builds = 0 +Total wall time: 0:00:21 diff --git a/examples/kim/log.10Feb21.in.kim-pm-query.melt.clang.4 b/examples/kim/log.10Feb21.in.kim-pm-query.melt.clang.4 new file mode 100644 index 0000000000..f982e79425 --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-pm-query.melt.clang.4 @@ -0,0 +1,210 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt +# +# This example requires that the KIM Portable Model (PM) +# `SW_StillingerWeber_1985_Si__MO_405512056662_005` +# is installed. This can be done with the command +# kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 +# If this command does not work, you may need to setup your PATH to find the utility. +# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should relplace X.Y.Z with the appropriate kim-api version number). +# +# Or, see `https://openkim.org/doc/obtaining-models` for alternative options. +# + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 real +#=== BEGIN kim init ========================================== +units real +neighbor 2.0 bin # Angstroms +timestep 1.0 # femtoseconds + +This model has 9 mutable parameters. + No. 
| Parameter name | data type | extent +----------------------------------------------------- + 1 | A | "Double" | 1 + 2 | B | "Double" | 1 + 3 | p | "Double" | 1 + 4 | q | "Double" | 1 + 5 | sigma | "Double" | 1 + 6 | gamma | "Double" | 1 + 7 | cutoff | "Double" | 1 + 8 | lambda | "Double" | 1 + 9 | costheta0 | "Double" | 1 +#=== END kim init ============================================ + +kim query a0 get_lattice_constant_cubic crystal=["fcc"] species=["Si"] units=["angstrom"] +#=== BEGIN kim-query ========================================= +variable a0 string "4.146581932902336" +#=== END kim-query =========================================== + + +lattice fcc ${a0} +lattice fcc 4.146581932902336 +Lattice spacing in x,y,z = 4.1465819 4.1465819 4.1465819 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (82.931639 82.931639 82.931639) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.001 seconds + +kim interactions Si +#=== BEGIN kim interactions ================================== +pair_style kim SW_StillingerWeber_1985_Si__MO_405512056662_005 +pair_coeff * * Si +#=== END kim interactions ==================================== + + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Model originally published in \cite{MO_405512056662_005a, MO_405512056662_005b} is archived in OpenKIM~\cite{MO_405512056662_005, MD_335816936951_004, tadmor:elliott:2011, elliott:tadmor:2011}. +\bibliographystyle{vancouver} +\bibliography{kimcite-MO_405512056662_005.bib} +\end{document} +} + +@Misc{MO_405512056662_005, + author = {Amit K Singh}, + title = {{S}tillinger-{W}eber potential for {S}i due to {S}tillinger and {W}eber (1985) v005}, + doi = {10.25950/c74b293f}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/c74b293f}}, + keywords = {OpenKIM, Model, MO_405512056662_005}, + publisher = {OpenKIM}, + year = 2018, +} + +@Misc{MD_335816936951_004, + author = {Mingjian Wen}, + title = {{S}tillinger-{W}eber ({SW}) {M}odel {D}river v004}, + doi = {10.25950/f3abd2d6}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/c74b293f}}, + keywords = {OpenKIM, Model Driver, MD_335816936951_004}, + publisher = {OpenKIM}, + year = 2018, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{MO_405512056662_005a, + author = {Stillinger, Frank H. 
and Weber, Thomas A.}, + doi = {10.1103/PhysRevB.31.5262}, + issue = {8}, + journal = {Physical Review B}, + month = {Apr}, + pages = {5262--5271}, + publisher = {American Physical Society}, + title = {Computer simulation of local order in condensed phases of silicon}, + volume = {31}, + year = {1985}, +} + +@Book{MO_405512056662_005b, + author = {Tadmor, Ellad B. and Miller, Ronald E.}, + doi = {10.1017/CBO9781139003582}, + publisher = {Cambridge University Press}, + title = {Modeling Materials: {C}ontinuum, Atomistic and Multiscale Techniques}, + year = {2011}, +} +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Neighbor list info ... + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 4.07118 + ghost atom cutoff = 4.07118 + binsize = 2.03559, bins = 41 41 41 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair kim, perpetual + attributes: full, newton off, cut 4.07118 + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.489 | 3.489 | 3.489 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 -126084.25 0 -107007.66 1528.8768 + 100 94.450495 -116016.03 0 -107007.07 2282.2685 +Loop time of 5.00432 on 4 procs for 100 steps with 32000 atoms + +Performance: 1.727 ns/day, 13.901 hours/ns, 19.983 timesteps/s +99.7% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 4.9281 | 4.9366 | 4.9447 | 0.3 | 98.65 +Neigh | 0.02399 | 0.024135 | 0.024318 | 0.1 | 0.48 +Comm | 0.020646 | 0.029014 | 0.037515 | 4.3 | 0.58 +Output | 2.9e-05 | 3.325e-05 | 4.2e-05 | 0.0 | 0.00 +Modify | 0.008808 | 0.0088445 | 0.00888 | 0.0 | 0.18 +Other | | 0.005691 | | | 0.11 + +Nlocal: 8000.00 ave 8029 max 7968 min +Histogram: 1 1 0 0 0 0 0 0 0 2 +Nghost: 4259.00 ave 4303 max 4202 min +Histogram: 1 0 0 0 0 0 2 0 0 1 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +FullNghs: 112548.0 ave 113091 max 111995 min +Histogram: 1 0 0 1 0 0 0 1 0 1 + +Total # of neighbors = 450192 +Ave neighs/atom = 14.068500 +Neighbor list builds = 3 +Dangerous builds = 0 +Total wall time: 0:00:07 diff --git a/examples/kim/log.10Feb21.in.kim-pm.melt.clang.1 b/examples/kim/log.10Feb21.in.kim-pm.melt.clang.1 new file mode 100644 index 0000000000..f27f8e6d83 --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-pm.melt.clang.1 @@ -0,0 +1,204 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt +# +# This example requires that the KIM Portable Model (PM) +# `SW_StillingerWeber_1985_Si__MO_405512056662_005` +# is installed. This can be done with the command +# kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 +# If this command does not work, you may need to setup your PATH to find the utility. +# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should relplace X.Y.Z with the appropriate kim-api version number). 
+# +# Or, see `https://openkim.org/doc/obtaining-models` for alternative options. +# + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 real +#=== BEGIN kim init ========================================== +units real +neighbor 2.0 bin # Angstroms +timestep 1.0 # femtoseconds + +This model has 9 mutable parameters. + No. | Parameter name | data type | extent +----------------------------------------------------- + 1 | A | "Double" | 1 + 2 | B | "Double" | 1 + 3 | p | "Double" | 1 + 4 | q | "Double" | 1 + 5 | sigma | "Double" | 1 + 6 | gamma | "Double" | 1 + 7 | cutoff | "Double" | 1 + 8 | lambda | "Double" | 1 + 9 | costheta0 | "Double" | 1 +#=== END kim init ============================================ + + +lattice fcc 4.4300 +Lattice spacing in x,y,z = 4.4300000 4.4300000 4.4300000 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (88.600000 88.600000 88.600000) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.002 seconds + +kim interactions Si +#=== BEGIN kim interactions ================================== +pair_style kim SW_StillingerWeber_1985_Si__MO_405512056662_005 +pair_coeff * * Si +#=== END kim interactions ==================================== + + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Model originally published in \cite{MO_405512056662_005a, MO_405512056662_005b} is archived in OpenKIM~\cite{MO_405512056662_005, MD_335816936951_004, tadmor:elliott:2011, elliott:tadmor:2011}. +\bibliographystyle{vancouver} +\bibliography{kimcite-MO_405512056662_005.bib} +\end{document} +} + +@Misc{MO_405512056662_005, + author = {Amit K Singh}, + title = {{S}tillinger-{W}eber potential for {S}i due to {S}tillinger and {W}eber (1985) v005}, + doi = {10.25950/c74b293f}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/c74b293f}}, + keywords = {OpenKIM, Model, MO_405512056662_005}, + publisher = {OpenKIM}, + year = 2018, +} + +@Misc{MD_335816936951_004, + author = {Mingjian Wen}, + title = {{S}tillinger-{W}eber ({SW}) {M}odel {D}river v004}, + doi = {10.25950/f3abd2d6}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/c74b293f}}, + keywords = {OpenKIM, Model Driver, MD_335816936951_004}, + publisher = {OpenKIM}, + year = 2018, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. 
Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{MO_405512056662_005a, + author = {Stillinger, Frank H. and Weber, Thomas A.}, + doi = {10.1103/PhysRevB.31.5262}, + issue = {8}, + journal = {Physical Review B}, + month = {Apr}, + pages = {5262--5271}, + publisher = {American Physical Society}, + title = {Computer simulation of local order in condensed phases of silicon}, + volume = {31}, + year = {1985}, +} + +@Book{MO_405512056662_005b, + author = {Tadmor, Ellad B. and Miller, Ronald E.}, + doi = {10.1017/CBO9781139003582}, + publisher = {Cambridge University Press}, + title = {Modeling Materials: {C}ontinuum, Atomistic and Multiscale Techniques}, + year = {2011}, +} +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Neighbor list info ... + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 4.07118 + ghost atom cutoff = 4.07118 + binsize = 2.03559, bins = 44 44 44 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair kim, perpetual + attributes: full, newton off, cut 4.07118 + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 10.44 | 10.44 | 10.44 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 -85249.847 0 -66173.259 -33302.387 + 100 253.43357 -90346.68 0 -66173.441 -14888.698 +Loop time of 17.7449 on 1 procs for 100 steps with 32000 atoms + +Performance: 0.487 ns/day, 49.291 hours/ns, 5.635 timesteps/s +99.9% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 17.64 | 17.64 | 17.64 | 0.0 | 99.41 +Neigh | 0.060149 | 0.060149 | 0.060149 | 0.0 | 0.34 +Comm | 0.008585 | 0.008585 | 0.008585 | 0.0 | 0.05 +Output | 6.3e-05 | 6.3e-05 | 6.3e-05 | 0.0 | 0.00 +Modify | 0.025324 | 0.025324 | 0.025324 | 0.0 | 0.14 +Other | | 0.01057 | | | 0.06 + +Nlocal: 32000.0 ave 32000 max 32000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 7760.00 ave 7760 max 7760 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 402352.0 ave 402352 max 402352 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 402352 +Ave neighs/atom = 12.573500 +Neighbor list builds = 4 +Dangerous builds = 0 +Total wall time: 0:00:17 diff --git a/examples/kim/log.10Feb21.in.kim-pm.melt.clang.4 b/examples/kim/log.10Feb21.in.kim-pm.melt.clang.4 new file mode 100644 index 0000000000..2107e3f876 --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-pm.melt.clang.4 @@ -0,0 +1,204 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt +# +# This example requires that the KIM Portable Model (PM) +# `SW_StillingerWeber_1985_Si__MO_405512056662_005` +# is installed. This can be done with the command +# kim-api-collections-management install user SW_StillingerWeber_1985_Si__MO_405512056662_005 +# If this command does not work, you may need to setup your PATH to find the utility. 
+# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should relplace X.Y.Z with the appropriate kim-api version number). +# +# Or, see `https://openkim.org/doc/obtaining-models` for alternative options. +# + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 real +#=== BEGIN kim init ========================================== +units real +neighbor 2.0 bin # Angstroms +timestep 1.0 # femtoseconds + +This model has 9 mutable parameters. + No. | Parameter name | data type | extent +----------------------------------------------------- + 1 | A | "Double" | 1 + 2 | B | "Double" | 1 + 3 | p | "Double" | 1 + 4 | q | "Double" | 1 + 5 | sigma | "Double" | 1 + 6 | gamma | "Double" | 1 + 7 | cutoff | "Double" | 1 + 8 | lambda | "Double" | 1 + 9 | costheta0 | "Double" | 1 +#=== END kim init ============================================ + + +lattice fcc 4.4300 +Lattice spacing in x,y,z = 4.4300000 4.4300000 4.4300000 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (88.600000 88.600000 88.600000) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.001 seconds + +kim interactions Si +#=== BEGIN kim interactions ================================== +pair_style kim SW_StillingerWeber_1985_Si__MO_405512056662_005 +pair_coeff * * Si +#=== END kim interactions ==================================== + + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Model originally published in \cite{MO_405512056662_005a, MO_405512056662_005b} is archived in OpenKIM~\cite{MO_405512056662_005, MD_335816936951_004, tadmor:elliott:2011, elliott:tadmor:2011}. 
+\bibliographystyle{vancouver} +\bibliography{kimcite-MO_405512056662_005.bib} +\end{document} +} + +@Misc{MO_405512056662_005, + author = {Amit K Singh}, + title = {{S}tillinger-{W}eber potential for {S}i due to {S}tillinger and {W}eber (1985) v005}, + doi = {10.25950/c74b293f}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/c74b293f}}, + keywords = {OpenKIM, Model, MO_405512056662_005}, + publisher = {OpenKIM}, + year = 2018, +} + +@Misc{MD_335816936951_004, + author = {Mingjian Wen}, + title = {{S}tillinger-{W}eber ({SW}) {M}odel {D}river v004}, + doi = {10.25950/f3abd2d6}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/c74b293f}}, + keywords = {OpenKIM, Model Driver, MD_335816936951_004}, + publisher = {OpenKIM}, + year = 2018, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{MO_405512056662_005a, + author = {Stillinger, Frank H. and Weber, Thomas A.}, + doi = {10.1103/PhysRevB.31.5262}, + issue = {8}, + journal = {Physical Review B}, + month = {Apr}, + pages = {5262--5271}, + publisher = {American Physical Society}, + title = {Computer simulation of local order in condensed phases of silicon}, + volume = {31}, + year = {1985}, +} + +@Book{MO_405512056662_005b, + author = {Tadmor, Ellad B. and Miller, Ronald E.}, + doi = {10.1017/CBO9781139003582}, + publisher = {Cambridge University Press}, + title = {Modeling Materials: {C}ontinuum, Atomistic and Multiscale Techniques}, + year = {2011}, +} +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Neighbor list info ... 
+ update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 4.07118 + ghost atom cutoff = 4.07118 + binsize = 2.03559, bins = 44 44 44 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair kim, perpetual + attributes: full, newton off, cut 4.07118 + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.517 | 3.517 | 3.517 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 -85249.847 0 -66173.259 -33302.387 + 100 253.43357 -90346.68 0 -66173.441 -14888.698 +Loop time of 4.87378 on 4 procs for 100 steps with 32000 atoms + +Performance: 1.773 ns/day, 13.538 hours/ns, 20.518 timesteps/s +99.7% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 4.8075 | 4.816 | 4.8244 | 0.3 | 98.81 +Neigh | 0.015902 | 0.015996 | 0.016077 | 0.1 | 0.33 +Comm | 0.018078 | 0.026375 | 0.034752 | 4.2 | 0.54 +Output | 3e-05 | 3.5e-05 | 4.4e-05 | 0.0 | 0.00 +Modify | 0.009331 | 0.0094922 | 0.009588 | 0.1 | 0.19 +Other | | 0.005919 | | | 0.12 + +Nlocal: 8000.00 ave 8014 max 7988 min +Histogram: 1 1 0 0 0 0 1 0 0 1 +Nghost: 3374.75 ave 3389 max 3361 min +Histogram: 1 0 1 0 0 0 0 1 0 1 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +FullNghs: 100588.0 ave 100856 max 100392 min +Histogram: 1 0 1 0 1 0 0 0 0 1 + +Total # of neighbors = 402352 +Ave neighs/atom = 12.573500 +Neighbor list builds = 4 +Dangerous builds = 0 +Total wall time: 0:00:04 diff --git a/examples/kim/log.10Feb21.in.kim-query.clang.1 b/examples/kim/log.10Feb21.in.kim-query.clang.1 new file mode 100644 index 0000000000..01fc8cd7dd --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-query.clang.1 @@ -0,0 +1,655 @@ +LAMMPS (10 Feb 2021) +# kim query example +# +# Requirement: +# +# This example requires LAMMPS is built with KIM package. A requirement for +# the KIM package, is the KIM API library that must be downloaded from the +# OpenKIM website and installed before LAMMPS is compiled. The 'kim query' +# command requires the libcurl library to be installed. See the +# `https://lammps.sandia.gov/doc/Build_extras.html#kim` doc page for further +# details +# +# This example requires that the KIM Models +# `EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005` +# and +# `EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000` +# are installed. +# +# This can be done with the commands +# `kim-api-collections-management install user `EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005` +# `kim-api-collections-management install user `EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000` +# +# If these commands do not work, you may need to setup your PATH to find the utility. +# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should relplace X.Y.Z with the appropriate kim-api version number). +# +# Or, see https://openkim.org/doc/obtaining-models for alternative options. 
+# + +# ----------------------------------------------- +# Get an equilibrium fcc crystal lattice constant +# ----------------------------------------------- +kim init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal +#=== BEGIN kim init ========================================== +units metal +neighbor 2.0 bin # Angstroms +timestep 1.0e-3 # picoseconds + +This model has 6 mutable parameters. + No. | Parameter name | data type | extent +----------------------------------------------------- + 1 | cutoff | "Double" | 1 + 2 | deltaRho | "Double" | 1 + 3 | deltaR | "Double" | 1 + 4 | embeddingData | "Double" | 500 + 5 | rPhiData | "Double" | 500 + 6 | densityData | "Double" | 500 +#=== END kim init ============================================ + +kim query latconst_1 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] +#=== BEGIN kim-query ========================================= +variable latconst_1 string "4.032082033157349" +#=== END kim-query =========================================== + +print "FCC lattice constant (EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005) = ${latconst_1}" +FCC lattice constant (EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005) = 4.032082033157349 +# Get the lattice contant from a different model +kim query latconst_2 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005] +#=== BEGIN kim-query ========================================= +variable latconst_2 string "4.024845376610756" +#=== END kim-query =========================================== + +print "FCC lattice constant (EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005) = ${latconst_2}" +FCC lattice constant (EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005) = 4.024845376610756 +clear + + +# ----------------------------------------------- +# Get an equilibrium fcc crystal lattice constant +# ----------------------------------------------- +kim query latconst_1 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005] +#=== BEGIN kim-query ========================================= +variable latconst_1 string "4.032082033157349" +#=== END kim-query =========================================== + +kim query latconst_2 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005] +#=== BEGIN kim-query ========================================= +variable latconst_2 string "4.024845376610756" +#=== END kim-query =========================================== + +print "FCC lattice constant (EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005) = ${latconst_1}" +FCC lattice constant (EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005) = 4.032082033157349 +print "FCC lattice constant (EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005) = ${latconst_2}" +FCC lattice constant (EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005) = 4.024845376610756 +clear + + +# ----------------------------------------------- +# Get an equilibrium hcp crystal lattice constant +# ----------------------------------------------- +kim init EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000 metal +#=== BEGIN kim init ========================================== +units metal +neighbor 2.0 bin # Angstroms +timestep 1.0e-3 # picoseconds + +This model has 6 mutable parameters. + No. 
| Parameter name | data type | extent +----------------------------------------------------- + 1 | cutoff | "Double" | 1 + 2 | deltaRho | "Double" | 1 + 3 | deltaR | "Double" | 1 + 4 | embeddingData | "Double" | 10000 + 5 | rPhiData | "Double" | 10000 + 6 | densityData | "Double" | 10000 +#=== END kim init ============================================ + +kim query latconst split get_lattice_constant_hexagonal crystal=["hcp"] species=["Zr"] units=["angstrom"] +#=== BEGIN kim-query ========================================= +variable latconst_1 string 3.234055244384789 +variable latconst_2 string 5.167650199630013 +#=== END kim-query =========================================== + +print "HCP lattice constants = ${latconst_1}, ${latconst_2}" +HCP lattice constants = 3.234055244384789, 5.167650199630013 +clear + + +# ----------------------------------------------- +# Query for KIM models from openkim.org +# Get all the EAM models that support Al +# ----------------------------------------------- +kim query model index get_available_models species=[Al] potential_type=[eam] +#=== BEGIN kim-query ========================================= +variable model index "EAM_CubicNaturalSpline_ErcolessiAdams_1994_Al__MO_800509458712_002" "EAM_Dynamo_AngeloMoodyBaskes_1995_NiAlH__MO_418978237058_005" "EAM_Dynamo_CaiYe_1996_AlCu__MO_942551040047_005" "EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005" "EAM_Dynamo_FarkasJones_1996_NbTiAl__MO_042691367780_000" "EAM_Dynamo_JacobsenNorskovPuska_1987_Al__MO_411692133366_000" "EAM_Dynamo_LandaWynblattSiegel_2000_AlPb__MO_699137396381_005" "EAM_Dynamo_LiuAdams_1998_AlMg__MO_019873715786_000" "EAM_Dynamo_LiuErcolessiAdams_2004_Al__MO_051157671505_000" "EAM_Dynamo_LiuLiuBorucki_1999_AlCu__MO_020851069572_000" "EAM_Dynamo_LiuOhotnickyAdams_1997_AlMg__MO_559870613549_000" "EAM_Dynamo_MendelevAstaRahman_2009_AlMg__MO_658278549784_005" "EAM_Dynamo_MendelevFangYe_2015_AlSm__MO_338600200739_000" "EAM_Dynamo_MendelevKramerBecker_2008_Al__MO_106969701023_005" "EAM_Dynamo_MendelevSrolovitzAckland_2005_AlFe__MO_577453891941_005" "EAM_Dynamo_MishinFarkasMehl_1999_Al__MO_651801486679_005" "EAM_Dynamo_MishinMehlPapaconstantopoulos_2002_NiAl__MO_109933561507_005" "EAM_Dynamo_Mishin_2004_NiAl__MO_101214310689_005" "EAM_Dynamo_PunMishin_2009_NiAl__MO_751354403791_005" "EAM_Dynamo_PunYamakovMishin_2013_AlCo__MO_678952612413_000" "EAM_Dynamo_PunYamakovMishin_2013_NiAlCo__MO_826591359508_000" "EAM_Dynamo_SchopfBrommerFrigan_2012_AlMnPd__MO_137572817842_000" "EAM_Dynamo_SturgeonLaird_2000_Al__MO_120808805541_005" "EAM_Dynamo_VailheFarkas_1997_CoAl__MO_284963179498_005" "EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005" "EAM_Dynamo_Zhakhovsky_2009_Al__MO_519613893196_000" "EAM_Dynamo_ZhouJohnsonWadley_2004NISTretabulation_Al__MO_060567868558_000" "EAM_Dynamo_ZhouJohnsonWadley_2004_Al__MO_131650261510_005" "EAM_Dynamo_ZhouWadleyJohnson_2001_Al__MO_049243498555_000" "EAM_Dynamo_ZopeMishin_2003_Al__MO_664470114311_005" "EAM_Dynamo_ZopeMishin_2003_TiAl__MO_117656786760_005" "EAM_ErcolessiAdams_1994_Al__MO_324507536345_003" "EAM_IMD_BrommerGaehler_2006A_AlNiCo__MO_122703700223_003" "EAM_IMD_BrommerGaehler_2006B_AlNiCo__MO_128037485276_003" "EAM_IMD_SchopfBrommerFrigan_2012_AlMnPd__MO_878712978062_003" "EAM_QuinticClampedSpline_ErcolessiAdams_1994_Al__MO_450093727396_002" "EAM_QuinticHermiteSpline_ErcolessiAdams_1994_Al__MO_781138671863_002" "EMT_Asap_Standard_JacobsenStoltzeNorskov_1996_AlAgAuCuNiPdPt__MO_115316750986_001" 
"EMT_Asap_Standard_JacobsenStoltzeNorskov_1996_Al__MO_623376124862_001" +#=== END kim-query =========================================== + +label model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_CubicNaturalSpline_ErcolessiAdams_1994_Al__MO_800509458712_002] +#=== BEGIN kim-query ========================================= +variable latconst string "4.032082748413087" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_CubicNaturalSpline_ErcolessiAdams_1994_Al__MO_800509458712_002) = 4.032082748413087 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_AngeloMoodyBaskes_1995_NiAlH__MO_418978237058_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.050000071525574" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_AngeloMoodyBaskes_1995_NiAlH__MO_418978237058_005) = 4.050000071525574 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_CaiYe_1996_AlCu__MO_942551040047_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.049763545393944" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_CaiYe_1996_AlCu__MO_942551040047_005) = 4.049763545393944 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.032082033157349" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005) = 4.032082033157349 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_FarkasJones_1996_NbTiAl__MO_042691367780_000] +#=== BEGIN kim-query ========================================= +variable latconst string "3.869337007403374" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_FarkasJones_1996_NbTiAl__MO_042691367780_000) = 3.869337007403374 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] 
model=[EAM_Dynamo_JacobsenNorskovPuska_1987_Al__MO_411692133366_000] +#=== BEGIN kim-query ========================================= +variable latconst string "3.987558534741402" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_JacobsenNorskovPuska_1987_Al__MO_411692133366_000) = 3.987558534741402 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_LandaWynblattSiegel_2000_AlPb__MO_699137396381_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.031036108732224" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_LandaWynblattSiegel_2000_AlPb__MO_699137396381_005) = 4.031036108732224 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_LiuAdams_1998_AlMg__MO_019873715786_000] +#=== BEGIN kim-query ========================================= +variable latconst string "4.03203821182251" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_LiuAdams_1998_AlMg__MO_019873715786_000) = 4.03203821182251 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_LiuErcolessiAdams_2004_Al__MO_051157671505_000] +#=== BEGIN kim-query ========================================= +variable latconst string "9.5" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_LiuErcolessiAdams_2004_Al__MO_051157671505_000) = 9.5 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_LiuLiuBorucki_1999_AlCu__MO_020851069572_000] +#=== BEGIN kim-query ========================================= +variable latconst string "4.032073378562927" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_LiuLiuBorucki_1999_AlCu__MO_020851069572_000) = 4.032073378562927 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_LiuOhotnickyAdams_1997_AlMg__MO_559870613549_000] +#=== BEGIN kim-query ========================================= +variable latconst string "8.5" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_LiuOhotnickyAdams_1997_AlMg__MO_559870613549_000) = 8.5 +next model +jump SELF model_loop +kim query latconst 
get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_MendelevAstaRahman_2009_AlMg__MO_658278549784_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.045270472764969" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_MendelevAstaRahman_2009_AlMg__MO_658278549784_005) = 4.045270472764969 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_MendelevFangYe_2015_AlSm__MO_338600200739_000] +#=== BEGIN kim-query ========================================= +variable latconst string "4.040926471352577" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_MendelevFangYe_2015_AlSm__MO_338600200739_000) = 4.040926471352577 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_MendelevKramerBecker_2008_Al__MO_106969701023_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.045259781181811" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_MendelevKramerBecker_2008_Al__MO_106969701023_005) = 4.045259781181811 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_MendelevSrolovitzAckland_2005_AlFe__MO_577453891941_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.03330184519291" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_MendelevSrolovitzAckland_2005_AlFe__MO_577453891941_005) = 4.03330184519291 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_MishinFarkasMehl_1999_Al__MO_651801486679_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.050004702806472" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_MishinFarkasMehl_1999_Al__MO_651801486679_005) = 4.050004702806472 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_MishinMehlPapaconstantopoulos_2002_NiAl__MO_109933561507_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.051526293158533" 
+#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_MishinMehlPapaconstantopoulos_2002_NiAl__MO_109933561507_005) = 4.051526293158533 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_Mishin_2004_NiAl__MO_101214310689_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.049999862909317" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_Mishin_2004_NiAl__MO_101214310689_005) = 4.049999862909317 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_PunMishin_2009_NiAl__MO_751354403791_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.050000071525574" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_PunMishin_2009_NiAl__MO_751354403791_005) = 4.050000071525574 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_PunYamakovMishin_2013_AlCo__MO_678952612413_000] +#=== BEGIN kim-query ========================================= +variable latconst string "4.05000014603138" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_PunYamakovMishin_2013_AlCo__MO_678952612413_000) = 4.05000014603138 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_PunYamakovMishin_2013_NiAlCo__MO_826591359508_000] +#=== BEGIN kim-query ========================================= +variable latconst string "4.05000014603138" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_PunYamakovMishin_2013_NiAlCo__MO_826591359508_000) = 4.05000014603138 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_SchopfBrommerFrigan_2012_AlMnPd__MO_137572817842_000] +#=== BEGIN kim-query ========================================= +variable latconst string "4.210718545317654" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_SchopfBrommerFrigan_2012_AlMnPd__MO_137572817842_000) = 4.210718545317654 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst 
get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_SturgeonLaird_2000_Al__MO_120808805541_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.050010219216347" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_SturgeonLaird_2000_Al__MO_120808805541_005) = 4.050010219216347 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_VailheFarkas_1997_CoAl__MO_284963179498_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.049696564674378" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_VailheFarkas_1997_CoAl__MO_284963179498_005) = 4.049696564674378 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.024845376610756" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_WineyKubotaGupta_2010_Al__MO_149316865608_005) = 4.024845376610756 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_Zhakhovsky_2009_Al__MO_519613893196_000] +#=== BEGIN kim-query ========================================= +variable latconst string "4.031999975442885" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_Zhakhovsky_2009_Al__MO_519613893196_000) = 4.031999975442885 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_ZhouJohnsonWadley_2004NISTretabulation_Al__MO_060567868558_000] +#=== BEGIN kim-query ========================================= +variable latconst string "4.050199627876282" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_ZhouJohnsonWadley_2004NISTretabulation_Al__MO_060567868558_000) = 4.050199627876282 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_ZhouJohnsonWadley_2004_Al__MO_131650261510_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.050180745124819" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant 
(EAM_Dynamo_ZhouJohnsonWadley_2004_Al__MO_131650261510_005) = 4.050180745124819 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_ZhouWadleyJohnson_2001_Al__MO_049243498555_000] +#=== BEGIN kim-query ========================================= +variable latconst string "4.081654928624631" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_ZhouWadleyJohnson_2001_Al__MO_049243498555_000) = 4.081654928624631 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_ZopeMishin_2003_Al__MO_664470114311_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.050000011920929" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_ZopeMishin_2003_Al__MO_664470114311_005) = 4.050000011920929 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_ZopeMishin_2003_TiAl__MO_117656786760_005] +#=== BEGIN kim-query ========================================= +variable latconst string "4.049999445676804" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_Dynamo_ZopeMishin_2003_TiAl__MO_117656786760_005) = 4.049999445676804 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_ErcolessiAdams_1994_Al__MO_324507536345_003] +#=== BEGIN kim-query ========================================= +variable latconst string "4.032082714140415" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_ErcolessiAdams_1994_Al__MO_324507536345_003) = 4.032082714140415 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_IMD_BrommerGaehler_2006A_AlNiCo__MO_122703700223_003] +#=== BEGIN kim-query ========================================= +variable latconst string "4.128871455788613" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_IMD_BrommerGaehler_2006A_AlNiCo__MO_122703700223_003) = 4.128871455788613 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_IMD_BrommerGaehler_2006B_AlNiCo__MO_128037485276_003] +#=== BEGIN kim-query 
========================================= +variable latconst string "4.073718130588532" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_IMD_BrommerGaehler_2006B_AlNiCo__MO_128037485276_003) = 4.073718130588532 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_IMD_SchopfBrommerFrigan_2012_AlMnPd__MO_878712978062_003] +#=== BEGIN kim-query ========================================= +variable latconst string "4.210700303316115" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_IMD_SchopfBrommerFrigan_2012_AlMnPd__MO_878712978062_003) = 4.210700303316115 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_QuinticClampedSpline_ErcolessiAdams_1994_Al__MO_450093727396_002] +#=== BEGIN kim-query ========================================= +variable latconst string "4.032082897424699" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_QuinticClampedSpline_ErcolessiAdams_1994_Al__MO_450093727396_002) = 4.032082897424699 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_QuinticHermiteSpline_ErcolessiAdams_1994_Al__MO_781138671863_002] +#=== BEGIN kim-query ========================================= +variable latconst string "4.03208246231079" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EAM_QuinticHermiteSpline_ErcolessiAdams_1994_Al__MO_781138671863_002) = 4.03208246231079 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EMT_Asap_Standard_JacobsenStoltzeNorskov_1996_AlAgAuCuNiPdPt__MO_115316750986_001] +#=== BEGIN kim-query ========================================= +variable latconst string "3.994616635143757" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant (EMT_Asap_Standard_JacobsenStoltzeNorskov_1996_AlAgAuCuNiPdPt__MO_115316750986_001) = 3.994616635143757 +next model +jump SELF model_loop +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}] +kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EMT_Asap_Standard_JacobsenStoltzeNorskov_1996_Al__MO_623376124862_001] +#=== BEGIN kim-query ========================================= +variable latconst string "3.994608342647553" +#=== END kim-query =========================================== + +print "FCC lattice constant (${model}) = ${latconst}" +FCC lattice constant 
(EMT_Asap_Standard_JacobsenStoltzeNorskov_1996_Al__MO_623376124862_001) = 3.994608342647553 +next model +jump SELF model_loop +clear + + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Model originally published in \cite{MO_123629422045_005a} is archived in OpenKIM~\cite{MO_123629422045_005, MD_120291908751_005, tadmor:elliott:2011, elliott:tadmor:2011}. +\bibliographystyle{vancouver} +\bibliography{kimcite-MO_123629422045_005.bib} +\end{document} +} + +@Misc{MO_123629422045_005, + author = {Ryan S. Elliott}, + title = {{EAM} potential ({LAMMPS} cubic hermite tabulation) for {A}l developed by {E}rcolessi and {A}dams (1994) v005}, + doi = {10.25950/7cd2a6ab}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/7cd2a6ab}}, + keywords = {OpenKIM, Model, MO_123629422045_005}, + publisher = {OpenKIM}, + year = 2018, +} + +@Misc{MD_120291908751_005, + author = {Ryan S. Elliott}, + title = {{EAM} {M}odel {D}river for tabulated potentials with cubic {H}ermite spline interpolation as used in {LAMMPS} v005}, + doi = {10.25950/68defa36}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/7cd2a6ab}}, + keywords = {OpenKIM, Model Driver, MD_120291908751_005}, + publisher = {OpenKIM}, + year = 2018, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{MO_123629422045_005a, + author = {F. Ercolessi and J. B. Adams}, + doi = {10.1209/0295-5075/26/8/005}, + journal = {Europhysics Letters}, + number = {8}, + pages = {583}, + title = {Interatomic Potentials from First-Principles Calculations: {T}he Force-Matching Method}, + volume = {26}, + year = {1994}, +} +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Model originally published in \cite{MO_004835508849_000a} is archived in OpenKIM~\cite{MO_004835508849_000, MD_120291908751_005, tadmor:elliott:2011, elliott:tadmor:2011}. +\bibliographystyle{vancouver} +\bibliography{kimcite-MO_004835508849_000.bib} +\end{document} +} + +@Misc{MO_004835508849_000, + author = {Ellad Tadmor}, + title = {{F}innis-{S}inclair potential ({LAMMPS} cubic hermite tabulation) for {Z}r developed by {M}endelev and {A}ckland (2007); version 3 refitted for radiation studies v000}, + doi = {10.25950/7b7b5ab5}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/7b7b5ab5}}, + keywords = {OpenKIM, Model, MO_004835508849_000}, + publisher = {OpenKIM}, + year = 2018, +} + +@Misc{MD_120291908751_005, + author = {Ryan S. 
Elliott}, + title = {{EAM} {M}odel {D}river for tabulated potentials with cubic {H}ermite spline interpolation as used in {LAMMPS} v005}, + doi = {10.25950/68defa36}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/7b7b5ab5}}, + keywords = {OpenKIM, Model Driver, MD_120291908751_005}, + publisher = {OpenKIM}, + year = 2018, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{MO_004835508849_000a, + author = {Mendelev, M. I. and Ackland, G. J.}, + doi = {10.1080/09500830701191393}, + journal = {Philosophical Magazine Letters}, + number = {5}, + pages = {349-359}, + title = {Development of an interatomic potential for the simulation of phase transformations in zirconium}, + volume = {87}, + year = {2007}, +} +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Total wall time: 0:01:58 diff --git a/examples/kim/log.10Feb21.in.kim-sm.melt.clang.1 b/examples/kim/log.10Feb21.in.kim-sm.melt.clang.1 new file mode 100644 index 0000000000..bb00b7fec4 --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-sm.melt.clang.1 @@ -0,0 +1,208 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt +# +# This example requires that the KIM Simulator Model (PM) +# `Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000` +# is installed. This can be done with the command +# kim-api-collections-management install user Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 +# If this command does not work, you may need to setup your PATH to find the utility. +# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should relplace X.Y.Z with the appropriate kim-api version number). +# +# See `https://openkim.org/doc/obtaining-models` for alternative options. 
+# + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +kim init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 real +#=== BEGIN kim init ========================================== +# Using KIM Simulator Model : Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 +# For Simulator : LAMMPS 28 Feb 2019 +# Running on : LAMMPS 10 Feb 2021 +# +units real +neighbor 2.0 bin # Angstroms +timestep 1.0 # femtoseconds +atom_style charge +neigh_modify one 4000 +#=== END kim init ============================================ + + +lattice fcc 4.4300 +Lattice spacing in x,y,z = 4.4300000 4.4300000 4.4300000 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (88.600000 88.600000 88.600000) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.003 seconds + +kim interactions O +#=== BEGIN kim interactions ================================== +variable kim_periodic equal 1 +pair_style reax/c /var/tmp/kim-shared-library-parameter-file-directory-pgBW45WFK0TI/lmp_control safezone 2.0 mincap 100 +pair_coeff * * /var/tmp/kim-shared-library-parameter-file-directory-pgBW45WFK0TI/ffield.reax.rdx O +Reading potential file /var/tmp/kim-shared-library-parameter-file-directory-pgBW45WFK0TI/ffield.reax.rdx with DATE: 2010-02-19 +fix reaxqeq all qeq/reax 1 0.0 10.0 1.0e-6 /var/tmp/kim-shared-library-parameter-file-directory-pgBW45WFK0TI/param.qeq +#=== END kim interactions ==================================== + + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Simulator Model originally published in \cite{SM_107643900657_000a} is archived in OpenKIM~\cite{SM_107643900657_000, tadmor:elliott:2011, elliott:tadmor:2011}. +\bibliographystyle{vancouver} +\bibliography{kimcite-SM_107643900657_000.bib} +\end{document} +} + +@Misc{SM_107643900657_000, + author = {Ellad Tadmor}, + title = {{LAMMPS} {R}eax{FF} potential for {RDX} ({C}-{H}-{N}-{O}) systems developed by {S}trachan et al. (2003) v000}, + doi = {10.25950/acd3fc89}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/acd3fc89}}, + keywords = {OpenKIM, Simulator Model, SM_107643900657_000}, + publisher = {OpenKIM}, + year = 2019, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. 
Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{SM_107643900657_000a, + author = {Strachan, Alejandro and van Duin, Adri C. T. and Chakraborty, Debashis and Dasgupta, Siddharth and Goddard, William A.}, + doi = {10.1103/PhysRevLett.91.098301}, + issue = {9}, + journal = {Physical Review Letters}, + month = {Aug}, + numpages = {4}, + pages = {098301}, + publisher = {American Physical Society}, + title = {Shock Waves in High-Energy Materials: {T}he Initial Chemical Events in Nitramine {RDX}}, + volume = {91}, + year = {2003}, +} +- pair reax/c command: + +@Article{Aktulga12, + author = {H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama}, + title = {Parallel reactive molecular dynamics: Numerical methods and algorithmic techniques}, + journal = {Parallel Computing}, + year = 2012, + volume = 38, + pages = {245--259} +} + +- fix qeq/reax command: + +@Article{Aktulga12, + author = {H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama}, + title = {Parallel reactive molecular dynamics: Numerical methods and algorithmic techniques}, + journal = {Parallel Computing}, + year = 2012, + volume = 38, + pages = {245--259} +} + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Neighbor list info ... + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 4000, page size: 100000 + master list distance cutoff = 10.3 + ghost atom cutoff = 10.3 + binsize = 5.15, bins = 18 18 18 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair reax/c, perpetual + attributes: half, newton off, ghost + pair build: half/bin/newtoff/ghost + stencil: half/ghost/bin/3d/newtoff + bin: standard + (2) fix qeq/reax, perpetual, copy from (1) + attributes: half, newton off, ghost + pair build: copy + stencil: none + bin: none +Per MPI rank memory allocation (min/avg/max) = 1803.0 | 1803.0 | 1803.0 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 -39091.147 0 -20014.559 19501.107 + 100 63.198252 -26042.062 0 -20014.027 21497.661 +Loop time of 40.2545 on 1 procs for 100 steps with 32000 atoms + +Performance: 0.215 ns/day, 111.818 hours/ns, 2.484 timesteps/s +99.1% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 24.364 | 24.364 | 24.364 | 0.0 | 60.52 +Neigh | 0.4185 | 0.4185 | 0.4185 | 0.0 | 1.04 +Comm | 0.022045 | 0.022045 | 0.022045 | 0.0 | 0.05 +Output | 6.6e-05 | 6.6e-05 | 6.6e-05 | 0.0 | 0.00 +Modify | 15.438 | 15.438 | 15.438 | 0.0 | 38.35 +Other | | 0.01285 | | | 0.03 + +Nlocal: 32000.0 ave 32000 max 32000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 26825.0 ave 26825 max 26825 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 3.73924e+06 ave 3.73924e+06 max 3.73924e+06 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 3739236 +Ave neighs/atom = 116.85112 +Neighbor list builds = 3 +Dangerous builds = 0 +Total wall time: 0:00:41 diff --git a/examples/kim/log.10Feb21.in.kim-sm.melt.clang.4 b/examples/kim/log.10Feb21.in.kim-sm.melt.clang.4 new file mode 100644 index 0000000000..90c8adc6b0 --- /dev/null +++ b/examples/kim/log.10Feb21.in.kim-sm.melt.clang.4 @@ -0,0 +1,208 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt +# +# This example requires that the KIM Simulator Model (PM) +# 
`Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000` +# is installed. This can be done with the command +# kim-api-collections-management install user Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 +# If this command does not work, you may need to setup your PATH to find the utility. +# If you installed the kim-api using the LAMMPS CMake build, you can do the following +# (where the current working directory is assumed to be the LAMMPS build directory) +# source ./kim_build-prefix/bin/kim-api-activate +# If you installed the kim-api using the LAMMPS Make build, you can do the following +# (where the current working directory is assumed to be the LAMMPS src directory) +# source ../lib/kim/installed-kim-api-X.Y.Z/bin/kim-api-activate +# (where you should relplace X.Y.Z with the appropriate kim-api version number). +# +# See `https://openkim.org/doc/obtaining-models` for alternative options. +# + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +kim init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 real +#=== BEGIN kim init ========================================== +# Using KIM Simulator Model : Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 +# For Simulator : LAMMPS 28 Feb 2019 +# Running on : LAMMPS 10 Feb 2021 +# +units real +neighbor 2.0 bin # Angstroms +timestep 1.0 # femtoseconds +atom_style charge +neigh_modify one 4000 +#=== END kim init ============================================ + + +lattice fcc 4.4300 +Lattice spacing in x,y,z = 4.4300000 4.4300000 4.4300000 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (88.600000 88.600000 88.600000) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.001 seconds + +kim interactions O +#=== BEGIN kim interactions ================================== +variable kim_periodic equal 1 +pair_style reax/c /var/tmp/kim-shared-library-parameter-file-directory-zYQfH0ms5WSw/lmp_control safezone 2.0 mincap 100 +pair_coeff * * /var/tmp/kim-shared-library-parameter-file-directory-zYQfH0ms5WSw/ffield.reax.rdx O +Reading potential file /var/tmp/kim-shared-library-parameter-file-directory-zYQfH0ms5WSw/ffield.reax.rdx with DATE: 2010-02-19 +fix reaxqeq all qeq/reax 1 0.0 10.0 1.0e-6 /var/tmp/kim-shared-library-parameter-file-directory-zYQfH0ms5WSw/param.qeq +#=== END kim interactions ==================================== + + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Your simulation uses code contributions which should be cited: + +- @Comment +{ +\documentclass{article} +\usepackage{url} +\begin{document} +This Simulator Model originally published in \cite{SM_107643900657_000a} is archived in OpenKIM~\cite{SM_107643900657_000, tadmor:elliott:2011, elliott:tadmor:2011}. 
+\bibliographystyle{vancouver} +\bibliography{kimcite-SM_107643900657_000.bib} +\end{document} +} + +@Misc{SM_107643900657_000, + author = {Ellad Tadmor}, + title = {{LAMMPS} {R}eax{FF} potential for {RDX} ({C}-{H}-{N}-{O}) systems developed by {S}trachan et al. (2003) v000}, + doi = {10.25950/acd3fc89}, + howpublished = {OpenKIM, \url{https://doi.org/10.25950/acd3fc89}}, + keywords = {OpenKIM, Simulator Model, SM_107643900657_000}, + publisher = {OpenKIM}, + year = 2019, +} + +@Article{tadmor:elliott:2011, + author = {E. B. Tadmor and R. S. Elliott and J. P. Sethna and R. E. Miller and C. A. Becker}, + title = {The potential of atomistic simulations and the {K}nowledgebase of {I}nteratomic {M}odels}, + journal = {{JOM}}, + year = {2011}, + volume = {63}, + number = {7}, + pages = {17}, + doi = {10.1007/s11837-011-0102-6}, +} + +@Misc{elliott:tadmor:2011, + author = {Ryan S. Elliott and Ellad B. Tadmor}, + title = {{K}nowledgebase of {I}nteratomic {M}odels ({KIM}) Application Programming Interface ({API})}, + howpublished = {\url{https://openkim.org/kim-api}}, + publisher = {OpenKIM}, + year = 2011, + doi = {10.25950/ff8f563a}, +} + +@Article{SM_107643900657_000a, + author = {Strachan, Alejandro and van Duin, Adri C. T. and Chakraborty, Debashis and Dasgupta, Siddharth and Goddard, William A.}, + doi = {10.1103/PhysRevLett.91.098301}, + issue = {9}, + journal = {Physical Review Letters}, + month = {Aug}, + numpages = {4}, + pages = {098301}, + publisher = {American Physical Society}, + title = {Shock Waves in High-Energy Materials: {T}he Initial Chemical Events in Nitramine {RDX}}, + volume = {91}, + year = {2003}, +} +- pair reax/c command: + +@Article{Aktulga12, + author = {H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama}, + title = {Parallel reactive molecular dynamics: Numerical methods and algorithmic techniques}, + journal = {Parallel Computing}, + year = 2012, + volume = 38, + pages = {245--259} +} + +- fix qeq/reax command: + +@Article{Aktulga12, + author = {H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama}, + title = {Parallel reactive molecular dynamics: Numerical methods and algorithmic techniques}, + journal = {Parallel Computing}, + year = 2012, + volume = 38, + pages = {245--259} +} + +CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE + +Neighbor list info ... 
+ update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 4000, page size: 100000 + master list distance cutoff = 10.3 + ghost atom cutoff = 10.3 + binsize = 5.15, bins = 18 18 18 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair reax/c, perpetual + attributes: half, newton off, ghost + pair build: half/bin/newtoff/ghost + stencil: half/ghost/bin/3d/newtoff + bin: standard + (2) fix qeq/reax, perpetual, copy from (1) + attributes: half, newton off, ghost + pair build: copy + stencil: none + bin: none +Per MPI rank memory allocation (min/avg/max) = 630.2 | 630.2 | 630.2 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 -39091.147 0 -20014.559 19501.107 + 100 63.198252 -26042.062 0 -20014.027 21497.661 +Loop time of 15.049 on 4 procs for 100 steps with 32000 atoms + +Performance: 0.574 ns/day, 41.803 hours/ns, 6.645 timesteps/s +99.0% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 9.8158 | 9.8159 | 9.8161 | 0.0 | 65.23 +Neigh | 0.17685 | 0.17759 | 0.17832 | 0.1 | 1.18 +Comm | 0.028692 | 0.028847 | 0.028942 | 0.1 | 0.19 +Output | 2.5e-05 | 3.575e-05 | 4.6e-05 | 0.0 | 0.00 +Modify | 5.0171 | 5.0179 | 5.0186 | 0.0 | 33.34 +Other | | 0.008715 | | | 0.06 + +Nlocal: 8000.00 ave 8010 max 7993 min +Histogram: 2 0 0 0 0 1 0 0 0 1 +Nghost: 12605.0 ave 12612 max 12595 min +Histogram: 1 0 0 0 1 0 0 0 0 2 +Neighs: 1.00097e+06 ave 1.00187e+06 max 1.0006e+06 min +Histogram: 2 1 0 0 0 0 0 0 0 1 + +Total # of neighbors = 4003876 +Ave neighs/atom = 125.12113 +Neighbor list builds = 3 +Dangerous builds = 0 +Total wall time: 0:00:15 diff --git a/examples/kim/log.10Feb21.in.lammps.melt.clang.1 b/examples/kim/log.10Feb21.in.lammps.melt.clang.1 new file mode 100644 index 0000000000..eb2922f413 --- /dev/null +++ b/examples/kim/log.10Feb21.in.lammps.melt.clang.1 @@ -0,0 +1,88 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +units real + +lattice fcc 4.4300 +Lattice spacing in x,y,z = 4.4300000 4.4300000 4.4300000 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (88.600000 88.600000 88.600000) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.002 seconds + +pair_style lj/cut 8.1500 +pair_coeff 1 1 0.0104 3.4000 + +#pair_style kim LennardJones_Ar +#pair_coeff * * Ar + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 +Neighbor list info ... 
+ update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 8.45 + ghost atom cutoff = 8.45 + binsize = 4.225, bins = 21 21 21 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d/newton + bin: standard +Per MPI rank memory allocation (min/avg/max) = 19.23 | 19.23 | 19.23 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 6290.8194 0 25367.408 6750.7421 + 100 98.747096 15900.676 0 25319.465 10184.453 +Loop time of 1.92822 on 1 procs for 100 steps with 32000 atoms + +Performance: 4.481 ns/day, 5.356 hours/ns, 51.861 timesteps/s +99.8% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 1.7377 | 1.7377 | 1.7377 | 0.0 | 90.12 +Neigh | 0.14234 | 0.14234 | 0.14234 | 0.0 | 7.38 +Comm | 0.011694 | 0.011694 | 0.011694 | 0.0 | 0.61 +Output | 6.7e-05 | 6.7e-05 | 6.7e-05 | 0.0 | 0.00 +Modify | 0.02476 | 0.02476 | 0.02476 | 0.0 | 1.28 +Other | | 0.01163 | | | 0.60 + +Nlocal: 32000.0 ave 32000 max 32000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 19911.0 ave 19911 max 19911 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 1.96027e+06 ave 1.96027e+06 max 1.96027e+06 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 1960266 +Ave neighs/atom = 61.258313 +Neighbor list builds = 3 +Dangerous builds = 0 +Total wall time: 0:00:01 diff --git a/examples/kim/log.10Feb21.in.lammps.melt.clang.4 b/examples/kim/log.10Feb21.in.lammps.melt.clang.4 new file mode 100644 index 0000000000..b8751c4e41 --- /dev/null +++ b/examples/kim/log.10Feb21.in.lammps.melt.clang.4 @@ -0,0 +1,88 @@ +LAMMPS (10 Feb 2021) +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +units real + +lattice fcc 4.4300 +Lattice spacing in x,y,z = 4.4300000 4.4300000 4.4300000 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (88.600000 88.600000 88.600000) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box +Created 32000 atoms + create_atoms CPU = 0.001 seconds + +pair_style lj/cut 8.1500 +pair_coeff 1 1 0.0104 3.4000 + +#pair_style kim LennardJones_Ar +#pair_coeff * * Ar + +mass 1 39.95 +velocity all create 200.0 232345 loop geom + +neighbor 0.3 bin +neigh_modify delay 0 every 1 check yes + +fix 1 all nve +#fix 1 all npt temp 1.0 1.0 1.0 iso 1.0 1.0 3.0 + +run 100 +Neighbor list info ... 
+ update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 8.45 + ghost atom cutoff = 8.45 + binsize = 4.225, bins = 21 21 21 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d/newton + bin: standard +Per MPI rank memory allocation (min/avg/max) = 7.633 | 7.633 | 7.633 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 200 6290.8194 0 25367.408 6750.7421 + 100 98.747096 15900.676 0 25319.465 10184.453 +Loop time of 0.561006 on 4 procs for 100 steps with 32000 atoms + +Performance: 15.401 ns/day, 1.558 hours/ns, 178.251 timesteps/s +99.6% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.48486 | 0.48676 | 0.48817 | 0.2 | 86.77 +Neigh | 0.040698 | 0.04091 | 0.041066 | 0.1 | 7.29 +Comm | 0.016616 | 0.01811 | 0.0202 | 1.1 | 3.23 +Output | 3e-05 | 3.575e-05 | 4.7e-05 | 0.0 | 0.01 +Modify | 0.008934 | 0.009025 | 0.009142 | 0.1 | 1.61 +Other | | 0.006161 | | | 1.10 + +Nlocal: 8000.00 ave 8012 max 7989 min +Histogram: 1 0 0 0 2 0 0 0 0 1 +Nghost: 9131.00 ave 9142 max 9119 min +Histogram: 1 0 0 0 0 2 0 0 0 1 +Neighs: 490066.0 ave 491443 max 489273 min +Histogram: 2 0 0 0 1 0 0 0 0 1 + +Total # of neighbors = 1960266 +Ave neighs/atom = 61.258313 +Neighbor list builds = 3 +Dangerous builds = 0 +Total wall time: 0:00:00 From a73f6f58ad75b9d68b1c13863ace10aef7d5bc71 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Mon, 15 Feb 2021 13:12:40 -0600 Subject: [PATCH 079/116] Extra check to prevent illegal neighbor request esp, in unit conversion mode --- src/KIM/pair_kim.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/KIM/pair_kim.cpp b/src/KIM/pair_kim.cpp index 2f1fb9da3e..5fa93b09d0 100644 --- a/src/KIM/pair_kim.cpp +++ b/src/KIM/pair_kim.cpp @@ -601,6 +601,8 @@ void PairKIM::init_style() // set cutoff neighbor->requests[irequest]->cut = 1; + if (kim_cutoff_values[i] <= neighbor->skin) + error->all(FLERR,"Illegal neighbor request (force cutoff <= skin)"); neighbor->requests[irequest]->cutoff = kim_cutoff_values[i] + neighbor->skin; } From 21a60235eb4c604a1645562b588b956b5ada543f Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Tue, 16 Feb 2021 13:57:55 -0600 Subject: [PATCH 080/116] Fix a wrong indexing for optional explicit argument --- src/KIM/kim_param.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KIM/kim_param.cpp b/src/KIM/kim_param.cpp index 1ebbed62f6..161e8c9fc2 100644 --- a/src/KIM/kim_param.cpp +++ b/src/KIM/kim_param.cpp @@ -367,7 +367,7 @@ void KimParam::command(int narg, char **arg) varsname.resize(1); varsname[0] = varname; // Default explicit (optional) formatarg - } else if (i - 1 + nvars < narg) { + } else if (i - 1 + nvars - 1 < narg) { varsname.resize(nvars); --i; for (int j = 0; j < nvars; ++j, ++i) varsname[j] = arg[i]; From c139adf95be4866be30a571d99b61082ee3ecc5a Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Tue, 16 Feb 2021 15:12:56 -0600 Subject: [PATCH 081/116] Fix the index for get argument and correct the string variable format --- src/KIM/kim_param.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/KIM/kim_param.cpp b/src/KIM/kim_param.cpp index 161e8c9fc2..04e2bdceca 100644 --- a/src/KIM/kim_param.cpp +++ b/src/KIM/kim_param.cpp @@ -346,6 +346,8 
@@ void KimParam::command(int narg, char **arg) if (i < narg) { // Get the variable/variable_base name varname = arg[i++]; + if (varname == "split" || varname == "list" || varname == "explicit") + error->all(FLERR, "Illegal variable name in 'kim param get'"); } else { std::string msg("Wrong number of arguments in 'kim param get' "); msg += "command.\nThe LAMMPS variable name is mandatory"; @@ -362,15 +364,22 @@ void KimParam::command(int narg, char **arg) for (int j = 0, k = nlbound; j < nvars; ++j, ++k) { varsname[j] = fmt::format("{}_{}", varname, k); } + ++i; } else if (strcmp(arg[i], "list") == 0) { list_requested = true; varsname.resize(1); varsname[0] = varname; + ++i; // Default explicit (optional) formatarg } else if (i - 1 + nvars - 1 < narg) { varsname.resize(nvars); --i; - for (int j = 0; j < nvars; ++j, ++i) varsname[j] = arg[i]; + for (int j = 0; j < nvars; ++j, ++i) { + varsname[j] = arg[i]; + if (varsname[j] == "split" || varsname[j] == "list" || + varsname[j] == "explicit") + error->all(FLERR, "Illegal variable name in 'kim param get'"); + } if (i < narg) { if (strcmp(arg[i], "explicit") == 0) ++i; } @@ -396,8 +405,7 @@ void KimParam::command(int narg, char **arg) ++i; } else { if ((strcmp(arg[i], "list") == 0) || - (strcmp(arg[i], "explicit") == 0)) - ++i; + (strcmp(arg[i], "explicit") == 0)) ++i; varsname[0] = varname; } @@ -427,7 +435,7 @@ void KimParam::command(int narg, char **arg) str += fmt::format(" {}", V); } - auto setcmd = fmt::format("{} string {}", varsname[0], str); + auto setcmd = fmt::format("{} string \"{}\"", varsname[0], str); input->variable->set(setcmd); input->write_echo(fmt::format("variable {}\n", setcmd)); @@ -465,7 +473,7 @@ void KimParam::command(int narg, char **arg) str += fmt::format(" {}", V); } - auto setcmd = fmt::format("{} string {}", varsname[0], str); + auto setcmd = fmt::format("{} string \"{}\"", varsname[0], str); input->variable->set(setcmd); input->write_echo(fmt::format("variable {}\n", setcmd)); From 29926c4f71ae1c72e066c71809299b262c74fe7b Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Tue, 16 Feb 2021 16:44:12 -0600 Subject: [PATCH 082/116] update kim command unittests with extra test cases for kim param command --- unittest/commands/test_kim_commands.cpp | 65 +++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/unittest/commands/test_kim_commands.cpp b/unittest/commands/test_kim_commands.cpp index 9d02cdb74c..3934e5de6f 100644 --- a/unittest/commands/test_kim_commands.cpp +++ b/unittest/commands/test_kim_commands.cpp @@ -347,6 +347,71 @@ TEST_F(KimCommandsTest, kim_param) if (!verbose) ::testing::internal::GetCapturedStdout(); ASSERT_TRUE(std::string(lmp->input->variable->retrieve("shift")) == "2"); + + TEST_FAILURE(".*ERROR: Illegal variable name in 'kim param get'.*", + lmp->input->one("kim param get cutoffs 1:3 list");); + TEST_FAILURE(".*ERROR: Illegal variable name in 'kim param get'.*", + lmp->input->one("kim param get cutoffs 1:3 cutoffs_1 cutoffs_2 list");); + TEST_FAILURE(".*ERROR: Illegal variable name in 'kim param get'.*", + lmp->input->one("kim param get cutoffs 1:3 split");); + TEST_FAILURE(".*ERROR: Illegal variable name in 'kim param get'.*", + lmp->input->one("kim param get cutoffs 1:3 cutoffs_1 cutoffs_2 split");); + TEST_FAILURE(".*ERROR: Illegal variable name in 'kim param get'.*", + lmp->input->one("kim param get cutoffs 1:3 explicit");); + TEST_FAILURE(".*ERROR: Illegal variable name in 'kim param get'.*", + lmp->input->one("kim param get cutoffs 1:3 cutoffs_1 cutoffs_2 explicit");); 
+ TEST_FAILURE(".*ERROR: Wrong number of arguments in 'kim param get' " + "command.\nThe LAMMPS '3' variable names or 'cutoffs " + "split/list' is mandatory.*", + lmp->input->one("kim param get cutoffs 1:3 cutoffs");); + TEST_FAILURE(".*ERROR: Wrong number of arguments in 'kim param get' " + "command.\nThe LAMMPS '3' variable names or 'cutoffs_1 " + "split' is mandatory.*", + lmp->input->one("kim param get cutoffs 1:3 cutoffs_1 cutoffs_2");); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("kim param get cutoffs 1:3 cutoffs_1 cutoffs_2 cutoffs_3"); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs_1")) == "2.20943"); + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs_2")) == "2.10252"); + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs_3")) == "5.666115"); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("kim param get cutoffs 1:3 cutoffs_1 cutoffs_2 cutoffs_3 explicit"); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs_1")) == "2.20943"); + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs_2")) == "2.10252"); + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs_3")) == "5.666115"); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("kim param get cutoffs 1:3 cutoffs split"); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs_1")) == "2.20943"); + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs_2")) == "2.10252"); + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs_3")) == "5.666115"); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("kim param get cutoffs 1:3 cutoffs list"); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs")) == "2.20943 2.10252 5.666115"); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("kim param set cutoffs 1 2.21 cutoffs 2 2.11"); + lmp->input->one("kim param get cutoffs 1:2 cutoffs list"); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs")) == "2.21 2.11"); + + if (!verbose) ::testing::internal::CaptureStdout(); + lmp->input->one("kim param set cutoffs 1:3 2.3 2.2 5.7"); + lmp->input->one("kim param get cutoffs 1:3 cutoffs list"); + if (!verbose) ::testing::internal::GetCapturedStdout(); + + ASSERT_TRUE(std::string(lmp->input->variable->retrieve("cutoffs")) == "2.3 2.2 5.7"); } TEST_F(KimCommandsTest, kim_property) From 3b9cbe4361fabd25dccf5fc43b294af80912ac01 Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Tue, 16 Feb 2021 16:44:58 -0600 Subject: [PATCH 083/116] Update the kim command doc Update the document with the latest interface changes. Replace the discontinued models in the examples with the correct models. Test all provided examples and fix the mistakes in them. --- doc/src/kim_commands.rst | 1299 ++++++++++++++++++++------------------ 1 file changed, 694 insertions(+), 605 deletions(-) diff --git a/doc/src/kim_commands.rst b/doc/src/kim_commands.rst index e9afa48fd5..47b8d3e790 100644 --- a/doc/src/kim_commands.rst +++ b/doc/src/kim_commands.rst @@ -1,106 +1,136 @@ -.. index:: kim_init, kim_interactions, kim_query, kim_param, kim_property +.. 
index:: kim_commands -:ref:`kim_init` command -========================================= - -:ref:`kim_interactions` command -========================================================= - -:ref:`kim_query` command -=========================================== - -:ref:`kim_param` command -=========================================== - -:ref:`kim_property` command -================================================= +kim command +=========== Syntax """""" .. code-block:: LAMMPS - kim_init model user_units unitarg - kim_interactions typeargs - kim_query variable formatarg query_function queryargs - kim_param get param_name index_range variables formatarg - kim_param set param_name index_range values - kim_property create instance_id property_id - kim_property modify instance_id key key_name key_name_key key_name_value - kim_property remove instance_id key key_name - kim_property destroy instance_id - kim_property dump file + kim sub-command -.. _formatarg_options: - -* model = name of the KIM interatomic model (the KIM ID for models archived in OpenKIM) -* user_units = the LAMMPS :doc:`units ` style assumed in the LAMMPS input script -* unitarg = *unit_conversion_mode* (optional) -* typeargs = atom type to species mapping (one entry per atom type) or *fixed_types* for models with a preset fixed mapping -* variable(s) = single name or list of names of (string style) LAMMPS variable(s) where a query result or parameter get result is stored. Variables that do not exist will be created by the command. -* formatarg = *list, split, or explicit* (optional): - - .. parsed-literal:: - - *list* = returns a single string with a list of space separated values - (e.g. "1.0 2.0 3.0"), which is placed in a LAMMPS variable as - defined by the *variable* argument. [default for *kim_query*] - *split* = returns the values separately in new variables with names based - on the prefix specified in *variable* and a number appended to - indicate which element in the list of values is in the variable. - *explicit* = returns the values separately in one more more variable names - provided as arguments that precede *formatarg*\ . [default for *kim_param*] - -* query_function = name of the OpenKIM web API query function to be used -* queryargs = a series of *keyword=value* pairs that represent the web query; supported keywords depend on the query function -* param_name = name of a KIM portable model parameter -* index_range = KIM portable model parameter index range (an integer for a single element, or pair of integers separated by a colon for a range of elements) -* values = new value(s) to replace the current value(s) of a KIM portable model parameter -* instance_id = a positive integer identifying the KIM property instance -* property_id = identifier of a `KIM Property Definition `_, which can be (1) a property short name, (2) the full unique ID of the property (including the contributor and date), (3) a file name corresponding to a local property definition file -* key_name = one of the keys belonging to the specified KIM property definition -* key_name_key = a key belonging to a key-value pair (standardized in the `KIM Properties Framework `__) -* key_name_value = value to be associated with a key_name_key in a key-value pair -* file = name of a file to write the currently defined set of KIM property instances to +* sub-command = :ref:`init ` or :ref:`interactions ` or + :ref:`query ` or :ref:`param ` or :ref:`property ` Examples """""""" .. 
code-block:: LAMMPS

-    kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 metal
-    kim_interactions Si
-    kim_init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 real
-    kim_init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 metal unit_conversion_mode
-    kim_interactions C H O
-    kim_init Sim_LAMMPS_IFF_PCFF_HeinzMishraLinEmami_2015Ver1v5_FccmetalsMineralsSolventsPolymers__SM_039297821658_000 real
-    kim_interactions fixed_types
-    kim_query a0 get_lattice_constant_cubic crystal=["fcc"] species=["Al"] units=["angstrom"]
-    kim_param get gamma 1 varGamma
-    kim_param set gamma 1 3.0
-    kim_property create 1 atomic-mass
-    kim_property modify 1 key mass source-value 26.98154
-    kim_property modify 1 key species source-value Al
-    kim_property remove 1 key species
-    kim_property destroy 1
-    kim_property dump results.edn
-
+   kim init ...
+   kim interactions ...
+   kim query ...
+   kim param ...
+   kim property ...

 .. _kim_description:

 Description
 """""""""""

-The set of *kim_commands* provide a high-level wrapper around the
+*kim command* provides a set of high-level wrappers around the
 `Open Knowledgebase of Interatomic Models (OpenKIM) `_
 repository of interatomic models (IMs) (potentials and force fields),
-so that they can be used by LAMMPS scripts. These commands do not implement
-any computations directly, but rather generate LAMMPS input commands based
-on the information retrieved from the OpenKIM repository to initialize and
-activate OpenKIM IMs and query their predictions for use in the LAMMPS script.
-All LAMMPS input commands generated and executed by *kim_commands* are
+so that they can be used by LAMMPS scripts. This command is followed by a
+set of sub-commands. The *kim* command does not implement any computations
+directly, but rather generates LAMMPS input commands based on the information
+retrieved from the OpenKIM repository to initialize and activate OpenKIM IMs
+and query their predictions for use in the LAMMPS script.
+All LAMMPS input commands generated and executed by *kim command* are
 echoed to the LAMMPS log file.

+Full syntax
+"""""""""""
+
+.. code-block:: LAMMPS
+
+   kim init model user_units unitarg
+   kim interactions typeargs
+   kim query variable formatarg query_function queryargs
+   kim param get param_name index_range variables formatarg
+   kim param set param_name index_range values
+   kim property create instance_id property_id
+   kim property modify instance_id key key_name key_name_key key_name_value
+   kim property remove instance_id key key_name
+   kim property destroy instance_id
+   kim property dump file
+
+.. _formatarg_options:
+
+* model = name of the KIM interatomic model (the KIM ID for models archived in
+  OpenKIM)
+* user_units = the LAMMPS :doc:`units ` style assumed in the LAMMPS
+  input script
+* unitarg = *unit_conversion_mode* (optional)
+* typeargs = atom type to species mapping (one entry per atom type) or
+  *fixed_types* for models with a preset fixed mapping
+* variable(s) = single name or list of names of (string style) LAMMPS
+  variable(s) where a query result or parameter get result is stored. Variables
+  that do not exist will be created by the command
+* formatarg = *list, split, index, or explicit* (optional):
+
+  .. parsed-literal::
+
+     *list* = returns a single string with a list of space separated values
+            (e.g. "1.0 2.0 3.0"), which is placed in a LAMMPS variable as
+            defined by the *variable* argument. 
[default for *query*
+            sub-command]
+     *split* = returns the values separately in new variables with names based
+             on the prefix specified in *variable* and a number appended to
+             indicate which element in the list of values is in the variable
+     *index* = returns a variable style index that can be incremented via the
+             next command. This enables the construction of simple loops
+     *explicit* = returns the values separately in one or more variable names
+                provided as arguments that precede *formatarg*\ . [default for
+                *kim param*]
+
+* query_function = name of the OpenKIM web API query function to be used
+* queryargs = a series of *keyword=value* pairs that represent the web query;
+  supported keywords depend on the query function
+* param_name = name of a KIM portable model parameter
+* index_range = KIM portable model parameter index range (an integer for a
+  single element, or pair of integers separated by a colon for a range of
+  elements)
+* values = new value(s) to replace the current value(s) of a KIM portable model
+  parameter
+* instance_id = a positive integer identifying the KIM property instance
+* property_id = identifier of a
+  `KIM Property Definition `_, which can be (1)
+  a property short name, (2) the full unique ID of the property (including the
+  contributor and date), (3) a file name corresponding to a local property
+  definition file
+* key_name = one of the keys belonging to the specified KIM property definition
+* key_name_key = a key belonging to a key-value pair (standardized in the
+  `KIM Properties Framework `__)
+* key_name_value = value to be associated with a key_name_key in a key-value
+  pair
+* file = name of a file to write the currently defined set of KIM property
+  instances to
+
+Full syntax examples
+""""""""""""""""""""
+
+.. code-block:: LAMMPS
+
+   kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 metal
+   kim interactions Si
+   kim init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_001 real
+   kim init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_001 metal unit_conversion_mode
+   kim interactions C H O
+   kim init Sim_LAMMPS_IFF_PCFF_HeinzMishraLinEmami_2015Ver1v5_FccmetalsMineralsSolventsPolymers__SM_039297821658_000 real
+   kim interactions fixed_types
+   kim query a0 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom]
+   kim query model index get_available_models species=[Al] potential_type=[eam]
+   kim param get gamma 1 varGamma
+   kim param set gamma 1 3.0
+   kim property create 1 atomic-mass
+   kim property modify 1 key mass source-value 26.98154
+   kim property modify 1 key species source-value Al
+   kim property remove 1 key species
+   kim property destroy 1
+   kim property dump results.edn
+
 Benefits of Using OpenKIM IMs
 -----------------------------

@@ -109,22 +139,49 @@ Employing OpenKIM IMs provides LAMMPS users with multiple benefits:
 Reliability
 ^^^^^^^^^^^

-* All content archived in OpenKIM is reviewed by the `KIM Editor `_ for quality.
-* IMs in OpenKIM are archived with full provenance control. Each is associated with a maintainer responsible for the integrity of the content. All changes are tracked and recorded.
-* IMs in OpenKIM are exhaustively tested using `KIM Tests `_ that compute a host of material properties, and `KIM Verification Checks `_ that provide the user with information on various aspects of the IM behavior and coding correctness. This information is displayed on the IM's page accessible through the `OpenKIM browse interface `_. 
+* All content archived in OpenKIM is reviewed by the + `KIM Editor `_ for quality. +* IMs in OpenKIM are archived with full provenance control. Each is associated + with a maintainer responsible for the integrity of the content. All changes + are tracked and recorded. +* IMs in OpenKIM are exhaustively tested using + `KIM Tests `_ that compute a + host of material properties, and + `KIM Verification Checks `_ + that provide the user with information on various aspects of the IM behavior + and coding correctness. This information is displayed on the IM's page + accessible through the + `OpenKIM browse interface `_. Reproducibility ^^^^^^^^^^^^^^^ -* Each IM in OpenKIM is issued a unique identifier (`KIM ID `_), which includes a version number (last three digits). Any changes that can result in different numerical values lead to a version increment in the KIM ID. This makes it possible to reproduce simulations since the specific version of a specific IM used can be retrieved using its KIM ID. -* OpenKIM is a member organization of `DataCite `_ and issues digital object identifiers (DOIs) to all IMs archived in OpenKIM. This makes it possible to cite the IM code used in a simulation in a publications to give credit to the developers and further facilitate reproducibility. +* Each IM in OpenKIM is issued a unique identifier + (`KIM ID `_), which includes a + version number (last three digits). Any changes that can result in different + numerical values lead to a version increment in the KIM ID. This makes it + possible to reproduce simulations since the specific version of a specific IM + used can be retrieved using its KIM ID. +* OpenKIM is a member organization of `DataCite `_ and + issues digital object identifiers (DOIs) to all IMs archived in OpenKIM. This + makes it possible to cite the IM code used in a simulation in a publications + to give credit to the developers and further facilitate reproducibility. Convenience ^^^^^^^^^^^ -* IMs in OpenKIM are distributed in binary form along with LAMMPS and can be used in a LAMMPS input script simply by providing their KIM ID in the *kim_init* command documented on this page. -* The *kim_query* web query tool provides the ability to use the predictions of IMs for supported material properties (computed via `KIM Tests `_) as part of a LAMMPS input script setup and analysis. -* Support is provided for unit conversion between the :doc:`unit style ` used in the LAMMPS input script and the units required by the OpenKIM IM. This makes it possible to use a single input script with IMs using different units without change and minimizes the likelihood of errors due to incompatible units. +* IMs in OpenKIM are distributed in binary form along with LAMMPS and can be + used in a LAMMPS input script simply by providing their KIM ID in the + *kim init* command documented on this page. +* The *kim_query* web query tool provides the ability to use the predictions of + IMs for supported material properties (computed via + `KIM Tests `_) as part of a + LAMMPS input script setup and analysis. +* Support is provided for unit conversion between the :doc:`unit style ` + used in the LAMMPS input script and the units required by the OpenKIM IM. + This makes it possible to use a single input script with IMs using different + units without change and minimizes the likelihood of errors due to + incompatible units. .. _IM_types: @@ -135,12 +192,23 @@ There are two types of IMs archived in OpenKIM: .. _PM_type: -1. The first type is called a *KIM Portable Model* (PM). 
A KIM PM is an independent computer implementation of an IM written in one of the languages supported by KIM (C, C++, Fortran) that conforms to the KIM Application Programming Interface (`KIM API `_) Portable Model Interface (PMI) standard. A KIM PM will work seamlessly with any simulation code that supports the KIM API/PMI standard (including LAMMPS; see `complete list of supported codes `_). -2. The second type is called a *KIM Simulator Model* (SM). A KIM SM is an IM that is implemented natively within a simulation code (\ *simulator*\ ) that supports the KIM API Simulator Model Interface (SMI); in this case LAMMPS. A separate SM package is archived in OpenKIM for each parameterization of the IM, which includes all of the necessary parameter files, LAMMPS commands, and metadata (supported species, units, etc.) needed to run the IM in LAMMPS. +1. The first type is called a *KIM Portable Model* (PM). A KIM PM is an + independent computer implementation of an IM written in one of the languages + supported by KIM (C, C++, Fortran) that conforms to the KIM Application + Programming Interface (`KIM API `_) Portable + Model Interface (PMI) standard. A KIM PM will work seamlessly with any + simulation code that supports the KIM API/PMI standard (including LAMMPS; see + `complete list of supported codes `_). +2. The second type is called a *KIM Simulator Model* (SM). A KIM SM is an IM + that is implemented natively within a simulation code (\ *simulator*\ ) that + supports the KIM API Simulator Model Interface (SMI); in this case LAMMPS. A + separate SM package is archived in OpenKIM for each parameterization of the + IM, which includes all of the necessary parameter files, LAMMPS commands, and + metadata (supported species, units, etc.) needed to run the IM in LAMMPS. -With these two IM types, OpenKIM can archive and test almost all IMs that -can be used by LAMMPS. (It is easy to contribute new IMs to OpenKIM, see -the `upload instructions `_.) +With these two IM types, OpenKIM can archive and test almost all IMs that can be +used by LAMMPS. (It is easy to contribute new IMs to OpenKIM, see the +`upload instructions `_.) OpenKIM IMs are uniquely identified by a `KIM ID `_. @@ -155,7 +223,7 @@ By convention SM prefixes begin with *Sim_* to readily identify them. .. parsed-literal:: SW_StillingerWeber_1985_Si__MO_405512056662_005 - Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 + Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_001 Each OpenKIM IM has a dedicated "Model Page" on `OpenKIM `_ providing all the information on the IM including a title, description, @@ -169,16 +237,17 @@ The URL for the Model Page is constructed from the https://openkim.org/id/extended_KIM_ID -For example, for the Stillinger--Weber potential -listed above the Model Page is located at: +For example, for the Stillinger--Weber potential listed above the Model Page is +located at: .. parsed-literal:: `https://openkim.org/id/SW_StillingerWeber_1985_Si__MO_405512056662_005 `_ -See the `current list of KIM PMs and SMs archived in OpenKIM `_. -This list is sorted by species and can be filtered to display only -IMs for certain species combinations. +See the +`current list of KIM PMs and SMs archived in OpenKIM `_. +This list is sorted by species and can be filtered to display only IMs for +certain species combinations. See `Obtaining KIM Models `_ to learn how to install a pre-built binary of the OpenKIM Repository of Models. 
@@ -190,91 +259,87 @@ learn how to install a pre-built binary of the OpenKIM Repository of Models. Using OpenKIM IMs with LAMMPS ----------------------------- -Two commands are employed when using OpenKIM IMs, one to select the -IM and perform necessary initialization (\ *kim_init*\ ), and the second +Two sub-commands are employed when using OpenKIM IMs, one to select the +IM and perform necessary initialization (\ *kim init*\ ), and the second to set up the IM for use by executing any necessary LAMMPS commands -(\ *kim_interactions*\ ). Both are required. +(\ *kim interactions*\ ). Both are required. See the *examples/kim* directory for example input scripts that use KIM PMs and KIM SMs. -.. _kim_init command: +.. _init: -OpenKIM IM Initialization (*kim_init*) +OpenKIM IM Initialization (*kim init*) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The *kim_init* mode command must be issued **before** -the simulation box is created (normally at the top of the file). -This command sets the OpenKIM IM that will be used and may issue -additional commands changing LAMMPS default settings that are required -for using the selected IM (such as :doc:`units ` or -:doc:`atom_style `). If needed, those settings can be overridden, -however, typically a script containing a *kim_init* command -would not include *units* and *atom_style* commands. +The *kim* command followed by *init* sub-command must be issued **before** +the simulation box is created (normally at the top of the file). This command +sets the OpenKIM IM that will be used and may issue additional commands changing +LAMMPS default settings that are required for using the selected IM (such as +:doc:`units ` or :doc:`atom_style `). If needed, those +settings can be overridden, however, typically a script containing a *kim init* +command would not include *units* and *atom_style* commands. -The required arguments of *kim_init* are the *model* name of the -IM to be used in the simulation (for an IM archived in OpenKIM this is -its `extended KIM ID `_, and -the *user_units*, which are the LAMMPS :doc:`units style ` used -in the input script. (Any dimensioned numerical values in the input -script and values read in from files are expected to be in the -*user_units* system.) +The required arguments of *kim init* are the *model* name of the IM to be used +in the simulation (for an IM archived in OpenKIM this is its +`extended KIM ID `_, and the +*user_units*, which are the LAMMPS :doc:`units style ` used in the input +script. (Any dimensioned numerical values in the input script and values read in +from files are expected to be in the *user_units* system.) -The selected IM can be either a :ref:`KIM PM or a KIM SM `. -For a KIM SM, the *kim_init* command verifies that the SM is designed -to work with LAMMPS (and not another simulation code). -In addition, the LAMMPS version used for defining -the SM and the LAMMPS version being currently run are -printed to help diagnose any incompatible changes to input script or -command syntax between the two LAMMPS versions. +The selected IM can be either a :ref:`KIM PM or a KIM SM `. For a KIM +SM, the *kim init* command verifies that the SM is designed to work with LAMMPS +(and not another simulation code). In addition, the LAMMPS version used for +defining the SM and the LAMMPS version being currently run are printed to help +diagnose any incompatible changes to input script or command syntax between the +two LAMMPS versions. -Based on the selected model *kim_init* may modify the -:doc:`atom_style `. 
-Some SMs have requirements for this setting. If this is the case, then -*atom_style* will be set to the required style. Otherwise, the value is left -unchanged (which in the absence of an *atom_style* command in the input script -is the :doc:`default atom_style value `). +Based on the selected model *kim init* may modify the +:doc:`atom_style `. Some SMs have requirements for this setting. +If this is the case, then *atom_style* will be set to the required style. +Otherwise, the value is left unchanged (which in the absence of an *atom_style* +command in the input script is the +:doc:`default atom_style value `). -Regarding units, the *kim_init* command behaves in different ways depending -on whether or not *unit conversion mode* is activated as indicated by the -optional *unitarg* argument. -If unit conversion mode is **not** active, then *user_units* must -either match the required units of the IM or the IM must be able -to adjust its units to match. (The latter is only possible with some KIM PMs; -SMs can never adjust their units.) If a match is possible, the LAMMPS -:doc:`units ` command is called to set the units to -*user_units*\ . If the match fails, the simulation is terminated with -an error. +Regarding units, the *kim init* behaves in different ways depending on whether +or not *unit conversion mode* is activated as indicated by the optional +*unitarg* argument. +If unit conversion mode is **not** active, then *user_units* must either match +the required units of the IM or the IM must be able to adjust its units to +match. (The latter is only possible with some KIM PMs; SMs can never adjust +their units.) If a match is possible, the LAMMPS :doc:`units ` command is +called to set the units to *user_units*\ . If the match fails, the simulation is +terminated with an error. -Here is an example of a LAMMPS script to compute the cohesive energy -of a face-centered cubic (fcc) lattice for the Ercolessi and Adams (1994) -potential for Al: +Here is an example of a LAMMPS script to compute the cohesive energy of a +face-centered cubic (fcc) lattice for the MEAM potential by Pascuet and +Fernandez (2015) for Al. .. code-block:: LAMMPS - kim_init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal - boundary p p p - lattice fcc 4.032 - region simbox block 0 1 0 1 0 1 units lattice - create_box 1 simbox - create_atoms 1 box - mass 1 26.981539 - kim_interactions Al - run 0 - variable Ec equal (pe/count(all))/${_u_energy} - print "Cohesive Energy = ${EcJ} eV" + kim init Sim_LAMMPS_MEAM_PascuetFernandez_2015_Al__SM_811588957187_000 metal + boundary p p p + lattice fcc 4.049 + region simbox block 0 1 0 1 0 1 units lattice + create_box 1 simbox + create_atoms 1 box + mass 1 26.981539 + kim interactions Al + run 0 + variable Ec equal (pe/count(all)) + print "Cohesive Energy = ${Ec} eV" -The above script will end with an error in the *kim_init* line if the -IM is changed to another potential for Al that does not work with *metal* -units. To address this *kim_init* offers the *unit_conversion_mode* -as shown below. -If unit conversion mode *is* active, then *kim_init* calls the LAMMPS -:doc:`units ` command to set the units to the IM's required or -preferred units. Conversion factors between the IM's units and the *user_units* -are defined for all :doc:`physical quantities ` (mass, distance, etc.). +The above script will end with an error in the *kim init* line if the IM is +changed to another potential for Al that does not work with *metal* units. 
To +address this, *kim init* offers the *unit_conversion_mode* as shown below. + +If unit conversion mode *is* active, then *kim init* calls the LAMMPS +:doc:`units ` command to set the units to the IM's required or preferred +units. Conversion factors between the IM's units and the *user_units* are +defined for all :doc:`physical quantities ` (mass, distance, etc.). (Note that converting to or from the "lj" unit style is not supported.) -These factors are stored as :doc:`internal style variables ` with -the following standard names: +These factors are stored as :doc:`internal style variables ` with the +following standard names: .. parsed-literal:: @@ -297,127 +362,125 @@ If desired, the input script can be designed to work with these conversion factors so that the script will work without change with any OpenKIM IM. (This approach is used in the `OpenKIM Testing Framework `_.) -For example, the script given above for the cohesive energy of fcc Al -can be rewritten to work with any IM regardless of units. The following -script constructs an fcc lattice with a lattice parameter defined in -meters, computes the total energy, and prints the cohesive energy in -Joules regardless of the units of the IM. + +For example, the script given above for the cohesive energy of fcc Al can be +rewritten to work with any IM regardless of units. The following script +constructs an fcc lattice with a lattice parameter defined in meters, computes +the total energy, and prints the cohesive energy in Joules regardless of the +units of the IM. .. code-block:: LAMMPS - kim_init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 si unit_conversion_mode - boundary p p p - lattice fcc 4.032e-10*${_u_distance} - region simbox block 0 1 0 1 0 1 units lattice - create_box 1 simbox - create_atoms 1 box - mass 1 4.480134e-26*${_u_mass} - kim_interactions Al - run 0 - variable Ec_in_J equal (pe/count(all))/${_u_energy} - print "Cohesive Energy = ${Ec_in_J} J" + kim init Sim_LAMMPS_MEAM_PascuetFernandez_2015_Al__SM_811588957187_000 si unit_conversion_mode + boundary p p p + lattice fcc $(4.049e-10*v__u_distance) + region simbox block 0 1 0 1 0 1 units lattice + create_box 1 simbox + create_atoms 1 box + mass 1 $(4.480134e-26*v__u_mass) + kim interactions Al + neighbor $(0.001e-10*v__u_distance) bin + run 0 + variable Ec_in_J equal (pe/count(all))/v__u_energy + print "Cohesive Energy = ${Ec_in_J} J" -Note the multiplication by ${_u_distance} and ${_u_mass} to convert -from SI units (specified in the *kim_init* command) to whatever units the -IM uses (metal in this case), and the division by ${_u_energy} -to convert from the IM's energy units to SI units (Joule). This script -will work correctly for any IM for Al (KIM PM or SM) selected by the -*kim_init* command. +Note the multiplication by `v__u_distance` and `v__u_mass` to convert from SI +units (specified in the *kim init* command) to whatever units the IM uses (metal +in this case), and the division by `v__u_energy` to convert from the IM's energy +units to SI units (Joule). This script will work correctly for any IM for Al +(KIM PM or SM) selected by the *kim init* command. Care must be taken to apply unit conversion to dimensional variables read in -from a file. For example, if a configuration of atoms is read in from a -dump file using the :doc:`read_dump ` command, the following can -be done to convert the box and all atomic positions to the correct units: +from a file. 
For example, if a configuration of atoms is read in from a dump +file using the :doc:`read_dump ` command, the following can be done +to convert the box and all atomic positions to the correct units: + .. code-block:: LAMMPS - variable xyfinal equal xy*${_u_distance} - variable xzfinal equal xz*${_u_distance} - variable yzfinal equal yz*${_u_distance} change_box all x scale ${_u_distance} & - y scale ${_u_distance} & - z scale ${_u_distance} & - xy final ${xyfinal} & - xz final ${xzfinal} & - yz final ${yzfinal} & - remap + y scale ${_u_distance} & + z scale ${_u_distance} & + xy final $(xy*v__u_distance) & + xz final $(xz*v__u_distance) & + yz final $(yz*v__u_distance) & + remap .. note:: - Unit conversion will only work if the conversion factors are placed in - all appropriate places in the input script. It is up to the user to do this + Unit conversion will only work if the conversion factors are placed in all + appropriate places in the input script. It is up to the user to do this correctly. -.. _kim_interactions command: +.. _interactions: -OpenKIM IM Execution (*kim_interactions*) +OpenKIM IM Execution (*kim interactions*) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The second and final step in using an OpenKIM IM is to execute the -*kim_interactions* command. This command must be preceded by a *kim_init* +*kim interactions* command. This command must be preceded by a *kim init* command and a command that defines the number of atom types *N* (such as :doc:`create_box `). -The *kim_interactions* command has one argument *typeargs*\ . This argument +The *kim interactions* command has one argument *typeargs*\ . This argument contains either a list of *N* chemical species, which defines a mapping between -atom types in LAMMPS to the available species in the OpenKIM IM, or the -keyword *fixed_types* for models that have a preset fixed mapping (i.e. -the mapping between LAMMPS atom types and chemical species is defined by -the model and cannot be changed). In the latter case, the user must consult -the model documentation to see how many atom types there are and how they -map to the chemical species. +atom types in LAMMPS to the available species in the OpenKIM IM, or the keyword +*fixed_types* for models that have a preset fixed mapping (i.e. the mapping +between LAMMPS atom types and chemical species is defined by the model and +cannot be changed). In the latter case, the user must consult the model +documentation to see how many atom types there are and how they map to the +chemical species. -For example, consider an OpenKIM IM that supports Si and C species. -If the LAMMPS simulation has four atom types, where the first three are Si, -and the fourth is C, the following *kim_interactions* command would be used: +For example, consider an OpenKIM IM that supports Si and C species. If the +LAMMPS simulation has four atom types, where the first three are Si, and the +fourth is C, the following *kim interactions* command would be used: .. code-block:: LAMMPS - kim_interactions Si Si Si C + kim interactions Si Si Si C Alternatively, for a model with a fixed mapping the command would be: .. code-block:: LAMMPS - kim_interactions fixed_types + kim interactions fixed_types -The *kim_interactions* command performs all the necessary steps to set up -the OpenKIM IM selected in the *kim_init* command. The specific actions depend -on whether the IM is a KIM PM or a KIM SM. For a KIM PM, -a :doc:`pair_style kim ` command is executed followed by -the appropriate *pair_coeff* command. 
For example, for the -Ercolessi and Adams (1994) KIM PM for Al set by the following commands: +The *kim interactions* command performs all the necessary steps to set up the +OpenKIM IM selected in the *kim_init* command. The specific actions depend on +whether the IM is a KIM PM or a KIM SM. For a KIM PM, a +:doc:`pair_style kim ` command is executed followed by the appropriate +*pair_coeff* command. For example, for the Ercolessi and Adams (1994) KIM PM for +Al set by the following commands: .. code-block:: LAMMPS - kim_init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal + kim init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal ... ... box specification lines skipped ... - kim_interactions Al + kim interactions Al -the *kim_interactions* command executes the following LAMMPS input commands: +the *kim interactions* command executes the following LAMMPS input commands: .. code-block:: LAMMPS pair_style kim EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 pair_coeff * * Al -For a KIM SM, the generated input commands may be more complex -and require that LAMMPS is built with the required packages included -for the type of potential being used. The set of commands to be executed -is defined in the SM specification file, which is part of the SM package. -For example, for the Strachan et al. (2003) ReaxFF SM -set by the following commands: +For a KIM SM, the generated input commands may be more complex and require that +LAMMPS is built with the required packages included for the type of potential +being used. The set of commands to be executed is defined in the SM +specification file, which is part of the SM package. For example, for the +Strachan et al. (2003) ReaxFF SM set by the following commands: .. code-block:: LAMMPS - kim_init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 real + kim init Sim_LAMMPS_ReaxFF_StrachanVanDuinChakraborty_2003_CHNO__SM_107643900657_000 real ... ... box specification lines skipped ... - kim_interactions C H N O + kim interactions C H N O -the *kim_interactions* command executes the following LAMMPS input commands: +the *kim interactions* command executes the following LAMMPS input commands: .. code-block:: LAMMPS @@ -427,325 +490,354 @@ the *kim_interactions* command executes the following LAMMPS input commands: .. note:: - The files *lmp_control*, *ffield.reax.rdx* and *param.qeq* - are specific to the Strachan et al. (2003) ReaxFF parameterization - and are archived as part of the SM package in OpenKIM. + The files *lmp_control*, *ffield.reax.rdx* and *param.qeq* are specific to + the Strachan et al. (2003) ReaxFF parameterization and are archived as part + of the SM package in OpenKIM. .. note:: - Parameters like cutoff radii and charge tolerances, - which have an effect on IM predictions, are also included in the - SM definition ensuring reproducibility. + Parameters like cutoff radii and charge tolerances, which have an effect on + IM predictions, are also included in the SM definition ensuring + reproducibility. .. note:: - When using *kim_init* and *kim_interactions* to select - and set up an OpenKIM IM, other LAMMPS commands - for the same functions (such as pair_style, pair_coeff, bond_style, - bond_coeff, fixes related to charge equilibration, etc.) should normally - not appear in the input script. 
+ When using *kim init* and *kim interactions* to select and set up an OpenKIM + IM, other LAMMPS commands for the same functions (such as pair_style, + pair_coeff, bond_style, bond_coeff, fixes related to charge equilibration, + etc.) should normally not appear in the input script. -.. _kim_query command: +.. note:: -Using OpenKIM Web Queries in LAMMPS (*kim_query*) + Changing a periodic boundary to a non-periodic one, or in general using the + :doc:`change_box ` command after the interactions are set via + *kim interactions* or *pair_coeff* commands might affect some of the + settings. For example, SM models containing Coulombic terms in the + interactions require different settings if a periodic boundary changes to a + non-periodic one. In these cases, *kim interactions* must be called again + after the *change_box* command to provide the correct settings. + +.. _query: + +Using OpenKIM Web Queries in LAMMPS (*kim query*) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The *kim_query* command performs a web query to retrieve the predictions -of an IM set by *kim_init* for material properties archived in +The *kim query* command performs a web query to retrieve the predictions of an +IM set by *kim init* for material properties archived in `OpenKIM `_. -.. note:: - - The *kim_query* command must be preceded by a *kim_init* command. - -The syntax for the *kim_query* command is as follows: - +The syntax for the *kim query* command is as follows: .. code-block:: LAMMPS - kim_query variable formatarg query_function queryargs + kim query variable formatarg query_function queryargs The result of the query is stored in one or more -:doc:`string style variables ` as determined by the -optional *formatarg* argument :ref:`documented above `. -For the "list" setting of *formatarg* (or if *formatarg* is not -specified), the result is returned as a space-separated list of -values in *variable*\ . -The *formatarg* keyword "split" separates the result values into -individual variables of the form *prefix_I*, where *prefix* is set to the -*kim_query* *variable* argument and *I* ranges from 1 to the number of -returned values. The number and order of the returned values is determined -by the type of query performed. (Note that the "explicit" setting of -*formatarg* is not supported by *kim_query*\ .) +:doc:`string style variables ` as determined by the optional +*formatarg* argument :ref:`documented above `. For the "list" +setting of *formatarg* (or if *formatarg* is not specified), the result is +returned as a space-separated list of values in *variable*\ . The *formatarg* +keyword "split" separates the result values into individual variables of the +form *prefix_I*, where *prefix* is set to the *kim query* *variable* argument +and *I* ranges from 1 to the number of returned values. The number and order of +the returned values is determined by the type of query performed. The +*formatarg* keyword "index" returns a :doc:`variable style index ` +that can be incremented via the :doc:`next ` command. This enables the +construction of simple loops over the returned values by the type of query +performed. (Note that the "explicit" setting of *formatarg* is not supported by +*kim query*\ .) .. note:: - *kim_query* only supports queries that return a single result or - an array of values. More complex queries that return a JSON structure - are not currently supported. An attempt to use *kim_query* in such - cases will generate an error. 
+  *kim query* only supports queries that return a single result or an array of
+  values. More complex queries that return a JSON structure are not currently
+  supported. An attempt to use *kim query* in such cases will generate an
+  error.

-The second required argument *query_function* is the name of the
-query function to be called (e.g. *get_lattice_constant_cubic*\ ).
-All following :doc:`arguments ` are parameters handed over to
-the web query in the format *keyword=value*\ , where *value* is always
-an array of one or more comma-separated items in brackets.
-The list of supported keywords and the type and format of their values
-depend on the query function used. The current list of query functions
-is available on the OpenKIM webpage at
+The second required argument *query_function* is the name of the query function
+to be called (e.g. *get_lattice_constant_cubic*\ ). All following
+:doc:`arguments ` are parameters handed over to the web query in
+the format *keyword=value*\ , where *value* is always an array of one or more
+comma-separated items in brackets. The list of supported keywords and the type
+and format of their values depend on the query function used. The current list
+of query functions is available on the OpenKIM webpage at
 `https://openkim.org/doc/usage/kim-query `_.

 .. note::

-   All query functions require the *model* keyword, which identifies
-   the IM whose predictions are being queried. This keyword is automatically
-   generated by *kim_query* based on the IM set in *kim_init* and must not
-   be specified as an argument to *kim_query*\ .
+  All query functions, except the *get_available_models* function, require the
+  *model* keyword, which identifies the IM whose predictions are being queried.
+  *kim query* automatically generates the *model* keyword based on the IM set
+  by *kim init*, and it can be overridden by giving it explicitly as an
+  argument to *kim query*\ . When *kim init* is not used, the *model* keyword
+  must be provided as an argument to *kim query*\ .

 .. note::

-   Each *query_function* is associated with a default method (implemented
-   as a `KIM Test `_)
-   used to compute this property. In cases where there are multiple
-   methods in OpenKIM for computing a property, a *method* keyword can
-   be provided to select the method of choice. See the
-   `query documentation `_
-   to see which methods are available for a given *query_function*\ .
+  Each *query_function* is associated with a default method (implemented as a
+  `KIM Test `_) used to compute
+  this property. In cases where there are multiple methods in OpenKIM for
+  computing a property, a *method* keyword can be provided to select the method
+  of choice. See the
+  `query documentation `_ to see which
+  methods are available for a given *query_function*\ .

-*kim_query* Usage Examples and Further Clarifications
+*kim query* Usage Examples and Further Clarifications
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-The data obtained by *kim_query* commands can be used as part of the setup
-or analysis phases of LAMMPS simulations. Some examples are given below.
+The data obtained by *kim query* commands can be used as part of the setup or
+analysis phases of LAMMPS simulations. Some examples are given below.

 **Define an equilibrium fcc crystal**

 .. 
code-block:: LAMMPS - kim_init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal - boundary p p p - kim_query a0 get_lattice_constant_cubic crystal=["fcc"] species=["Al"] units=["angstrom"] - lattice fcc ${a0} + kim init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal + boundary p p p + kim query a0 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] + lattice fcc ${a0} ... -The *kim_query* command retrieves from `OpenKIM `_ -the equilibrium lattice constant predicted by the Ercolessi and Adams (1994) -potential for the fcc structure and places it in -variable *a0*\ . This variable is then used on the next line to set up the -crystal. By using *kim_query*, the user is saved the trouble and possible -error of tracking this value down, or of having to perform an energy -minimization to find the equilibrium lattice constant. +.. code-block:: LAMMPS + + units metal + kim query a0 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005] + lattice fcc ${a0} + ... + +The *kim query* command retrieves from `OpenKIM `_ the +equilibrium lattice constant predicted by the Ercolessi and Adams (1994) +potential for the fcc structure and places it in variable *a0*\ . This variable +is then used on the next line to set up the crystal. By using *kim query*, the +user is saved the trouble and possible error of tracking this value down, or of +having to perform an energy minimization to find the equilibrium lattice +constant. .. note:: - In *unit_conversion_mode* the results obtained from a - *kim_query* would need to be converted to the appropriate units system. - For example, in the above script, the lattice command would need to be - changed to: "lattice fcc ${a0}*${_u_distance}". + In *unit_conversion_mode* the results obtained from a *kim query* would need + to be converted to the appropriate units system. For example, in the above + script, the lattice command would need to be changed to: + "lattice fcc $(v_a0*v__u_distance)". **Define an equilibrium hcp crystal** .. code-block:: LAMMPS - kim_init EAM_Dynamo_Mendelev_2007_Zr__MO_848899341753_000 metal - boundary p p p - kim_query latconst split get_lattice_constant_hexagonal crystal=["hcp"] species=["Zr"] units=["angstrom"] - variable a0 equal latconst_1 - variable c0 equal latconst_2 - variable c_to_a equal ${c0}/${a0} - lattice custom ${a0} a1 0.5 -0.866025 0 a2 0.5 0.866025 0 a3 0 0 ${c_to_a} & - basis 0.333333 0.666666 0.25 basis 0.666666 0.333333 0.75 + kim init EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000 metal + boundary p p p + kim query latconst split get_lattice_constant_hexagonal crystal=[hcp] species=[Zr] units=[angstrom] + lattice custom ${latconst_1} a1 0.5 -0.866025 0 a2 0.5 0.866025 0 a3 0 0 $(latconst_2/latconst_1) & + basis 0.333333 0.666666 0.25 basis 0.666666 0.333333 0.75 ... -In this case the *kim_query* returns two arguments (since the hexagonal -close packed (hcp) structure has two independent lattice constants). -The *formatarg* keyword "split" places the two values into -the variables *latconst_1* and *latconst_2*\ . (These variables are -created if they do not already exist.) For convenience the variables -*a0* and *c0* are created in order to make the remainder of the -input script more readable. +In this case the *kim query* returns two arguments (since the hexagonal close +packed (hcp) structure has two independent lattice constants). 
The *formatarg*
+keyword "split" places the two values into the variables *latconst_1* and
+*latconst_2*\ . (These variables are created if they do not already exist.)
 
 **Define a crystal at finite temperature accounting for thermal expansion**
 
 .. code-block:: LAMMPS
 
-   kim_init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal
-   boundary p p p
-   kim_query a0 get_lattice_constant_cubic crystal=["fcc"] species=["Al"] units=["angstrom"]
-   kim_query alpha get_linear_thermal_expansion_coefficient_cubic crystal=["fcc"] species=["Al"] units=["1/K"] temperature=[293.15] temperature_units=["K"]
-   variable DeltaT equal 300
-   lattice fcc ${a0}*${alpha}*${DeltaT}
+   kim init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal
+   boundary p p p
+   kim query a0 get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom]
+   kim query alpha get_linear_thermal_expansion_coefficient_cubic crystal=[fcc] species=[Al] units=[1/K] temperature=[293.15] temperature_units=[K]
+   variable DeltaT equal 300
+   lattice fcc $(v_a0*(1.0+v_alpha*v_DeltaT))
    ...
 
-As in the previous example, the equilibrium lattice constant is obtained
-for the Ercolessi and Adams (1994) potential. However, in this case the
-crystal is scaled to the appropriate lattice constant at room temperature
-(293.15 K) by using the linear thermal expansion constant predicted by the
-potential.
+As in the previous example, the equilibrium lattice constant is obtained for the
+Ercolessi and Adams (1994) potential. However, in this case the crystal is
+scaled to the appropriate lattice constant at room temperature (293.15 K) by
+using the linear thermal expansion constant predicted by the potential.
 
 .. note::
 
-   When passing numerical values as arguments (as in the case
-   of the temperature in the above example) it is also possible to pass a
-   tolerance indicating how close to the value is considered a match.
-   If no tolerance is passed a default value is used. If multiple results
-   are returned (indicating that the tolerance is too large), *kim_query*
-   will return an error. See the
-   `query documentation `_
-   to see which numerical arguments and tolerances are available for a
-   given *query_function*\ .
+   When passing numerical values as arguments (as in the case of the temperature
+   in the above example) it is also possible to pass a tolerance indicating how
+   close to the value is considered a match. If no tolerance is passed a default
+   value is used. If multiple results are returned (indicating that the
+   tolerance is too large), *kim query* will return an error. See the
+   `query documentation `_ to see which
+   numerical arguments and tolerances are available for a given
+   *query_function*\ .
 
 **Compute defect formation energy**
 
 .. code-block:: LAMMPS
 
-   kim_init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal
+   kim init EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005 metal
    ...
    ... Build fcc crystal containing some defect and compute the total energy
    ... which is stored in the variable *Etot*
    ...
-   kim_query Ec get_cohesive_energy_cubic crystal=["fcc"] species=["Al"] units=["eV"]
-   variable Eform equal ${Etot} - count(all)*${Ec}
+   kim query Ec get_cohesive_energy_cubic crystal=[fcc] species=[Al] units=[eV]
+   variable Eform equal ${Etot} - count(all)*${Ec}
    ... 
-The defect formation energy *Eform* is computed by subtracting from *Etot* the
-ideal fcc cohesive energy of the atoms in the system obtained from
+The defect formation energy *Eform* is computed by subtracting the ideal fcc
+cohesive energy of the atoms in the system from *Etot*\ . The ideal fcc
+cohesive energy of the atoms is obtained from
 `OpenKIM `_ for the Ercolessi and Adams (1994)
 potential.
 
+**Retrieve the equilibrium fcc lattice constant for all EAM potentials that support a specific species**
+
+.. code-block:: LAMMPS
+
+   kim query model index get_available_models species=[Al] potential_type=[eam]
+   label model_loop
+   kim query latconst get_lattice_constant_cubic crystal=[fcc] species=[Al] units=[angstrom] model=[${model}]
+   print "FCC lattice constant (${model} potential) = ${latconst}"
+   ...
+   ... do something with current value of latconst
+   ...
+   next model
+   jump SELF model_loop
+
+In this example, the *index* mode of *formatarg* is used. The first *kim query*
+returns the list of all available EAM potentials that support the *Al* species
+and are archived in `OpenKIM `_. The result of the query
+operation is stored in the LAMMPS variable *model* as an *index*-style variable.
+This variable is used later to access the values one at a time within a loop as
+shown in the example. The second *kim query* command retrieves from
+`OpenKIM `_ the equilibrium lattice constant predicted by
+each potential for the fcc structure and places it in variable *latconst*\ .
+
 .. note::
 
-   *kim_query* commands return results archived in
-   `OpenKIM `_. These results are obtained
-   using programs for computing material properties
-   (KIM Tests and KIM Test Drivers) that were contributed to OpenKIM.
-   In order to give credit to Test developers, the number of times results
-   from these programs are queried is tracked. No other information about
-   the nature of the query or its source is recorded.
+   *kim query* commands return results archived in
+   `OpenKIM `_. These results are obtained using programs
+   for computing material properties (KIM Tests and KIM Test Drivers) that were
+   contributed to OpenKIM. In order to give credit to Test developers, the
+   number of times results from these programs are queried is tracked. No other
+   information about the nature of the query or its source is recorded.
 
-.. _kim_param command:
-Accessing KIM Model Parameters from LAMMPS (*kim_param*)
+.. _param:
+
+Accessing KIM Model Parameters from LAMMPS (*kim param*)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-All IMs are functional forms containing a set of
-parameters. The values of these parameters are typically
-selected to best reproduce a training set of quantum mechanical
-calculations or available experimental data. For example, a
-Lennard-Jones potential intended to model argon might have the values of
-its two parameters, epsilon and sigma, fit to the
-dimer dissociation energy or thermodynamic properties at a critical point
-of the phase diagram.
+All IMs are functional forms containing a set of parameters. These parameters'
+values are typically selected to best reproduce a training set of quantum
+mechanical calculations or available experimental data. For example, a
+Lennard-Jones potential intended to model argon might have the values of its two
+parameters, epsilon and sigma, fit to the dimer dissociation energy or
+thermodynamic properties at a critical point of the phase diagram. 
-Normally a user employing an IM should not modify its parameters since,
-as noted above, these are selected to reproduce material properties.
-However, there are cases where accessing and modifying IM parameters
-is desired, such as for assessing uncertainty, fitting an IM,
-or working with an ensemble of IMs. As explained :ref:`above `,
-IMs archived in OpenKIM are either Portable Models (PMs) or
-Simulator Models (SMs). KIM PMs are complete independent implementations
-of an IM, whereas KIM SMs are wrappers to an IM implemented within LAMMPS.
-Two different mechanisms are provided for accessing IM parameters in these
-two cases:
+Normally a user employing an IM should not modify its parameters since, as noted
+above, these are selected to reproduce material properties. However, there are
+cases where accessing and modifying IM parameters is desired, such as for
+assessing uncertainty, fitting an IM, or working with an ensemble of IMs. As
+explained :ref:`above `, IMs archived in OpenKIM are either Portable
+Models (PMs) or Simulator Models (SMs). KIM PMs are complete independent
+implementations of an IM, whereas KIM SMs are wrappers to an IM implemented
+within LAMMPS. Two different mechanisms are provided for accessing IM parameters
+in these two cases:
 
-* For a KIM PM, the *kim_param* command can be used to *get* and *set* the values of the PM's parameters as explained below.
-* For a KIM SM, the user should consult the documentation page for the specific IM and follow instructions there for how to modify its parameters (if possible).
+* For a KIM PM, the *kim param* command can be used to *get* and *set* the
+  values of the PM's parameters as explained below.
+* For a KIM SM, the user should consult the documentation page for the specific
+  IM and follow instructions there for how to modify its parameters (if
+  possible).
 
-The *kim_param get* and *kim_param set* commands provide an interface
-to access and change the parameters of a KIM PM that "publishes" its
-parameters and makes them publicly available (see the
+The *kim param get* and *kim param set* commands provide an interface to access
+and change the parameters of a KIM PM that "publishes" its parameters and makes
+them publicly available (see the
 `KIM API documentation `_
 for details).
 
 .. note::
 
-   The *kim_param get/set* commands must be preceded by *kim_init*\ .
-   The *kim_param set* command must additionally be preceded by a
-   *kim_interactions* command (or alternatively by a *pair_style kim*
-   and *pair_coeff* commands). The *kim_param set* command may be used wherever a *pair_coeff* command may occur.
+   The *kim param get/set* commands must be preceded by *kim init*\ .
+   The *kim param set* command must additionally be preceded by a
+   *kim interactions* command (or alternatively by the *pair_style kim* and
+   *pair_coeff* commands). The *kim param set* command may be used wherever a
+   *pair_coeff* command may occur.
 
-The syntax for the *kim_param* command is as follows:
+The syntax for the *kim param* command is as follows:
 
 .. code-block:: LAMMPS
 
-   kim_param get param_name index_range variable formatarg
-   kim_param set param_name index_range values
+   kim param get param_name index_range variable formatarg
+   kim param set param_name index_range values
 
-Here, *param_name* is the name of a KIM PM parameter (which is published
-by the PM and available for access). The specific string used to identify
-a parameter is defined by the PM. 
For example, for the +Here, *param_name* is the name of a KIM PM parameter (which is published by the +PM and available for access). The specific string used to identify a parameter +is defined by the PM. For example, for the `Stillinger--Weber (SW) potential in OpenKIM `_, the parameter names are *A, B, p, q, sigma, gamma, cutoff, lambda, costheta0*\ . .. note:: The list of all the parameters that a PM exposes for access/mutation are - automatically written to the lammps log file when *kim_init* is called. + automatically written to the lammps log file when *kim init* is called. -Each published parameter of a KIM PM takes the form of an array of -numerical values. The array can contain one element for a single-valued -parameter, or a set of values. For example, the +Each published parameter of a KIM PM takes the form of an array of numerical +values. The array can contain one element for a single-valued parameter, or a +set of values. For example, the `multispecies SW potential for the Zn-Cd-Hg-S-Se-Te system `_ has the same parameter names as the `single-species SW potential `_, but each parameter array contains 21 entries that correspond to the parameter values used for each pairwise combination of the model's six supported species -(this model does not have parameters specific to individual ternary -combinations of its supported species). +(this model does not have parameters specific to individual ternary combinations +of its supported species). -The *index_range* argument may either be an integer referring to -a specific element within the array associated with the parameter -specified by *param_name*, or a pair of integers separated by a colon -that refer to a slice of this array. In both cases, one-based indexing is -used to refer to the entries of the array. +The *index_range* argument may either be an integer referring to a specific +element within the array associated with the parameter specified by +*param_name*, or a pair of integers separated by a colon that refer to a slice +of this array. In both cases, one-based indexing is used to refer to the +entries of the array. The result of a *get* operation for a specific *index_range* is stored in -one or more :doc:`LAMMPS string style variables ` as determined -by the optional *formatarg* argument :ref:`documented above. ` -If not specified, the default for *formatarg* is "explicit" for the -*kim_param* command. +one or more :doc:`LAMMPS string style variables ` as determined by the +optional *formatarg* argument :ref:`documented above. ` If +not specified, the default for *formatarg* is "explicit" for the *kim param* +command. -For the case where the result is an array with multiple values -(i.e. *index_range* contains a range), the optional "split" or "explicit" -*formatarg* keywords can be used to separate the results into multiple -variables; see the examples below. -Multiple parameters can be retrieved with a single call to *kim_param get* -by repeating the argument list following *get*\ . +For the case where the result is an array with multiple values (i.e. +*index_range* contains a range), the optional "split" or "explicit" *formatarg* +keywords can be used to separate the results into multiple variables; see the +examples below. Multiple parameters can be retrieved with a single call to +*kim param get* by repeating the argument list following *get*\ . -For a *set* operation, the *values* argument contains the new value(s) -for the element(s) of the parameter specified by *index_range*\ . 
For the case -where multiple values are being set, *values* contains a set of values -separated by spaces. Multiple parameters can be set with a single call to -*kim_param set* by repeating the argument list following *set*\ . +For a *set* operation, the *values* argument contains the new value(s) for the +element(s) of the parameter specified by *index_range*\ . For the case where +multiple values are being set, *values* contains a set of values separated by +spaces. Multiple parameters can be set with a single call to *kim param set* by +repeating the argument list following *set*\ . -*kim_param* Usage Examples and Further Clarifications +*kim param* Usage Examples and Further Clarifications ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Examples of getting and setting KIM PM parameters with further -clarifications are provided below. +Examples of getting and setting KIM PM parameters with further clarifications +are provided below. **Getting a scalar parameter** .. code-block:: LAMMPS - kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 metal + kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 metal ... - kim_param get A 1 VARA + kim param get A 1 VARA -In this case, the value of the SW *A* parameter is retrieved and placed -in the LAMMPS variable *VARA*\ . The variable *VARA* can be used -in the remainder of the input script in the same manner as any other -LAMMPS variable. +In this case, the value of the SW *A* parameter is retrieved and placed in the +LAMMPS variable *VARA*\ . The variable *VARA* can be used in the remainder of +the input script in the same manner as any other LAMMPS variable. **Getting multiple scalar parameters with a single call** .. code-block:: LAMMPS - kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 metal + kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 metal ... - kim_param get A 1 VARA B 1 VARB + kim param get A 1 VARA B 1 VARB -This retrieves the *A* and *B* parameters of the SW potential and stores -them in the LAMMPS variables *VARA* and *VARB*\ . +In this example, it is shown how to retrieve the *A* and *B* parameters of the +SW potential and store them in the LAMMPS variables *VARA* and *VARB*\ . **Getting a range of values from a parameter** @@ -754,9 +846,9 @@ determined by the *formatarg* argument. .. code-block:: LAMMPS - kim_init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal + kim init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal ... - kim_param get lambda 7:9 LAM_TeTe LAM_TeZn LAM_TeSe + kim param get lambda 7:9 LAM_TeTe LAM_TeZn LAM_TeSe In this case, *formatarg* is not specified and therefore the default "explicit" mode is used. (The behavior would be the same if the word @@ -766,166 +858,164 @@ lambda retrieved by the *get* operation are placed in the LAMMPS variables .. note:: - In the above example, elements 7--9 of the lambda parameter correspond - to Te-Te, Te-Zm and Te-Se interactions. This can be determined by visiting - the `model page for the specified potential `_ - and looking at its parameter file linked to at the bottom of the page - (file with .param ending) and consulting the README documentation - provided with the driver for the PM being used. A link to the driver - is provided at the top of the model page. + In the above example, elements 7--9 of the lambda parameter correspond to + Te-Te, Te-Zm and Te-Se interactions. 
This can be determined by visiting the + `model page for the specified potential `_ + and looking at its parameter file linked to at the bottom of the page (file + with .param ending) and consulting the README documentation provided with the + driver for the PM being used. A link to the driver is provided at the top of + the model page. .. code-block:: LAMMPS - kim_init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal + kim init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal ... - kim_param get lambda 15:17 LAMS list - variable LAM_VALUE index ${LAMS} - label loop_on_lambda + kim param get lambda 15:17 LAMS list + variable LAM_VALUE index ${LAMS} + label loop_on_lambda ... - ... do something with current value of lambda + ... do something with the current value of lambda ... - next LAM_VALUE - jump SELF loop_on_lambda + next LAM_VALUE + jump SELF loop_on_lambda -In this case, the "list" mode of *formatarg* is used. -The result of the *get* operation is stored in the LAMMPS variable -*LAMS* as a string containing the three retrieved values separated -by spaces, e.g "1.0 2.0 3.0". This can be used in LAMMPS with an -*index* variable to access the values one at a time within a loop -as shown in the example. At each iteration of the loop *LAM_VALUE* -contains the current value of lambda. +In this case, the "list" mode of *formatarg* is used. The result of the *get* +operation is stored in the LAMMPS variable *LAMS* as a string containing the +three retrieved values separated by spaces, e.g "1.0 2.0 3.0". This can be used +in LAMMPS with an *index* variable to access the values one at a time within a +loop as shown in the example. At each iteration of the loop *LAM_VALUE* contains +the current value of lambda. .. code-block:: LAMMPS - kim_init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal + kim init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal ... - kim_param get lambda 15:17 LAM split + kim param get lambda 15:17 LAM split -In this case, the "split" mode of *formatarg* is used. -The three values retrieved by the *get* operation are stored in -the three LAMMPS variables *LAM_15*, *LAM_16* and *LAM_17*\ . -The provided name "LAM" is used as prefix and the location in -the lambda array is appended to create the variable names. +In this case, the "split" mode of *formatarg* is used. The three values +retrieved by the *get* operation are stored in the three LAMMPS variables +*LAM_15*, *LAM_16* and *LAM_17*\ . The provided name "LAM" is used as prefix and +the location in the lambda array is appended to create the variable names. **Setting a scalar parameter** .. code-block:: LAMMPS - kim_init SW_StillingerWeber_1985_Si__MO_405512056662_005 metal + kim init SW_StillingerWeber_1985_Si__MO_405512056662_005 metal ... - kim_interactions Si - kim_param set gamma 1 2.6 + kim interactions Si + kim param set gamma 1 2.6 Here, the SW potential's gamma parameter is set to 2.6. Note that the *get* -and *set* commands work together, so that a *get* following a *set* -operation will return the new value that was set. For example: +and *set* commands work together, so that a *get* following a *set* operation +will return the new value that was set. For example, .. code-block:: LAMMPS ... - kim_interactions Si - kim_param get gamma 1 ORIG_GAMMA - kim_param set gamma 1 2.6 - kim_param get gamma 1 NEW_GAMMA + kim interactions Si + kim param get gamma 1 ORIG_GAMMA + kim param set gamma 1 2.6 + kim param get gamma 1 NEW_GAMMA ... 
- print "original gamma = ${ORIG_GAMMA}, new gamma = ${NEW_GAMMA}" + print "original gamma = ${ORIG_GAMMA}, new gamma = ${NEW_GAMMA}" -Here, *ORIG_GAMMA* will contain the original gamma value for the SW -potential, while *NEW_GAMMA* will contain the value 2.6. +Here, *ORIG_GAMMA* will contain the original gamma value for the SW potential, +while *NEW_GAMMA* will contain the value 2.6. **Setting multiple scalar parameters with a single call** .. code-block:: LAMMPS - kim_init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal + kim init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal ... - kim_interactions Cd Te - variable VARG equal 2.6 - variable VARS equal 2.0951 - kim_param set gamma 1 ${VARG} sigma 3 ${VARS} + kim interactions Cd Te + variable VARG equal 2.6 + variable VARS equal 2.0951 + kim param set gamma 1 ${VARG} sigma 3 ${VARS} -In this case, the first element of the *gamma* parameter and -third element of the *sigma* parameter are set to 2.6 and 2.0951, -respectively. This example also shows how LAMMPS variables can -be used when setting parameters. +In this case, the first element of the *gamma* parameter and third element of +the *sigma* parameter are set to 2.6 and 2.0951, respectively. This example +also shows how LAMMPS variables can be used when setting parameters. **Setting a range of values of a parameter** .. code-block:: LAMMPS - kim_init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal + kim init SW_ZhouWardMartin_2013_CdTeZnSeHgS__MO_503261197030_002 metal ... - kim_interactions Cd Te Zn Se Hg S - kim_param set sigma 2:6 2.35214 2.23869 2.04516 2.43269 1.80415 + kim interactions Cd Te Zn Se Hg S + kim param set sigma 2:6 2.35214 2.23869 2.04516 2.43269 1.80415 -In this case, elements 2 through 6 of the parameter *sigma* -are set to the values 2.35214, 2.23869, 2.04516, 2.43269 and 1.80415 in -order. +In this case, elements 2 through 6 of the parameter *sigma* are set to the +values 2.35214, 2.23869, 2.04516, 2.43269 and 1.80415 in order. -.. _kim_property command: +.. _property: -Writing material properties computed in LAMMPS to standard KIM property instance format (*kim_property*) +Writing material properties computed in LAMMPS to standard KIM property instance format (*kim property*) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -As explained :ref:`above`, -The OpenKIM system includes a collection of Tests (material property calculation codes), -Models (interatomic potentials), Predictions, and Reference Data (DFT or experiments). -Specifically, a KIM Test is a computation that when coupled with a KIM Model generates -the prediction of that model for a specific material property rigorously defined -by a KIM Property Definition (see the +As explained :ref:`above`, the OpenKIM system includes a +collection of Tests (material property calculation codes), Models (interatomic +potentials), Predictions, and Reference Data (DFT or experiments). Specifically, +a KIM Test is a computation that when coupled with a KIM Model generates the +prediction of that model for a specific material property rigorously defined by +a KIM Property Definition (see the `KIM Properties Framework `__ -for further details). A prediction of a material property for a given model is a specific -numerical realization of a property definition, referred to as a "Property -Instance." 
The objective of the *kim_property* command is to make it easy to -output material properties in a standardized, machine readable, format that can be easily -ingested by other programs. -Additionally, it aims to make it as easy as possible to convert a LAMMPS script that computes a -material property into a KIM Test that can then be uploaded to `openkim.org `_ +for further details). A prediction of a material property for a given model is a +specific numerical realization of a property definition, referred to as a +"Property Instance." The objective of the *kim property* command is to make it +easy to output material properties in a standardized, machine readable, format +that can be easily ingested by other programs. Additionally, it aims to make it +as easy as possible to convert a LAMMPS script that computes a material property +into a KIM Test that can then be uploaded to +`openkim.org `_ -A developer interested in creating a KIM Test using a LAMMPS script should -first determine whether a property definition that applies to their calculation +A developer interested in creating a KIM Test using a LAMMPS script should first +determine whether a property definition that applies to their calculation already exists in OpenKIM by searching the `properties page `_. If none exists, it is possible to use a locally defined property definition contained in a file until it can be uploaded to the official repository (see below). Once one or more applicable -property definitions have been identified, the *kim_property create*, -*kim_property modify*, *kim_property remove*, and *kim_property destroy*, +property definitions have been identified, the *kim property create*, +*kim property modify*, *kim property remove*, and *kim property destroy*, commands provide an interface to create, set, modify, remove, and destroy -instances of them within a LAMMPS script. Their general syntax is as follows: +instances of them within a LAMMPS script. Their general syntax is as follows, .. code-block:: LAMMPS - kim_property create instance_id property_id - kim_property modify instance_id key key_name key_name_key key_name_value - kim_property remove instance_id key key_name - kim_property destroy instance_id - kim_property dump file + kim property create instance_id property_id + kim property modify instance_id key key_name key_name_key key_name_value + kim property remove instance_id key key_name + kim property destroy instance_id + kim property dump file Here, *instance_id* is a positive integer used to uniquely identify each property instance; (note that the results file can contain multiple property -instances). A property_id is an identifier of a +instances). A *property_id* is an identifier of a `KIM Property Definition `_, which can be (1) a property short name, (2) the full unique ID of the property (including the contributor and date), (3) a file name corresponding to a local -property definition file. Examples of each of these cases are shown below: +property definition file. Examples of each of these cases are shown below, .. code-block:: LAMMPS - kim_property create 1 atomic-mass - kim_property create 2 cohesive-energy-relation-cubic-crystal + kim property create 1 atomic-mass + kim property create 2 cohesive-energy-relation-cubic-crystal .. 
code-block:: LAMMPS - kim_property create 1 tag:brunnels@noreply.openkim.org,2016-05-11:property/atomic-mass - kim_property create 2 tag:staff@noreply.openkim.org,2014-04-15:property/cohesive-energy-relation-cubic-crystal + kim property create 1 tag:brunnels@noreply.openkim.org,2016-05-11:property/atomic-mass + kim property create 2 tag:staff@noreply.openkim.org,2014-04-15:property/cohesive-energy-relation-cubic-crystal .. code-block:: LAMMPS - kim_property create 1 new-property.edn - kim_property create 2 /home/mary/marys-kim-properties/dissociation-energy.edn + kim property create 1 new-property.edn + kim property create 2 /home/mary/marys-kim-properties/dissociation-energy.edn -In the last example, "new-property.edn" and "/home/mary/marys-kim-properties/dissociation-energy.edn" are the -names of files that contain user-defined (local) property definitions. +In the last example, "new-property.edn" and +"/home/mary/marys-kim-properties/dissociation-energy.edn" are the names of files +that contain user-defined (local) property definitions. A KIM property instance takes the form of a "map," i.e. a set of key-value pairs akin to Perl's hash, Python's dictionary, or Java's Hashtable. It @@ -944,13 +1034,13 @@ as stipulated in the property definition. Each map assigned to a *key_name* must contain the *key_name_key* "source-value" and an associated *key_name_value* of the appropriate type (as defined in the relevant KIM Property Definition). For keys that are - defined as having physical units, the - "source-unit" *key_name_key* must also be given a string value recognized - by `GNU units `_. + defined as having physical units, the "source-unit" *key_name_key* must also + be given a string value recognized by + `GNU units `_. -Once a *kim_property create* command has been given to instantiate a property +Once a *kim property create* command has been given to instantiate a property instance, maps associated with the property's keys can be edited using the -*kim_property modify* command. In using this command, the special keyword +*kim property modify* command. In using this command, the special keyword "key" should be given, followed by the property key name and the key-value pair in the map associated with the key that is to be set. For example, the `atomic-mass `_ @@ -959,37 +1049,37 @@ An instance of this property could be created like so: .. code-block:: LAMMPS - kim_property create 1 atomic-mass - kim_property modify 1 key species source-value Al - kim_property modify 1 key mass source-value 26.98154 - kim_property modify 1 key mass source-unit amu + kim property create 1 atomic-mass + kim property modify 1 key species source-value Al + kim property modify 1 key mass source-value 26.98154 + kim property modify 1 key mass source-unit amu or, equivalently, .. code-block:: LAMMPS - kim_property create 1 atomic-mass - kim_property modify 1 key species source-value Al & + kim property create 1 atomic-mass + kim property modify 1 key species source-value Al & key mass source-value 26.98154 & source-unit amu -*kim_property* Usage Examples and Further Clarifications +*kim property* Usage Examples and Further Clarifications ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ **Create** .. 
code-block:: LAMMPS - kim_property create instance_id property_id + kim property create instance_id property_id -The *kim_property create* command takes as input a property instance ID and the +The *kim property create* command takes as input a property instance ID and the property definition name, and creates an initial empty property instance data structure. For example, .. code-block:: LAMMPS - kim_property create 1 atomic-mass - kim_property create 2 cohesive-energy-relation-cubic-crystal + kim property create 1 atomic-mass + kim property create 2 cohesive-energy-relation-cubic-crystal creates an empty property instance of the "atomic-mass" property definition with instance ID 1 and an empty instance of the @@ -1002,32 +1092,32 @@ path of a file containing a valid property definition. For example, .. code-block:: LAMMPS - kim_property create 1 new-property.edn + kim property create 1 new-property.edn where "new-property.edn" refers to a file name containing a new property definition that does not exist in OpenKIM. If the *property_id* given cannot be found in OpenKIM and no file of this name containing a valid property definition can be found, this command will produce -an error with an appropriate message. Calling *kim_property create* with the +an error with an appropriate message. Calling *kim property create* with the same instance ID multiple times will also produce an error. **Modify** .. code-block:: LAMMPS - kim_property modify instance_id key key_name key_name_key key_name_value + kim property modify instance_id key key_name key_name_key key_name_value -The *kim_property modify* command incrementally builds the property instance -by receiving property definition keys along with associated arguments. Each +The *kim property modify* command incrementally builds the property instance +by receiving property definition keys along with associated arguments. Each *key_name* is associated with a map containing one or more key-value pairs (in the form of *key_name_key*-*key_name_value* pairs). For example, .. code-block:: LAMMPS - kim_property modify 1 key species source-value Al - kim_property modify 1 key mass source-value 26.98154 - kim_property modify 1 key mass source-unit amu + kim property modify 1 key species source-value Al + kim property modify 1 key mass source-value 26.98154 + kim property modify 1 key mass source-unit amu where the special keyword "key" is followed by a *key_name* ("species" or "mass" in the above) and one or more key-value pairs. These key-value pairs @@ -1036,7 +1126,7 @@ command line is reached. Thus, the above could equivalently be written as .. code-block:: LAMMPS - kim_property modify 1 key species source-value Al & + kim property modify 1 key species source-value Al & key mass source-value 26.98154 & key mass source-unit amu @@ -1044,19 +1134,18 @@ As an example of modifying multiple key-value pairs belonging to the map of a single property key, the following command modifies the map of the "cohesive-potential-energy" property key to contain the key "source-unit" which is assigned a value of "eV" and the key "digits" which is assigned a value of -5: +5, .. code-block:: LAMMPS - kim_property modify 2 key cohesive-potential-energy source-unit eV digits 5 + kim property modify 2 key cohesive-potential-energy source-unit eV digits 5 .. note:: - The relevant data types of the values in the map are handled - automatically based on the specification of the key in the - KIM Property Definition. 
In the example above, - this means that the value "eV" will automatically be interpreted as a string - while the value 5 will be interpreted as an integer. + The relevant data types of the values in the map are handled automatically + based on the specification of the key in the KIM Property Definition. In + the example above, this means that the value "eV" will automatically be + interpreted as a string while the value 5 will be interpreted as an integer. The values contained in maps can either be scalars, as in all of the examples above, or arrays depending on which is stipulated in the corresponding Property @@ -1067,7 +1156,7 @@ dimensionality of the array. .. note:: - All array indexing used by *kim_property modify* is one-based, i.e. the + All array indexing used by *kim property modify* is one-based, i.e. the indices are enumerated 1, 2, 3, ... .. note:: @@ -1088,20 +1177,20 @@ of the "species" property key, we can do so by issuing: .. code-block:: LAMMPS - kim_property modify 2 key species source-value 1 Al - kim_property modify 2 key species source-value 2 Al - kim_property modify 2 key species source-value 3 Al - kim_property modify 2 key species source-value 4 Al + kim property modify 2 key species source-value 1 Al + kim property modify 2 key species source-value 2 Al + kim property modify 2 key species source-value 3 Al + kim property modify 2 key species source-value 4 Al .. note:: No declaration of the number of elements in this array was given; - *kim_property modify* will automatically handle memory management to allow + *kim property modify* will automatically handle memory management to allow an arbitrary number of elements to be added to the array. .. note:: - In the event that *kim_property modify* is used to set the value of an + In the event that *kim property modify* is used to set the value of an array index without having set the values of all lesser indices, they will be assigned default values based on the data type associated with the key in the map: @@ -1124,8 +1213,8 @@ of the "species" property key, we can do so by issuing: .. code-block:: LAMMPS - kim_property create 2 cohesive-energy-relation-cubic-crystal - kim_property modify 2 key species source-value 4 Al + kim property create 2 cohesive-energy-relation-cubic-crystal + kim property modify 2 key species source-value 4 Al will result in the "source-value" key in the map for the property key "species" being assigned the array ["", "", "", "Al"]. @@ -1137,12 +1226,12 @@ included). Thus, the snippet above could equivalently be written: .. code-block:: LAMMPS - kim_property modify 2 key species source-value 1:4 Al Al Al Al + kim property modify 2 key species source-value 1:4 Al Al Al Al Calling this command with a non-positive index, e.g. -``kim_property modify 2 key species source-value 0 Al``, or an incorrect +``kim property modify 2 key species source-value 0 Al``, or an incorrect number of input arguments, e.g. -``kim_property modify 2 key species source-value 1:4 Al Al``, will result in an +``kim property modify 2 key species source-value 1:4 Al Al``, will result in an error. As an example of modifying multidimensional arrays, consider the "basis-atoms" @@ -1165,36 +1254,36 @@ each row at a time using colon notation: .. 
code-block:: LAMMPS - kim_property modify 2 key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 - kim_property modify 2 key basis-atom-coordinates source-value 2 1:3 0.5 0.5 0.0 - kim_property modify 2 key basis-atom-coordinates source-value 3 1:3 0.5 0.0 0.5 - kim_property modify 2 key basis-atom-coordinates source-value 4 1:3 0.0 0.5 0.5 + kim property modify 2 key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 + kim property modify 2 key basis-atom-coordinates source-value 2 1:3 0.5 0.5 0.0 + kim property modify 2 key basis-atom-coordinates source-value 3 1:3 0.5 0.0 0.5 + kim property modify 2 key basis-atom-coordinates source-value 4 1:3 0.0 0.5 0.5 Where the first index given refers to a row and the second index refers to a column. We could, instead, choose to set each column at a time like so: .. code-block:: LAMMPS - kim_property modify 2 key basis-atom-coordinates source-value 1:4 1 0.0 0.5 0.5 0.0 & + kim property modify 2 key basis-atom-coordinates source-value 1:4 1 0.0 0.5 0.5 0.0 & key basis-atom-coordinates source-value 1:4 2 0.0 0.5 0.0 0.5 & key basis-atom-coordinates source-value 1:4 3 0.0 0.0 0.5 0.5 .. note:: - Multiple calls of *kim_property modify* made for the same instance ID + Multiple calls of *kim property modify* made for the same instance ID can be combined into a single invocation, meaning the following are both valid: .. code-block:: LAMMPS - kim_property modify 2 key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 & + kim property modify 2 key basis-atom-coordinates source-value 1 1:3 0.0 0.0 0.0 & key basis-atom-coordinates source-value 2 1:3 0.5 0.5 0.0 & key basis-atom-coordinates source-value 3 1:3 0.5 0.0 0.5 & key basis-atom-coordinates source-value 4 1:3 0.0 0.5 0.5 .. code-block:: LAMMPS - kim_property modify 2 key short-name source-value 1 fcc & + kim property modify 2 key short-name source-value 1 fcc & key species source-value 1:4 Al Al Al Al & key a source-value 1:5 3.9149 4.0000 4.032 4.0817 4.1602 & source-unit angstrom & @@ -1211,46 +1300,46 @@ column. We could, instead, choose to set each column at a time like so: .. code-block:: LAMMPS - kim_property modify 2 key basis-atom-coordinates 1 1:3 0.0 0.0 0.0 + kim property modify 2 key basis-atom-coordinates 1 1:3 0.0 0.0 0.0 is valid but .. code-block:: LAMMPS - kim_property modify 2 key basis-atom-coordinates 1:2 1:3 0.0 0.0 0.0 0.0 0.0 0.0 + kim property modify 2 key basis-atom-coordinates 1:2 1:3 0.0 0.0 0.0 0.0 0.0 0.0 is not. .. note:: - After one sets a value in a map with the *kim_property modify* command, + After one sets a value in a map with the *kim property modify* command, additional calls will overwrite the previous value. **Remove** .. code-block:: LAMMPS - kim_property remove instance_id key key_name + kim property remove instance_id key key_name -The *kim_property remove* command can be used to remove a property key from a +The *kim property remove* command can be used to remove a property key from a property instance. For example, .. code-block:: LAMMPS - kim_property remove 2 key basis-atom-coordinates + kim property remove 2 key basis-atom-coordinates **Destroy** .. code-block:: LAMMPS - kim_property destroy instance_id + kim property destroy instance_id -The *kim_property destroy* command deletes a previously created property +The *kim property destroy* command deletes a previously created property instance ID. For example, .. code-block:: LAMMPS - kim_property destroy 2 + kim property destroy 2 .. note:: @@ -1259,22 +1348,22 @@ instance ID. 
For example, **Dump** -The *kim_property dump* command can be used to write the content of all +The *kim property dump* command can be used to write the content of all currently defined property instances to a file: .. code-block:: LAMMPS - kim_property dump file + kim property dump file For example, .. code-block:: LAMMPS - kim_property dump results.edn + kim property dump results.edn .. note:: - Issuing the *kim_property dump* command clears all existing property + Issuing the *kim property dump* command clears all existing property instances from memory. Citation of OpenKIM IMs @@ -1283,32 +1372,32 @@ Citation of OpenKIM IMs When publishing results obtained using OpenKIM IMs researchers are requested to cite the OpenKIM project :ref:`(Tadmor) `, KIM API :ref:`(Elliott) `, and the specific IM codes used in the simulations, -in addition to the relevant scientific references for the IM. -The citation format for an IM is displayed on its page on -`OpenKIM `_ along with the corresponding BibTex file, -and is automatically added to the LAMMPS citation reminder. +in addition to the relevant scientific references for the IM. The citation +format for an IM is displayed on its page on +`OpenKIM `_ along with the corresponding BibTex file, and +is automatically added to the LAMMPS citation reminder. -Citing the IM software (KIM infrastructure and specific PM or SM codes) -used in the simulation gives credit to the researchers who developed them -and enables open source efforts like OpenKIM to function. +Citing the IM software (KIM infrastructure and specific PM or SM codes) used in +the simulation gives credit to the researchers who developed them and enables +open source efforts like OpenKIM to function. Restrictions """""""""""" -The set of *kim_commands* is part of the KIM package. It is only enabled if -LAMMPS is built with that package. A requirement for the KIM package, -is the KIM API library that must be downloaded from the -`OpenKIM website `_ and installed before -LAMMPS is compiled. When installing LAMMPS from binary, the kim-api package -is a dependency that is automatically downloaded and installed. The *kim_query* -command requires the *libcurl* library to be installed. The *kim_property* +The *kim* command is part of the KIM package. It is only enabled if LAMMPS is +built with that package. A requirement for the KIM package, is the KIM API +library that must be downloaded from the +`OpenKIM website `_ and installed before LAMMPS is +compiled. When installing LAMMPS from binary, the kim-api package is a +dependency that is automatically downloaded and installed. The *kim query* +command requires the *libcurl* library to be installed. The *kim property* command requires *Python* 3.6 or later and the *kim-property* python package to -be installed. See the KIM section of the :doc:`Packages details ` -for details. +be installed. See the KIM section of the +:doc:`Packages details ` for details. -Furthermore, when using *kim_commands* to run KIM SMs, any packages required -by the native potential being used or other commands or fixes that it invokes -must be installed. +Furthermore, when using *kim* command to run KIM SMs, any packages required by +the native potential being used or other commands or fixes that it invokes must +be installed. 
Related commands """""""""""""""" From 4cd42093a02fcb55ac61c6b5149a744acdd09d0e Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Tue, 16 Feb 2021 16:47:12 -0600 Subject: [PATCH 084/116] fixed the correct models for extra tests and update the command interface --- doc/src/Build_extras.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 8f1154a167..60d5ad09af 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -258,18 +258,18 @@ To build with this package, the KIM library with API v2 must be downloaded and built on your system. It must include the KIM models that you want to use with LAMMPS. -If you would like to use the :doc:`kim_query ` +If you would like to use the :doc:`kim query ` command, you also need to have libcurl installed with the matching development headers and the curl-config tool. -If you would like to use the :doc:`kim_property ` +If you would like to use the :doc:`kim property ` command, you need to build LAMMPS with the PYTHON package installed and linked to Python 3.6 or later. See the :ref:`PYTHON package build info ` for more details on this. After successfully building LAMMPS with Python, you -also need to install the kim-property Python package, which can be easily done using -*pip* as ``pip install kim-property``, or from the *conda-forge* channel as -``conda install kim-property`` if LAMMPS is built in Conda. More detailed -information is available at: +also need to install the ``kim-property`` Python package, which can be easily +done using *pip* as ``pip install kim-property``, or from the *conda-forge* +channel as ``conda install kim-property`` if LAMMPS is built in Conda. More +detailed information is available at: `kim-property installation `_. In addition to installing the KIM API, it is also necessary to install the @@ -309,7 +309,7 @@ minutes to hours) to build. Of course you only need to do that once.) You can download and build the KIM library manually if you prefer; follow the instructions in ``lib/kim/README``. You can also do - this in one step from the lammps/src dir, using a command like + this in one step from the lammps/src directory, using a command like these, which simply invoke the ``lib/kim/Install.py`` script with the specified args. @@ -329,7 +329,7 @@ minutes to hours) to build. Of course you only need to do that once.) .. code-block:: make - LMP_INC = -DLMP_NO_SSL_CHECK + LMP_INC = -DLMP_NO_SSL_CHECK Debugging OpenKIM web queries in LAMMPS ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -377,10 +377,11 @@ Enabling the extra unit tests have some requirements, Conda. More detailed information is available at: `kim-property installation `_. * It is also necessary to install - ``EAM_Dynamo_Mendelev_2007_Zr__MO_848899341753_000``, and - ``EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005`` KIM models. + ``EAM_Dynamo_MendelevAckland_2007v3_Zr__MO_004835508849_000``, + ``EAM_Dynamo_ErcolessiAdams_1994_Al__MO_123629422045_005``, and + ``LennardJones612_UniversalShifted__MO_959249795837_003`` KIM models. See `Obtaining KIM Models `_ - to learn how to install a pre-build binary of the OpenKIM Repository of + to learn how to install a pre-built binary of the OpenKIM Repository of Models or see `Installing KIM Models `_ to learn how to install the specific KIM models. 
From 0e465516325de08ffc6cca430ca4c294e94b36aa Mon Sep 17 00:00:00 2001 From: Yaser Afshar Date: Tue, 16 Feb 2021 16:47:57 -0600 Subject: [PATCH 085/116] Remove the old command interface and update the kim command as the only one --- doc/src/Commands_all.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/doc/src/Commands_all.rst b/doc/src/Commands_all.rst index a38bd5f0db..132425948e 100644 --- a/doc/src/Commands_all.rst +++ b/doc/src/Commands_all.rst @@ -60,11 +60,7 @@ An alphabetic list of all general LAMMPS commands. * :doc:`include ` * :doc:`info ` * :doc:`jump ` - * :doc:`kim_init ` - * :doc:`kim_interactions ` - * :doc:`kim_param ` - * :doc:`kim_property ` - * :doc:`kim_query ` + * :doc:`kim ` * :doc:`kspace_modify ` * :doc:`kspace_style ` * :doc:`label