diff --git a/doc/src/Speed_kokkos.txt b/doc/src/Speed_kokkos.txt
index fd33491253..99d29864dc 100644
--- a/doc/src/Speed_kokkos.txt
+++ b/doc/src/Speed_kokkos.txt
@@ -46,16 +46,15 @@ software version 7.5 or later must be installed on your system. See
 the discussion for the "GPU package"_Speed_gpu.html for details of how
 to check and do this.
 
-NOTE: Kokkos with CUDA currently implicitly assumes that the MPI
-library is CUDA-aware and has support for GPU-direct. This is not
-always the case, especially when using pre-compiled MPI libraries
-provided by a Linux distribution. This is not a problem when using
-only a single GPU and a single MPI rank on a desktop. When running
-with multiple MPI ranks, you may see segmentation faults without
-GPU-direct support.  These can be avoided by adding the flags "-pk
-kokkos gpu/direct off"_Run_options.html to the LAMMPS command line or
-by using the command "package kokkos gpu/direct off"_package.html in
-the input file.
+NOTE: Kokkos with CUDA currently implicitly assumes that the MPI library 
+is CUDA-aware. This is not always the case, especially when using 
+pre-compiled MPI libraries provided by a Linux distribution. This is not 
+a problem when using only a single GPU with a single MPI rank. When 
+running with multiple MPI ranks, you may see segmentation faults without 
+CUDA-aware MPI support. These can be avoided by adding the flags "-pk 
+kokkos cuda/aware off"_Run_options.html to the LAMMPS command line or by 
+using the command "package kokkos cuda/aware off"_package.html in the 
+input file.
 
 [Building LAMMPS with the KOKKOS package:]
 
@@ -217,9 +216,8 @@ case, also packing/unpacking communication buffers on the host may give
 speedup (see the KOKKOS "package"_package.html command). Using CUDA MPS 
 is recommended in this scenario.
 
-Using a CUDA-aware MPI library with 
-support for GPU-direct is highly recommended. GPU-direct use can be 
-avoided by using "-pk kokkos gpu/direct no"_package.html. As above for 
+Using a CUDA-aware MPI library is highly recommended. CUDA-aware MPI use can be 
+avoided by using "-pk kokkos cuda/aware no"_package.html. As above for 
 multi-core CPUs (and no GPU), if N is the number of physical cores/node, 
 then the number of MPI tasks/node should not exceed N.
 
diff --git a/doc/src/package.txt b/doc/src/package.txt
index 6a6d17bcbc..edd409a842 100644
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@@ -64,7 +64,7 @@ args = arguments specific to the style :l
       {no_affinity} values = none
   {kokkos} args = keyword value ...
     zero or more keyword/value pairs may be appended
-    keywords = {neigh} or {neigh/qeq} or {neigh/thread} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {gpu/direct}
+    keywords = {neigh} or {neigh/qeq} or {neigh/thread} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {cuda/aware}
       {neigh} value = {full} or {half}
         full = full neighbor list
         half = half neighbor list built in thread-safe manner
@@ -87,9 +87,9 @@ args = arguments specific to the style :l
         no = perform communication pack/unpack in non-KOKKOS mode
         host = perform pack/unpack on host (e.g. with OpenMP threading)
         device = perform pack/unpack on device (e.g. on GPU)
-      {gpu/direct} = {off} or {on}
-        off = do not use GPU-direct
-        on = use GPU-direct (default)
+      {cuda/aware} = {off} or {on}
+        off = do not use CUDA-aware MPI
+        on = use CUDA-aware MPI (default)
   {omp} args = Nthreads keyword value ...
     Nthread = # of OpenMP threads to associate with each MPI process
     zero or more keyword/value pairs may be appended
@@ -520,19 +520,21 @@ pack/unpack communicated data. When running small systems on a GPU,
 performing the exchange pack/unpack on the host CPU can give speedup 
 since it reduces the number of CUDA kernel launches.
 
-The {gpu/direct} keyword chooses whether GPU-direct will be used. When 
+The {cuda/aware} keyword chooses whether CUDA-aware MPI will be used. When 
 this keyword is set to {on}, buffers in GPU memory are passed directly 
 through MPI send/receive calls. This reduces overhead of first copying 
-the data to the host CPU. However GPU-direct is not supported on all 
+the data to the host CPU. However CUDA-aware MPI is not supported on all 
 systems, which can lead to segmentation faults and would require using a 
-value of {off}. If LAMMPS can safely detect that GPU-direct is not 
+value of {off}. If LAMMPS can safely detect that CUDA-aware MPI is not 
 available (currently only possible with OpenMPI v2.0.0 or later), then 
-the {gpu/direct} keyword is automatically set to {off} by default. When 
-the {gpu/direct} keyword is set to {off} while any of the {comm} 
+the {cuda/aware} keyword is automatically set to {off} by default. When 
+the {cuda/aware} keyword is set to {off} while any of the {comm} 
 keywords are set to {device}, the value for these {comm} keywords will 
 be automatically changed to {host}. This setting has no effect if not 
-running on GPUs. GPU-direct is available for OpenMPI 1.8 (or later 
-versions), Mvapich2 1.9 (or later), and CrayMPI.
+running on GPUs. CUDA-aware MPI is available for OpenMPI 1.8 (or later 
+versions), Mvapich2 1.9 (or later) when the "MV2_USE_CUDA" environment
+variable is set to "1", CrayMPI, and IBM Spectrum MPI when the "-gpu"
+flag is used.
 
 :line
 
@@ -641,8 +643,8 @@ switch"_Run_options.html.
 
 For the KOKKOS package, the option defaults for GPUs are neigh = full, 
 neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default 
-value, comm = device, gpu/direct = on. When LAMMPS can safely detect 
-that GPU-direct is not available, the default value of gpu/direct 
+value, comm = device, cuda/aware = on. When LAMMPS can safely detect 
+that CUDA-aware MPI is not available, the default value of cuda/aware 
 becomes "off". For CPUs or Xeon Phis, the option defaults are neigh = 
 half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. The 
 option neigh/thread = on when there are 16K atoms or less on an MPI 
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 87986a9ca9..d0bd978ae7 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -418,7 +418,7 @@ void CommKokkos::forward_comm_pair_device(Pair *pair)
     if (sendproc[iswap] != me) {
       double* buf_send_pair;
       double* buf_recv_pair;
-      if (lmp->kokkos->gpu_direct_flag) {
+      if (lmp->kokkos->cuda_aware_flag) {
         buf_send_pair = k_buf_send_pair.view<DeviceType>().data();
         buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data();
       } else {
@@ -436,7 +436,7 @@ void CommKokkos::forward_comm_pair_device(Pair *pair)
         MPI_Send(buf_send_pair,n,MPI_DOUBLE,sendproc[iswap],0,world);
       if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
 
-      if (!lmp->kokkos->gpu_direct_flag) {
+      if (!lmp->kokkos->cuda_aware_flag) {
         k_buf_recv_pair.modify<LMPHostType>();
         k_buf_recv_pair.sync<DeviceType>();
       }
diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp
index 14b84a5733..348217634b 100644
--- a/src/KOKKOS/gridcomm_kokkos.cpp
+++ b/src/KOKKOS/gridcomm_kokkos.cpp
@@ -529,7 +529,7 @@ void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which)
     if (swap[m].sendproc != me) {
       FFT_SCALAR* buf1;
       FFT_SCALAR* buf2;
-      if (lmp->kokkos->gpu_direct_flag) {
+      if (lmp->kokkos->cuda_aware_flag) {
         buf1 = k_buf1.view<DeviceType>().data();
         buf2 = k_buf2.view<DeviceType>().data();
       } else {
@@ -545,7 +545,7 @@ void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which)
                swap[m].sendproc,0,gridcomm);
       MPI_Wait(&request,MPI_STATUS_IGNORE);
 
-      if (!lmp->kokkos->gpu_direct_flag) {
+      if (!lmp->kokkos->cuda_aware_flag) {
         k_buf2.modify<LMPHostType>();
         k_buf2.sync<DeviceType>();
       }
@@ -579,7 +579,7 @@ void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which)
     if (swap[m].recvproc != me) {
       FFT_SCALAR* buf1;
       FFT_SCALAR* buf2;
-      if (lmp->kokkos->gpu_direct_flag) {
+      if (lmp->kokkos->cuda_aware_flag) {
         buf1 = k_buf1.view<DeviceType>().data();
         buf2 = k_buf2.view<DeviceType>().data();
       } else {
@@ -595,7 +595,7 @@ void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which)
                swap[m].recvproc,0,gridcomm);
       MPI_Wait(&request,MPI_STATUS_IGNORE);
 
-      if (!lmp->kokkos->gpu_direct_flag) {
+      if (!lmp->kokkos->cuda_aware_flag) {
         k_buf2.modify<LMPHostType>();
         k_buf2.sync<DeviceType>();
       }
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 3fa84d98b2..18a4a3168f 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -28,32 +28,36 @@
 
 #ifdef KOKKOS_ENABLE_CUDA
 
-// for detecting GPU-direct support:
-// the function  int have_gpu_direct()
-// - returns -1 if GPU-direct support is unknown
-// - returns  0 if no GPU-direct support available
-// - returns  1 if GPU-direct support is available
+// for detecting CUDA-aware MPI support:
+// the variable int have_cuda_aware
+// - is  1 if CUDA-aware MPI support is available
+// - is  0 if CUDA-aware MPI support is unavailable
+// - is -1 if CUDA-aware MPI support is unknown
 
-#define GPU_DIRECT_UNKNOWN static int have_gpu_direct() {return -1;}
+#define CUDA_AWARE_UNKNOWN static int have_cuda_aware = -1;
 
-// OpenMPI supports detecting GPU-direct as of version 2.0.0
-#if OPEN_MPI
+// OpenMPI supports detecting CUDA-aware MPI as of version 2.0.0
 
+#if (OPEN_MPI)
 #if (OMPI_MAJOR_VERSION >= 2)
+
 #include <mpi-ext.h>
-#if defined(MPIX_CUDA_AWARE_SUPPORT)
-static int have_gpu_direct() { return MPIX_Query_cuda_support(); }
+
+#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
+static int have_cuda_aware = 1;
+#elif defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
+static int have_cuda_aware = 0;
 #else
-GPU_DIRECT_UNKNOWN
-#endif
+CUDA_AWARE_UNKNOWN
+#endif // defined(MPIX_CUDA_AWARE_SUPPORT)
 
 #else // old OpenMPI
-GPU_DIRECT_UNKNOWN
-#endif
+CUDA_AWARE_UNKNOWN
+#endif // (OMPI_MAJOR_VERSION >=2)
 
 #else // unknown MPI library
-GPU_DIRECT_UNKNOWN
-#endif
+CUDA_AWARE_UNKNOWN
+#endif // OPEN_MPI
 
 #endif // KOKKOS_ENABLE_CUDA
 
@@ -66,6 +70,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   kokkos_exists = 1;
   lmp->kokkos = this;
 
+  exchange_comm_changed = 0;
+  forward_comm_changed = 0;
+  reverse_comm_changed = 0;
+
   delete memory;
   memory = new MemoryKokkos(lmp);
   memoryKK = (MemoryKokkos*) memory;
@@ -145,29 +153,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 #ifdef KOKKOS_ENABLE_CUDA
   if (ngpus <= 0)
     error->all(FLERR,"Kokkos has been compiled for CUDA but no GPUs are requested");
-
-  // check and warn about GPU-direct availability when using multiple MPI tasks
-
-  int nmpi = 0;
-  MPI_Comm_size(world,&nmpi);
-  if ((nmpi > 1) && (me == 0)) {
-    if ( 1 == have_gpu_direct() ) {
-      ; // all good, nothing to warn about
-    } else if (-1 == have_gpu_direct() ) {
-      error->warning(FLERR,"Kokkos with CUDA assumes GPU-direct is available,"
-                     " but cannot determine if this is the case\n         try"
-                     " '-pk kokkos gpu/direct off' when getting segmentation faults");
-    } else if ( 0 == have_gpu_direct() ) {
-      error->warning(FLERR,"GPU-direct is NOT available, "
-                     "using '-pk kokkos gpu/direct off' by default");
-    } else {
-      ; // should never get here
-    }
-  }
 #endif
 
 #ifndef KOKKOS_ENABLE_SERIAL
-  if (nthreads == 1)
+  if (nthreads == 1 && me == 0)
     error->warning(FLERR,"When using a single thread, the Kokkos Serial backend "
                          "(i.e. Makefile.kokkos_mpi_only) gives better performance "
                          "than the OpenMP backend");
@@ -183,7 +172,11 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   // default settings for package kokkos command
 
   binsize = 0.0;
-  gpu_direct_flag = 1;
+#ifdef KOKKOS_ENABLE_CUDA
+  cuda_aware_flag = 1; 
+#else
+  cuda_aware_flag = 0;
+#endif
   neigh_thread = 0;
   neigh_thread_set = 0;
   neighflag_qeq_set = 0;
@@ -206,10 +199,66 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
     exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
   }
 
-#if KOKKOS_USE_CUDA
-  // only if we can safely detect, that GPU-direct is not available, change default
-  if (0 == have_gpu_direct()) gpu_direct_flag = 0;
+#ifdef KOKKOS_ENABLE_CUDA
+
+  // check and warn about CUDA-aware MPI availability when using multiple MPI tasks
+  // change default only if we can safely detect that CUDA-aware MPI is not available
+
+  int nmpi = 0;
+  MPI_Comm_size(world,&nmpi);
+  if (nmpi > 0) {
+
+    // Check for IBM Spectrum MPI
+
+    int len;
+    char mpi_version[MPI_MAX_LIBRARY_VERSION_STRING];
+    MPI_Get_library_version(mpi_version, &len);
+    if (strstr(&mpi_version[0], "Spectrum") != NULL) {
+      cuda_aware_flag = 0;
+      char* str;
+      if (str = getenv("OMPI_MCA_pml_pami_enable_cuda"))
+        if((strcmp(str,"1") == 0)) {
+          have_cuda_aware = 1;
+          cuda_aware_flag = 1;
+        }
+
+      if (!cuda_aware_flag)
+        if (me == 0)
+          error->warning(FLERR,"The Spectrum MPI '-gpu' flag is not set. Disabling CUDA-aware MPI");
+    }
+
+    if (cuda_aware_flag == 1 && have_cuda_aware == 0) {
+      if (me == 0)
+        error->warning(FLERR,"Turning off CUDA-aware MPI since it is not detected, "
+                       "use '-pk kokkos cuda/aware on' to override");
+      cuda_aware_flag = 0;
+    } else if (have_cuda_aware == -1) { // maybe we are dealing with MPICH, MVAPICH2 or some derivative?
+    // MVAPICH2
+#if (defined MPICH) && (defined MVAPICH2_VERSION)
+      char* str;
+      cuda_aware_flag = 0;
+      if (str = getenv("MV2_ENABLE_CUDA")
+        if ((strcmp(str,"1") == 0))
+          cuda_aware_flag = 1;
+
+      if (!cuda_aware_flag)
+        if (me == 0)
+          error->warning(FLERR,"MVAPICH2 'MV2_ENABLE_CUDA' environment variable is not set. Disabling CUDA-aware MPI");
+    // pure MPICH or some unsupported MPICH derivative
+#elif (defined MPICH) && !(defined MVAPICH2_VERSION)
+      if (me == 0)
+        error->warning(FLERR,"Detected MPICH. Disabling CUDA-aware MPI");
+      cuda_aware_flag = 0;
+#else
+  if (me == 0)
+    error->warning(FLERR,"Kokkos with CUDA assumes CUDA-aware MPI is available,"
+                   " but cannot determine if this is the case\n         try"
+                   " '-pk kokkos cuda/aware off' if getting segmentation faults");
+
 #endif
+    } // if (-1 == have_cuda_aware)
+  } // nmpi > 0
+#endif // KOKKOS_ENABLE_CUDA
 
 #ifdef KILL_KOKKOS_ON_SIGSEGV
   signal(SIGSEGV, my_signal_handler);
@@ -290,6 +339,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
         exchange_comm_classic = 0;
         exchange_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
+      exchange_comm_changed = 0;
       iarg += 2;
     } else if (strcmp(arg[iarg],"comm/forward") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
@@ -301,6 +351,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
         forward_comm_classic = 0;
         forward_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
+      forward_comm_changed = 0;
       iarg += 2;
     } else if (strcmp(arg[iarg],"comm/reverse") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
@@ -312,11 +363,12 @@ void KokkosLMP::accelerator(int narg, char **arg)
         reverse_comm_classic = 0;
         reverse_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
+      reverse_comm_changed = 0;
       iarg += 2;
-    } else if (strcmp(arg[iarg],"gpu/direct") == 0) {
+    } else if (strcmp(arg[iarg],"cuda/aware") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
-      if (strcmp(arg[iarg+1],"off") == 0) gpu_direct_flag = 0;
-      else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct_flag = 1;
+      if (strcmp(arg[iarg+1],"off") == 0) cuda_aware_flag = 0;
+      else if (strcmp(arg[iarg+1],"on") == 0) cuda_aware_flag = 1;
       else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"neigh/thread") == 0) {
@@ -329,15 +381,38 @@ void KokkosLMP::accelerator(int narg, char **arg)
     } else error->all(FLERR,"Illegal package kokkos command");
   }
 
-  // if "gpu/direct off" and "comm device", change to "comm host"
+  // if "cuda/aware off" and "comm device", change to "comm host"
 
-  if (!gpu_direct_flag) {
-   if (exchange_comm_classic == 0 && exchange_comm_on_host == 0)
-     exchange_comm_on_host = 1;
-   if (forward_comm_classic == 0 && forward_comm_on_host == 0)
-     forward_comm_on_host = 1;
-   if (reverse_comm_classic == 0 && reverse_comm_on_host == 0)
-     reverse_comm_on_host = 1;
+  if (!cuda_aware_flag) {
+    if (exchange_comm_classic == 0 && exchange_comm_on_host == 0) {
+      exchange_comm_on_host = 1;
+      exchange_comm_changed = 1;
+    }
+    if (forward_comm_classic == 0 && forward_comm_on_host == 0) {
+      forward_comm_on_host = 1;
+      forward_comm_changed = 1;
+    }
+    if (reverse_comm_classic == 0 && reverse_comm_on_host == 0) {
+      reverse_comm_on_host = 1;
+      reverse_comm_changed = 1;
+    }
+  }
+
+  // if "cuda/aware on" and comm flags were changed previously, change them back
+
+  if (cuda_aware_flag) {
+    if (exchange_comm_changed) {
+      exchange_comm_on_host = 0;
+      exchange_comm_changed = 0;
+    }
+    if (forward_comm_changed) {
+      forward_comm_on_host = 0;
+      forward_comm_changed = 0;
+    }
+    if (reverse_comm_changed) {
+      reverse_comm_on_host = 0;
+      reverse_comm_changed = 0;
+    }
   }
 
   // set newton flags
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 46044799c4..b9f1e66c68 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -32,10 +32,13 @@ class KokkosLMP : protected Pointers {
   int exchange_comm_on_host;
   int forward_comm_on_host;
   int reverse_comm_on_host;
+  int exchange_comm_changed;
+  int forward_comm_changed;
+  int reverse_comm_changed;
   int nthreads,ngpus;
   int numa;
   int auto_sync;
-  int gpu_direct_flag;
+  int cuda_aware_flag;
   int neigh_thread;
   int neigh_thread_set;
   int newtonflag;
diff --git a/src/MAKE/MACHINES/Makefile.lassen_kokkos b/src/MAKE/MACHINES/Makefile.lassen_kokkos
new file mode 100644
index 0000000000..23697bfea2
--- /dev/null
+++ b/src/MAKE/MACHINES/Makefile.lassen_kokkos
@@ -0,0 +1,125 @@
+# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc 7.3.1
+
+SHELL = /bin/sh
+
+# ---------------------------------------------------------------------
+# compiler/linker settings
+# specify flags and libraries needed for your compiler
+
+KOKKOS_ABSOLUTE_PATH = $(shell cd $(KOKKOS_PATH); pwd)
+export MPICH_CXX = $(KOKKOS_ABSOLUTE_PATH)/bin/nvcc_wrapper
+export OMPI_CXX = $(KOKKOS_ABSOLUTE_PATH)/bin/nvcc_wrapper
+CC =		$(KOKKOS_ABSOLUTE_PATH)/bin/nvcc_wrapper
+CCFLAGS =	-g -O3
+SHFLAGS =	-fPIC
+DEPFLAGS =	-M
+
+LINK =		$(KOKKOS_ABSOLUTE_PATH)/bin/nvcc_wrapper
+LINKFLAGS =	-g -O3
+LIB = 
+SIZE =		size
+
+ARCHIVE =	ar
+ARFLAGS =	-rc
+SHLIBFLAGS =	-shared
+KOKKOS_DEVICES = Cuda
+KOKKOS_ARCH = Power9,Volta70
+
+# ---------------------------------------------------------------------
+# LAMMPS-specific settings, all OPTIONAL
+# specify settings for LAMMPS features you will use
+# if you change any -D setting, do full re-compile after "make clean"
+
+# LAMMPS ifdef settings
+# see possible settings in Section 2.2 (step 4) of manual
+
+LMP_INC =	-DLAMMPS_GZIP
+
+# MPI library
+# see discussion in Section 2.2 (step 5) of manual
+# MPI wrapper compiler/linker can provide this info
+# can point to dummy MPI library in src/STUBS as in Makefile.serial
+# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
+# INC = path for mpi.h, MPI compiler settings
+# PATH = path for MPI library
+# LIB = name of MPI library
+
+MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/include
+MPI_PATH = 
+MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/lib -lmpi_ibm
+
+# FFT library
+# see discussion in Section 2.2 (step 6) of manaul
+# can be left blank to use provided KISS FFT library
+# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
+# PATH = path for FFT library
+# LIB = name of FFT library
+
+FFT_INC =    	
+FFT_PATH = 
+FFT_LIB =	
+
+# JPEG and/or PNG library
+# see discussion in Section 2.2 (step 7) of manual
+# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
+# INC = path(s) for jpeglib.h and/or png.h
+# PATH = path(s) for JPEG library and/or PNG library
+# LIB = name(s) of JPEG library and/or PNG library
+
+JPG_INC =       
+JPG_PATH = 	
+JPG_LIB =	
+
+# ---------------------------------------------------------------------
+# build rules and dependencies
+# do not edit this section
+
+include	Makefile.package.settings
+include	Makefile.package
+
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
+EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
+EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
+EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
+EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
+
+# Path to src files
+
+vpath %.cpp ..
+vpath %.h ..
+
+# Link target
+
+$(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
+	$(SIZE) $(EXE)
+
+# Library targets
+
+lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+
+shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+        $(OBJ) $(EXTRA_LIB) $(LIB)
+
+# Compilation rules
+
+%.o:%.cpp $(EXTRA_CPP_DEPENDS)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+%.d:%.cpp $(EXTRA_CPP_DEPENDS)
+	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
+
+%.o:%.cu $(EXTRA_CPP_DEPENDS)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+# Individual dependencies
+
+depend : fastdep.exe $(SRC)
+	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
+
+fastdep.exe: ../DEPEND/fastdep.c
+	cc -O -o $@ $<
+
+sinclude .depend
diff --git a/src/STUBS/mpi.c b/src/STUBS/mpi.c
index af7a489bb1..cd92750b9a 100644
--- a/src/STUBS/mpi.c
+++ b/src/STUBS/mpi.c
@@ -76,6 +76,23 @@ int MPI_Finalized(int *flag)
 
 /* ---------------------------------------------------------------------- */
 
+/* return "LAMMPS MPI STUBS" as name of the library */
+
+int MPI_Get_library_version(char *version, int *resultlen)
+{
+  const char string[] = "LAMMPS MPI STUBS";
+  int len;
+
+  if (!version || !resultlen) return MPI_ERR_ARG;
+
+  len = strlen(string);
+  memcpy(version,string,len+1);
+  *resultlen = len;
+  return MPI_SUCCESS;
+}
+
+/* ---------------------------------------------------------------------- */
+
 /* return "localhost" as name of the processor */
 
 int MPI_Get_processor_name(char *name, int *resultlen)
diff --git a/src/STUBS/mpi.h b/src/STUBS/mpi.h
index 1eca1ec527..063dc542be 100644
--- a/src/STUBS/mpi.h
+++ b/src/STUBS/mpi.h
@@ -63,6 +63,7 @@ extern "C" {
 #define MPI_IN_PLACE NULL
 
 #define MPI_MAX_PROCESSOR_NAME 128
+#define MPI_MAX_LIBRARY_VERSION_STRING 128
 
 typedef void MPI_User_function(void *invec, void *inoutvec,
                                int *len, MPI_Datatype *datatype);
@@ -79,6 +80,7 @@ typedef struct _MPI_Status MPI_Status;
 int MPI_Init(int *argc, char ***argv);
 int MPI_Initialized(int *flag);
 int MPI_Finalized(int *flag);
+int MPI_Get_library_version(char *version, int *resultlen);
 int MPI_Get_processor_name(char *name, int *resultlen);
 int MPI_Get_version(int *major, int *minor);