Merge remote-tracking branch 'upstream/master'

This commit is contained in:
Gareth Tribello
2018-10-26 22:01:05 +01:00
1856 changed files with 103330 additions and 24994 deletions

View File

@ -35,6 +35,8 @@ linalg set of BLAS and LAPACK routines needed by USER-ATC package
from Axel Kohlmeyer (Temple U)
meam modified embedded atom method (MEAM) potential, MEAM package
from Greg Wagner (Sandia)
message client/server communication library via MPI, sockets, files
from Steve Plimpton (Sandia)
molfile hooks to VMD molfile plugins, used by the USER-MOLFILE package
from Axel Kohlmeyer (Temple U) and the VMD development team
mscg hooks to the MSCG library, used by fix_mscg command

View File

@ -23,15 +23,17 @@ optionally copies Makefile.auto to a new Makefile.osuffix
-m = use Makefile.machine as starting point, copy to Makefile.auto
default machine = linux
default for -h, -a, -p, -e settings are those in -m Makefile
-h = set CUDA_HOME variable in Makefile.auto to hdir
hdir = path to NVIDIA Cuda software, e.g. /usr/local/cuda
-a = set CUDA_ARCH variable in Makefile.auto to arch
use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
or GeForce GTX 580 or similar
use arch = 30 for Tesla K10 (Kepler)
use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar
use arch = 37 for Tesla dual K80 (Kepler)
use arch = 60 for Tesla P100 (Pascal)
use arch = sm_20 for Fermi (C2050/C2070, deprecated as of CUDA 8.0)
or GeForce GTX 580 or similar
use arch = sm_30 for Kepler (K10)
use arch = sm_35 for Kepler (K40) or GeForce GTX Titan or similar
use arch = sm_37 for Kepler (dual K80)
use arch = sm_60 for Pascal (P100)
use arch = sm_70 for Volta
-p = set CUDA_PRECISION variable in Makefile.auto to precision
use precision = double or mixed or single
-e = set EXTRAMAKE variable in Makefile.auto to Makefile.lammps.esuffix
@ -46,7 +48,7 @@ Examples:
make lib-gpu args="-b" # build GPU lib with default Makefile.linux
make lib-gpu args="-m xk7 -p single -o xk7.single" # create new Makefile.xk7.single, altered for single-precision
make lib-gpu args="-m mpi -a 35 -p single -o mpi.mixed -b" # create new Makefile.mpi.mixed, also build GPU lib with these settings
make lib-gpu args="-m mpi -a sm_35 -p single -o mpi.mixed -b" # create new Makefile.mpi.mixed, also build GPU lib with these settings
"""
# print error message or help
@ -127,7 +129,7 @@ for line in lines:
if hflag and words[0] == "CUDA_HOME" and words[1] == '=':
line = line.replace(words[2],hdir)
if aflag and words[0] == "CUDA_ARCH" and words[1] == '=':
line = line.replace(words[2],"-arch=sm_%s" % arch)
line = line.replace(words[2],"-arch=%s" % arch)
if pflag and words[0] == "CUDA_PRECISION" and words[1] == '=':
line = line.replace(words[2],precstr)
if eflag and words[0] == "EXTRAMAKE" and words[1] == '=':

View File

@ -13,8 +13,8 @@ endif
NVCC = nvcc
# Tesla CUDA
CUDA_ARCH = -arch=sm_21
# older CUDA
#CUDA_ARCH = -arch=sm_21
# newer CUDA
#CUDA_ARCH = -arch=sm_13
# older CUDA

View File

@ -79,7 +79,10 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_lj_cubic.o $(OBJ_DIR)/lal_lj_cubic_ext.o \
$(OBJ_DIR)/lal_ufm.o $(OBJ_DIR)/lal_ufm_ext.o \
$(OBJ_DIR)/lal_dipole_long_lj.o $(OBJ_DIR)/lal_dipole_long_lj_ext.o \
$(OBJ_DIR)/lal_lj_expand_coul_long.o $(OBJ_DIR)/lal_lj_expand_coul_long_ext.o
$(OBJ_DIR)/lal_lj_expand_coul_long.o $(OBJ_DIR)/lal_lj_expand_coul_long_ext.o \
$(OBJ_DIR)/lal_coul_long_cs.o $(OBJ_DIR)/lal_coul_long_cs_ext.o \
$(OBJ_DIR)/lal_born_coul_long_cs.o $(OBJ_DIR)/lal_born_coul_long_cs_ext.o \
$(OBJ_DIR)/lal_born_coul_wolf_cs.o $(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@ -137,7 +140,10 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/lj_cubic.cubin $(OBJ_DIR)/lj_cubic_cubin.h \
$(OBJ_DIR)/ufm.cubin $(OBJ_DIR)/ufm_cubin.h \
$(OBJ_DIR)/dipole_long_lj.cubin $(OBJ_DIR)/dipole_long_lj_cubin.h \
$(OBJ_DIR)/lj_expand_coul_long.cubin $(OBJ_DIR)/lj_expand_coul_long_cubin.h
$(OBJ_DIR)/lj_expand_coul_long.cubin $(OBJ_DIR)/lj_expand_coul_long_cubin.h \
$(OBJ_DIR)/coul_long_cs.cubin $(OBJ_DIR)/coul_long_cs_cubin.h \
$(OBJ_DIR)/born_coul_long_cs.cubin $(OBJ_DIR)/born_coul_long_cs_cubin.h \
$(OBJ_DIR)/born_coul_wolf_cs.cubin $(OBJ_DIR)/born_coul_wolf_cs_cubin.h
all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@ -837,6 +843,42 @@ $(OBJ_DIR)/lal_lj_expand_coul_long.o: $(ALL_H) lal_lj_expand_coul_long.h lal_lj_
$(OBJ_DIR)/lal_lj_expand_coul_long_ext.o: $(ALL_H) lal_lj_expand_coul_long.h lal_lj_expand_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_expand_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/coul_long_cs.cubin: lal_coul_long_cs.cu lal_precision.h lal_preprocessor.h
$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_coul_long_cs.cu
# Convert the fatbin to an embeddable C header; depends only on the cubin
# (prerequisite was accidentally listed twice — duplicate removed).
$(OBJ_DIR)/coul_long_cs_cubin.h: $(OBJ_DIR)/coul_long_cs.cubin
$(BIN2C) -c -n coul_long_cs $(OBJ_DIR)/coul_long_cs.cubin > $(OBJ_DIR)/coul_long_cs_cubin.h
$(OBJ_DIR)/lal_coul_long_cs.o: $(ALL_H) lal_coul_long_cs.h lal_coul_long_cs.cpp $(OBJ_DIR)/coul_long_cs_cubin.h $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_coul_long.o
$(CUDR) -o $@ -c lal_coul_long_cs.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_coul_long_cs_ext.o: $(ALL_H) lal_coul_long_cs.h lal_coul_long_cs_ext.cpp lal_coul_long.h
$(CUDR) -o $@ -c lal_coul_long_cs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_long_cs.cubin: lal_born_coul_long_cs.cu lal_precision.h lal_preprocessor.h
$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_born_coul_long_cs.cu
# Convert the fatbin to an embeddable C header; depends only on the cubin
# (prerequisite was accidentally listed twice — duplicate removed).
$(OBJ_DIR)/born_coul_long_cs_cubin.h: $(OBJ_DIR)/born_coul_long_cs.cubin
$(BIN2C) -c -n born_coul_long_cs $(OBJ_DIR)/born_coul_long_cs.cubin > $(OBJ_DIR)/born_coul_long_cs_cubin.h
$(OBJ_DIR)/lal_born_coul_long_cs.o: $(ALL_H) lal_born_coul_long_cs.h lal_born_coul_long_cs.cpp $(OBJ_DIR)/born_coul_long_cs_cubin.h $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_born_coul_long.o
$(CUDR) -o $@ -c lal_born_coul_long_cs.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_long_cs_ext.o: $(ALL_H) lal_born_coul_long_cs.h lal_born_coul_long_cs_ext.cpp lal_born_coul_long.h
$(CUDR) -o $@ -c lal_born_coul_long_cs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_wolf_cs.cubin: lal_born_coul_wolf_cs.cu lal_precision.h lal_preprocessor.h
$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_born_coul_wolf_cs.cu
# Convert the fatbin to an embeddable C header; depends only on the cubin
# (prerequisite was accidentally listed twice — duplicate removed).
$(OBJ_DIR)/born_coul_wolf_cs_cubin.h: $(OBJ_DIR)/born_coul_wolf_cs.cubin
$(BIN2C) -c -n born_coul_wolf_cs $(OBJ_DIR)/born_coul_wolf_cs.cubin > $(OBJ_DIR)/born_coul_wolf_cs_cubin.h
$(OBJ_DIR)/lal_born_coul_wolf_cs.o: $(ALL_H) lal_born_coul_wolf_cs.h lal_born_coul_wolf_cs.cpp $(OBJ_DIR)/born_coul_wolf_cs_cubin.h $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_born_coul_wolf.o
$(CUDR) -o $@ -c lal_born_coul_wolf_cs.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o: $(ALL_H) lal_born_coul_wolf_cs.h lal_born_coul_wolf_cs_ext.cpp lal_born_coul_wolf.h
$(CUDR) -o $@ -c lal_born_coul_wolf_cs_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

View File

@ -48,7 +48,18 @@ struct NVDProperties {
int minor;
CUDA_INT_TYPE totalGlobalMem;
int multiProcessorCount;
CUdevprop_st p;
int maxThreadsPerBlock;
int maxThreadsDim[3];
int maxGridSize[3];
int sharedMemPerBlock;
int totalConstantMemory;
int SIMDWidth;
int memPitch;
int regsPerBlock;
int clockRate;
int textureAlign;
int kernelExecTimeoutEnabled;
int integrated;
int canMapHostMemory;
@ -210,18 +221,18 @@ class UCL_Device {
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i)
{ return _properties[i].p.clockRate*1e-6;}
{ return _properties[i].clockRate*1e-6;}
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].p.maxThreadsPerBlock; }
{ return _properties[i].maxThreadsPerBlock; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return _properties[i].p.memPitch; }
inline size_t max_pitch(const int i) { return _properties[i].memPitch; }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
@ -260,6 +271,9 @@ class UCL_Device {
/// List all devices along with all properties
inline void print_all(std::ostream &out);
/// Select the platform that has accelerators (for compatibility with OpenCL)
inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
private:
int _device, _num_devices;
std::vector<NVDProperties> _properties;
@ -272,49 +286,54 @@ class UCL_Device {
UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuInit(0));
CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
for (int dev=0; dev<_num_devices; ++dev) {
CUdevice m;
CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
for (int i=0; i<_num_devices; ++i) {
CUdevice dev;
CU_SAFE_CALL_NS(cuDeviceGet(&dev,i));
int major, minor;
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
if (major==9999)
continue;
_properties.push_back(NVDProperties());
_properties.back().device_id=dev;
_properties.back().major=major;
_properties.back().minor=minor;
NVDProperties prop;
prop.device_id = i;
prop.major=major;
prop.minor=minor;
char namecstr[1024];
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
_properties.back().name=namecstr;
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,dev));
prop.name=namecstr;
CU_SAFE_CALL_NS(cuDeviceTotalMem(&prop.totalGlobalMem,dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.maxThreadsDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.maxThreadsDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.maxThreadsDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.maxGridSize[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.maxGridSize[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.maxGridSize[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.SIMDWidth, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
m));
CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
#if CUDA_VERSION >= 2020
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().kernelExecTimeoutEnabled,
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().integrated,
CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().canMapHostMemory,
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode,
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
#endif
#if CUDA_VERSION >= 3010
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().concurrentKernels,
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().ECCEnabled,
CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
#endif
_properties.push_back(prop);
}
_device=-1;
_cq.push_back(CUstream());
@ -390,27 +409,27 @@ void UCL_Device::print_all(std::ostream &out) {
<< cores(i) << std::endl;
#endif
out << " Total amount of constant memory: "
<< _properties[i].p.totalConstantMemory << " bytes\n";
<< _properties[i].totalConstantMemory << " bytes\n";
out << " Total amount of local/shared memory per block: "
<< _properties[i].p.sharedMemPerBlock << " bytes\n";
<< _properties[i].sharedMemPerBlock << " bytes\n";
out << " Total number of registers available per block: "
<< _properties[i].p.regsPerBlock << std::endl;
<< _properties[i].regsPerBlock << std::endl;
out << " Warp size: "
<< _properties[i].p.SIMDWidth << std::endl;
<< _properties[i].SIMDWidth << std::endl;
out << " Maximum number of threads per block: "
<< _properties[i].p.maxThreadsPerBlock << std::endl;
<< _properties[i].maxThreadsPerBlock << std::endl;
out << " Maximum group size (# of threads per block) "
<< _properties[i].p.maxThreadsDim[0] << " x "
<< _properties[i].p.maxThreadsDim[1] << " x "
<< _properties[i].p.maxThreadsDim[2] << std::endl;
<< _properties[i].maxThreadsDim[0] << " x "
<< _properties[i].maxThreadsDim[1] << " x "
<< _properties[i].maxThreadsDim[2] << std::endl;
out << " Maximum item sizes (# threads for each dim) "
<< _properties[i].p.maxGridSize[0] << " x "
<< _properties[i].p.maxGridSize[1] << " x "
<< _properties[i].p.maxGridSize[2] << std::endl;
<< _properties[i].maxGridSize[0] << " x "
<< _properties[i].maxGridSize[1] << " x "
<< _properties[i].maxGridSize[2] << std::endl;
out << " Maximum memory pitch: "
<< max_pitch(i) << " bytes\n";
out << " Texture alignment: "
<< _properties[i].p.textureAlign << " bytes\n";
<< _properties[i].textureAlign << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
#if CUDA_VERSION >= 2020

View File

@ -165,8 +165,8 @@ class UCL_Device {
/// Get the current OpenCL device name
inline std::string name() { return name(_device); }
/// Get the OpenCL device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
inline std::string name(const int i) {
return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
@ -281,7 +281,7 @@ class UCL_Device {
inline cl_device_id & cl_device() { return _cl_device; }
/// Select the platform that has accelerators
inline void set_platform_accelerator(int pid=-1);
inline int set_platform_accelerator(int pid=-1);
private:
int _num_platforms; // Number of platforms
@ -324,6 +324,7 @@ UCL_Device::~UCL_Device() {
void UCL_Device::clear() {
_properties.clear();
_cl_devices.clear();
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
@ -520,8 +521,6 @@ int UCL_Device::device_type(const int i) {
// Set the CUDA device to the specified device number
int UCL_Device::set(int num) {
clear();
cl_device_id *device_list = new cl_device_id[_num_devices];
cl_uint n;
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
@ -612,7 +611,7 @@ void UCL_Device::print_all(std::ostream &out) {
// Select the platform that is associated with accelerators
// if pid < 0, select the first platform
void UCL_Device::set_platform_accelerator(int pid) {
int UCL_Device::set_platform_accelerator(int pid) {
if (pid < 0) {
int found = 0;
for (int n=0; n<_num_platforms; n++) {
@ -625,10 +624,11 @@ void UCL_Device::set_platform_accelerator(int pid) {
break;
}
}
if (found) break;
if (found) return UCL_SUCCESS;
}
return UCL_ERROR;
} else {
set_platform(pid);
return set_platform(pid);
}
}

View File

@ -38,8 +38,8 @@ namespace ucl_opencl {
/// Class for timing OpenCL events
class UCL_Timer {
public:
inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
inline UCL_Timer() : _total_time(0.0f), _initialized(false), has_measured_time(false) { }
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false), has_measured_time(false)
{ init(dev); }
inline ~UCL_Timer() { clear(); }
@ -49,11 +49,10 @@ class UCL_Timer {
inline void clear() {
if (_initialized) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
_initialized=false;
_total_time=0.0;
}
has_measured_time = false;
}
/// Initialize default command queue for timing
@ -66,25 +65,39 @@ class UCL_Timer {
_cq=cq;
clRetainCommandQueue(_cq);
_initialized=true;
has_measured_time = false;
}
/// Start timing on default command queue
inline void start() { UCL_OCL_MARKER(_cq,&start_event); }
inline void start() {
UCL_OCL_MARKER(_cq,&start_event);
has_measured_time = false;
}
/// Stop timing on default command queue
inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); }
inline void stop() {
UCL_OCL_MARKER(_cq,&stop_event);
has_measured_time = true;
}
/// Block until the start event has been reached on device
inline void sync_start()
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
inline void sync_start() {
CL_SAFE_CALL(clWaitForEvents(1,&start_event));
has_measured_time = false;
}
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
inline void sync_stop() {
CL_SAFE_CALL(clWaitForEvents(1,&stop_event));
has_measured_time = true;
}
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
inline void zero() {
has_measured_time = false;
UCL_OCL_MARKER(_cq,&start_event);
UCL_OCL_MARKER(_cq,&stop_event);
}
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
@ -99,6 +112,7 @@ class UCL_Timer {
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
if(!has_measured_time) return 0.0;
cl_ulong tstart,tend;
CL_SAFE_CALL(clWaitForEvents(1,&stop_event));
CL_SAFE_CALL(clGetEventProfilingInfo(stop_event,
@ -107,6 +121,9 @@ class UCL_Timer {
CL_SAFE_CALL(clGetEventProfilingInfo(start_event,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &tstart, NULL));
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
has_measured_time = false;
return (tend-tstart)*t_factor;
}
@ -123,8 +140,9 @@ class UCL_Timer {
cl_event start_event, stop_event;
cl_command_queue _cq;
double _total_time;
bool _initialized;
double t_factor;
bool _initialized;
bool has_measured_time;
};
} // namespace

View File

@ -322,10 +322,12 @@ class Atom {
// Copy charges to device asynchronously
inline void add_q_data() {
time_q.start();
if (_q_avail==false) {
q.update_device(_nall,true);
_q_avail=true;
}
time_q.stop();
}
// Cast quaternions to write buffer
@ -347,10 +349,12 @@ class Atom {
// Copy quaternions to device
/** Copies nall()*4 elements **/
inline void add_quat_data() {
time_quat.start();
if (_quat_avail==false) {
quat.update_device(_nall*4,true);
_quat_avail=true;
}
time_quat.stop();
}
/// Cast velocities and tags to write buffer

View File

@ -34,8 +34,8 @@ using namespace LAMMPS_AL;
template <class numtyp, class acctyp>
DeviceT::Device() : _init_count(0), _device_init(false),
_gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0), _compiled(false) {
_gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0), _platform_id(-1), _compiled(false) {
}
template <class numtyp, class acctyp>
@ -67,6 +67,17 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
_particle_split=p_split;
_cell_size=cell_size;
_block_pair=block_pair;
// support selecting platform though "package device" keyword.
// "0:generic" will select platform 0 and tune for generic device
// "1:fermi" will select platform 1 and tune for Nvidia Fermi gpu
if (ocl_vendor) {
char *sep = NULL;
if ((sep = strstr(ocl_vendor,":"))) {
*sep = '\0';
_platform_id = atoi(ocl_vendor);
ocl_vendor = sep+1;
}
}
// Get the rank/size within the world
MPI_Comm_rank(_comm_world,&_world_me);
@ -119,8 +130,16 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
// Time on the device only if 1 proc per gpu
_time_device=true;
#if 0
// XXX: the following setting triggers a memory leak with OpenCL and MPI
// setting _time_device=true for all processes doesn't seem to be a
// problem with either (no segfault, no (large) memory leak.
// thus keeping this disabled for now. may need to review later.
// 2018-07-23 <akohlmey@gmail.com>
if (_procs_per_gpu>1)
_time_device=false;
#endif
// Set up a per device communicator
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
@ -135,6 +154,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
return -7;
#endif
if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS)
return -12;
if (gpu->set(my_gpu)!=UCL_SUCCESS)
return -6;
@ -191,13 +213,15 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
_ocl_vendor_string="-DUSE_OPENCL";
int token_count=0;
std::string params[13];
char *pch = strtok(ocl_vendor,"\" ");
char *pch = strtok(ocl_vendor,",");
pch = strtok(NULL,",");
if (pch == NULL) return -11;
while (pch != NULL) {
if (token_count==13)
return -11;
params[token_count]=pch;
token_count++;
pch = strtok(NULL,"\" ");
pch = strtok(NULL,",");
}
_ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
" -DTHREADS_PER_ATOM="+params[1]+
@ -656,7 +680,7 @@ int DeviceT::compile_kernels() {
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(device,compile_string().c_str());
if (success!=UCL_SUCCESS)
return -4;
return -6;
k_zero.set_function(*dev_program,"kernel_zero");
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;

View File

@ -292,7 +292,7 @@ class Device {
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size;
int _gpu_mode, _first_device, _last_device, _nthreads;
int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
double _particle_split;
double _cpu_full;
double _ptx_arch;

View File

@ -127,10 +127,10 @@ void Neighbor::alloc(bool &success) {
dev_packed.clear();
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
_packed_permissions)==UCL_SUCCESS);
dev_acc.clear();
success=success && (dev_acc.alloc(_max_atoms,*dev,
dev_ilist.clear();
success=success && (dev_ilist.alloc(_max_atoms,*dev,
UCL_READ_WRITE)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
_c_bytes+=dev_packed.row_bytes()+dev_ilist.row_bytes();
}
if (_max_host>0) {
nbor_host.clear();
@ -197,7 +197,7 @@ void Neighbor::clear() {
host_packed.clear();
host_acc.clear();
dev_acc.clear();
dev_ilist.clear();
dev_nbor.clear();
nbor_host.clear();
dev_packed.clear();
@ -281,7 +281,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
}
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
ucl_copy(acc_view,host_acc,inum*2,true);
UCL_H_Vec<int> host_view;
host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
@ -289,7 +289,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
int i=ilist[ii];
host_view[i] = ii;
}
ucl_copy(dev_acc,host_view,true);
ucl_copy(dev_ilist,host_view,true);
time_nbor.stop();
@ -364,7 +364,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
}
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
ucl_copy(acc_view,host_acc,inum*2,true);
time_nbor.stop();
if (_use_packing==false) {

View File

@ -110,7 +110,7 @@ class Neighbor {
}
if (_time_device) {
time_nbor.add_to_total();
time_kernel.add_to_total();
if (_use_packing==false) time_kernel.add_to_total();
if (_gpu_nbor==2) {
time_hybrid1.add_to_total();
time_hybrid2.add_to_total();
@ -200,7 +200,7 @@ class Neighbor {
/// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
UCL_H_Vec<int> host_acc;
/// Device storage for accessing atom indices from the neighbor list (3-body)
UCL_D_Vec<int> dev_acc;
UCL_D_Vec<int> dev_ilist;
// ----------------- Data for GPU Neighbor Calculation ---------------

View File

@ -119,6 +119,8 @@
#define BLOCK_ELLIPSE 128
#define MAX_SHARED_TYPES 11
#if (__CUDACC_VER_MAJOR__ < 9)
#ifdef _SINGLE_SINGLE
#define shfl_xor __shfl_xor
#else
@ -132,6 +134,25 @@ ucl_inline double shfl_xor(double var, int laneMask, int width) {
}
#endif
#else
#ifdef _SINGLE_SINGLE
ucl_inline double shfl_xor(double var, int laneMask, int width) {
return __shfl_xor_sync(0xffffffff, var, laneMask, width);
}
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,laneMask,width);
tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,laneMask,width);
return __hiloint2double(tmp.x,tmp.y);
}
#endif
#endif
#endif
#endif

View File

@ -243,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -252,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

View File

@ -544,7 +544,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -614,13 +614,13 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
@ -698,7 +698,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -768,13 +768,13 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];

View File

@ -272,7 +272,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
&eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
@ -311,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -320,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -696,7 +696,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -777,13 +777,13 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
@ -941,7 +941,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -1022,13 +1022,13 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];

View File

@ -272,7 +272,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
&eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
@ -311,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -320,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -272,7 +272,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
if (ii<inum) {
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
const __global int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -432,7 +432,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
if (ii<inum) {
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
const __global int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -547,7 +547,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
const __global int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -702,7 +702,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -740,7 +740,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
const __global int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -785,13 +785,13 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
@ -956,7 +956,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -994,7 +994,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
const __global int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -1039,13 +1039,13 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];

View File

@ -297,7 +297,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
&eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
@ -337,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -346,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -278,7 +278,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
if (ii<inum) {
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
const __global int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -445,7 +445,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
if (ii<inum) {
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
const __global int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -563,7 +563,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
const __global int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -714,7 +714,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -750,7 +750,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
const __global int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -795,13 +795,13 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
@ -959,7 +959,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -995,7 +995,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
const __global int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -1040,13 +1040,13 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];

View File

@ -278,7 +278,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
@ -286,7 +286,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->dev_short_nbor,
&this->nbor->dev_ilist, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -554,7 +554,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -623,13 +623,13 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
@ -709,7 +709,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_ilist,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -778,13 +778,13 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
else nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
nbor_k=dev_ilist[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];

View File

@ -158,7 +158,7 @@ if (NOT "${KOKKOS_INTERNAL_PATHS}" STREQUAL "")
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} ${KOKKOS_INTERNAL_PATHS})
endif()
if (NOT "${KOKKOS_INTERNAL_ADDTOPATH}" STREQUAL "")
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} "PATH=\"${KOKKOS_INTERNAL_ADDTOPATH}:$ENV{PATH}\"")
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} "PATH=${KOKKOS_INTERNAL_ADDTOPATH}:$ENV{PATH}")
endif()
if (CMAKE_CXX_STANDARD)

View File

@ -292,7 +292,8 @@ public:
#if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
if ( 0 == r ) {
Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
//Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
return handle_type();
}
#endif

View File

@ -40,7 +40,7 @@ version = '1.2.1'
checksums = { \
'1.1.0' : '533635721ee222d0ed2925a18fb5b294', \
'1.2.0' : '68bf0db879da5e068a71281020239ae7', \
'1.2.1' : 'bed76e7e76c545c36dd848a8f1fd35eb' \
'1.2.1' : '85ac414fdada2d04619c8f936344df14', \
}
# print error message or help

View File

@ -4,9 +4,9 @@
latte_SYSINC =
latte_SYSLIB = ../../lib/latte/filelink.o \
-llatte -lifcore -lsvml -lompstub -limf -lmkl_intel_lp64 \
-lmkl_intel_thread -lmkl_core -lmkl_intel_thread -lpthread \
-openmp -O0
-llatte -lifport -lifcore -lsvml -lompstub -limf \
-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core \
-lmkl_intel_thread -lpthread -openmp
latte_SYSPATH = -openmp -L${MKLROOT}/lib/intel64 -lmkl_lapack95_lp64 \
-L/opt/intel/composer_xe_2013_sp1.2.144/compiler/lib/intel64

118
lib/message/Install.py Normal file
View File

@ -0,0 +1,118 @@
#!/usr/bin/env python
# Install.py tool to build the CSlib library
# used to automate the steps described in the README file in this dir
from __future__ import print_function
import sys,os,re,subprocess
# help message
help = """
Syntax from src dir: make lib-message args="-m"
or: make lib-message args="-s -z"
Syntax from lib dir: python Install.py -m
or: python Install.py -s -z
specify zero or more options, order does not matter
-m = parallel build of CSlib library
-s = serial build of CSlib library
-z = build CSlib library with ZMQ socket support, default = no ZMQ support
Example:
make lib-message args="-m -z" # build parallel CSlib with ZMQ support
make lib-message args="-s" # build serial CSlib with no ZMQ support
"""
# print an error line (or the usage help) and terminate the script

def error(str=None):
  """Exit after printing "ERROR <str>", or the help text when no message is given."""
  if str:
    print("ERROR", str)
  else:
    print(help)
  sys.exit()
# expand a leading '~' or a relative path into an absolute path name

def fullpath(path):
  """Return the absolute form of path, with any '~' user prefix expanded."""
  expanded = os.path.expanduser(path)
  return os.path.abspath(expanded)
# locate an executable like the shell "which" command
# (helper not referenced elsewhere in this script; kept for parity with
#  the other lib/*/Install.py tools)

def which(program):
  """Return the full path of a runnable program, or None.

  A bare name is searched along every directory in $PATH; a name that
  already contains a directory component is checked directly.
  """
  def runnable(candidate):
    # "executable" == existing regular file with the execute bit set
    return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

  head, tail = os.path.split(program)
  if head:
    if runnable(program):
      return program
  else:
    for directory in os.environ["PATH"].split(os.pathsep):
      candidate = os.path.join(directory.strip('"'), program)
      if runnable(candidate):
        return candidate
  return None
# parse args

args = sys.argv[1:]
nargs = len(args)
if nargs == 0: error()

# build configuration selected by the command-line flags (see help above)
mpiflag = False
serialflag = False
zmqflag = False

iarg = 0
while iarg < nargs:
  if args[iarg] == "-m":
    mpiflag = True
    iarg += 1
  elif args[iarg] == "-s":
    serialflag = True
    iarg += 1
  elif args[iarg] == "-z":
    zmqflag = True
    iarg += 1
  else: error()

# exactly one of -m / -s must be chosen
if (not mpiflag and not serialflag):
  error("Must use either -m or -s flag")
if (mpiflag and serialflag):
  error("Cannot use -m and -s flag at the same time")

# build CSlib
# copy resulting lib to cslib/src/libmessage.a
# copy appropriate Makefile.lammps.* to Makefile.lammps

print("Building CSlib ...")
srcdir = fullpath("./cslib/src")

# the four branches cover every -m/-s x -z combination, so cmd is always
# bound; the cslib Makefile builds with ZMQ by default, zmq=no disables it
if mpiflag and zmqflag:
  cmd = "cd %s; make lib_parallel" % srcdir
elif mpiflag and not zmqflag:
  cmd = "cd %s; make lib_parallel zmq=no" % srcdir
elif not mpiflag and zmqflag:
  cmd = "cd %s; make lib_serial" % srcdir
elif not mpiflag and not zmqflag:
  cmd = "cd %s; make lib_serial zmq=no" % srcdir
print(cmd)
# check_output() raises CalledProcessError on a non-zero exit status,
# aborting the install; stderr is folded into the captured output
txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
print(txt.decode('UTF-8'))

# the parallel and serial builds produce differently named archives;
# either way LAMMPS links against the generic name libmessage.a
if mpiflag: cmd = "cd %s; cp libcsmpi.a libmessage.a" % srcdir
else: cmd = "cd %s; cp libcsnompi.a libmessage.a" % srcdir
print(cmd)
txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
print(txt.decode('UTF-8'))

# select the Makefile.lammps variant that matches the ZMQ choice
if zmqflag: cmd = "cp Makefile.lammps.zmq Makefile.lammps"
else: cmd = "cp Makefile.lammps.nozmq Makefile.lammps"
print(cmd)
txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
print(txt.decode('UTF-8'))

View File

@ -0,0 +1,5 @@
# Settings that the LAMMPS build will import when this package library is used
# no-ZMQ build: the CSlib needs no extra system libraries or paths

message_SYSINC =
message_SYSLIB =
message_SYSPATH =

View File

@ -0,0 +1,5 @@
# Settings that the LAMMPS build will import when this package library is used
# ZMQ build: link the ZeroMQ socket library into LAMMPS
# add -L<dir> to message_SYSPATH if libzmq is not in a default link path

message_SYSINC =
message_SYSLIB = -lzmq
message_SYSPATH =

51
lib/message/README Normal file
View File

@ -0,0 +1,51 @@
This directory contains the CSlib library which is required
to use the MESSAGE package and its client/server commands
in a LAMMPS input script.
The CSlib library is included in the LAMMPS distribution. A fuller
version including documentation and test programs is available at
http://cslib.sandia.gov. It was developed by Steve Plimpton at Sandia
National Laboratories.
You can type "make lib-message" from the src directory to see help on
how to build this library via make commands, or you can do the same
thing by typing "python Install.py" from within this directory, or you
can do it manually by following the instructions below.
The CSlib can be optionally built with support for sockets using
the open-source ZeroMQ (ZMQ) library. If it is not installed
on your system, it is easy to download and install.
Go to the ZMQ website for details: http://zeromq.org
-----------------
Instructions:
1. Compile CSlib from within cslib/src with one of the following:
% make lib_parallel # build parallel library with ZMQ socket support
% make lib_serial # build serial library with ZMQ support
% make lib_parallel zmq=no # build parallel lib with no ZMQ support
% make lib_serial zmq=no # build serial lib with no ZMQ support
2. Copy the produced cslib/src/libcsmpi.a or libcsnompi.a file to
cslib/src/libmessage.a
3. Copy either lib/message/Makefile.lammps.zmq or Makefile.lammps.nozmq
to lib/message/Makefile.lammps, depending on whether you
build the library with ZMQ support or not.
If your ZMQ library is not in a place your shell path finds,
you can set the INCLUDE and PATH variables in Makefile.lammps
to point to the dirs where the ZMQ include and library files are.
-----------------
When these steps are complete you can build LAMMPS
with the MESSAGE package installed:
% cd lammps/src
% make yes-message
% make mpi (or whatever target you wish)
Note that if you download and unpack a new LAMMPS tarball, you will
need to re-build the CSlib in this dir.

32
lib/message/cslib/LICENSE Normal file
View File

@ -0,0 +1,32 @@
Program: CSlib client/server coupling library
Copyright 2018 National Technology & Engineering Solutions of Sandia,
LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the
U.S. Government retains certain rights in this software. This
software is distributed under the modified Berkeley Software
Distribution (BSD) License.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Sandia Corporation nor the names of contributors
to this software may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23
lib/message/cslib/README Normal file
View File

@ -0,0 +1,23 @@
This is the Client/Server messaging library (CSlib).
Only the source directory and license file are included here as part
of the LAMMPS distribution. The full CSlib distribution, including
documentation and test codes, can be found at the website:
http://cslib.sandia.gov (as of Aug 2018).
The contact author is
Steve Plimpton
Sandia National Laboratories
sjplimp@sandia.gov
http://www.sandia.gov/~sjplimp
The CSlib is distributed as open-source code under the modified
Berkeley Software Distribution (BSD) License. See the accompanying
LICENSE file.
This directory contains the following:
README this file
LICENSE modified BSD license
src source files for library

View File

@ -0,0 +1,107 @@
# Makefile for CSlib = client/server messaging library
# type "make help" for options

SHELL = /bin/sh

# ----------------------------------------
# should only need to change this section
# compiler/linker settings
# ----------------------------------------

CC = g++
# NOTE: CCFLAGS must stay recursively expanded (=): ZMQ and MPI are
#       derived below from the zmq/mpi command-line options
CCFLAGS = -g -O3 -DZMQ_$(ZMQ) -DMPI_$(MPI)
SHFLAGS = -fPIC
ARCHIVE = ar
ARCHFLAGS = -rc
SHLIBFLAGS = -shared

# files

LIB = libcsmpi.a
SHLIB = libcsmpi.so
SRC = $(wildcard *.cpp)
INC = $(wildcard *.h)
OBJ = $(SRC:.cpp=.o)

# build with ZMQ support or not
# "make ... zmq=no" compiles against the dummy headers in STUBS_ZMQ

zmq = yes
ZMQ = $(shell echo $(zmq) | tr a-z A-Z)

ifeq ($(ZMQ),YES)
  ZMQLIB = -lzmq
else
  CCFLAGS += -I./STUBS_ZMQ
endif

# build with MPI support or not
# "make ... mpi=no" compiles against the dummy headers in STUBS_MPI
# and produces the *nompi* library names instead

mpi = yes
MPI = $(shell echo $(mpi) | tr a-z A-Z)

ifeq ($(MPI),YES)
  CC = mpicxx
else
  CCFLAGS += -I./STUBS_MPI
  LIB = libcsnompi.a
  SHLIB = libcsnompi.so
endif

# targets

# none of the command-style targets below names a real file; declare them
# phony so a stray file called e.g. "clean" or "lib" cannot mask its rule
.PHONY: shlib lib all help \
	shlib_parallel shlib_serial lib_parallel lib_serial \
	static shared clean clean-all tar

shlib: shlib_parallel shlib_serial

lib: lib_parallel lib_serial

all: shlib lib

help:
	@echo 'make                 default = shlib'
	@echo 'make shlib           build 2 shared CSlibs: parallel & serial'
	@echo 'make lib             build 2 static CSlibs: parallel & serial'
	@echo 'make all             build 4 CSlibs: shlib and lib'
	@echo 'make shlib_parallel  build shared parallel CSlib'
	@echo 'make shlib_serial    build shared serial CSlib'
	@echo 'make lib_parallel    build static parallel CSlib'
	@echo 'make lib_serial      build static serial CSlib'
	@echo 'make ... zmq=no      build w/out ZMQ support'
	@echo 'make clean           remove all *.o files'
	@echo 'make clean-all       remove *.o and lib files'
	@echo 'make tar             create a tarball, 2 levels up'

# each variant cleans first: all four flavors reuse the same *.o names,
# so stale objects from another flavor must never be linked in

shlib_parallel:
	$(MAKE) clean
	$(MAKE) shared zmq=$(zmq) mpi=yes

shlib_serial:
	$(MAKE) clean
	$(MAKE) shared zmq=$(zmq) mpi=no

lib_parallel:
	$(MAKE) clean
	$(MAKE) static zmq=$(zmq) mpi=yes

lib_serial:
	$(MAKE) clean
	$(MAKE) static zmq=$(zmq) mpi=no

static: $(OBJ)
	$(ARCHIVE) $(ARCHFLAGS) $(LIB) $(OBJ)

shared: $(OBJ)
	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) -o $(SHLIB) $(OBJ) $(ZMQLIB)

clean:
	@rm -f *.o *.pyc

clean-all:
	@rm -f *.o *.pyc lib*.a lib*.so

tar:
	cd ../..; tar cvf cslib.tar cslib/README cslib/LICENSE \
	  cslib/doc cslib/src cslib/test

# rules

%.o:%.cpp
	$(CC) $(CCFLAGS) $(SHFLAGS) -c $<

View File

@ -0,0 +1,96 @@
/* ----------------------------------------------------------------------
   CSlib - Client/server library for code coupling
   http://cslib.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright 2018 National Technology & Engineering Solutions of
   Sandia, LLC (NTESS).  Under the terms of Contract DE-NA0003525 with
   NTESS, the U.S. Government retains certain rights in this software.
   This software is distributed under the modified Berkeley Software
   Distribution (BSD) License.

   See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */

// MPI constants and dummy functions
// serial (mpi=no) stand-in for <mpi.h>: just enough of the MPI API for
// the CSlib sources to compile and run as a single process
// the collectives below degenerate to copying the caller's input to the
// output buffer; point-to-point calls (MPI_Send/MPI_Recv) are empty no-ops
// NOTE(review): code paths that truly exchange data between ranks must not
//   be reachable in a serial build -- confirm in the Msg* callers

#ifndef MPI_DUMMY_H
#define MPI_DUMMY_H

#include <stdlib.h>
#include <stdint.h>
#include <string.h>

namespace CSLIB_NS {

// opaque MPI handle types all collapse to plain ints in the serial build

typedef int MPI_Comm;
typedef int MPI_Fint;
typedef int MPI_Datatype;
typedef int MPI_Status;
typedef int MPI_Op;
typedef int MPI_Info;

#define MPI_COMM_WORLD 0
#define MPI_MAX_PORT_NAME 0
#define MPI_INFO_NULL 0

// datatype codes: the copy helpers below use these to pick an element width

#define MPI_INT 1
#define MPI_LONG_LONG 2
#define MPI_FLOAT 3
#define MPI_DOUBLE 4
#define MPI_CHAR 5

#define MPI_SUM 0

static void MPI_Init(int *, char ***) {}
static MPI_Comm MPI_Comm_f2c(MPI_Comm world) {return world;}
static void MPI_Comm_rank(MPI_Comm, int *) {}
static void MPI_Comm_size(MPI_Comm, int *) {}

static void MPI_Open_port(MPI_Info, char *) {}
static void MPI_Close_port(const char *) {}
static void MPI_Comm_accept(const char *, MPI_Info, int,
                            MPI_Comm, MPI_Comm *) {}
static void MPI_Comm_connect(const char *, MPI_Info, int,
                             MPI_Comm, MPI_Comm *) {}

static void MPI_Comm_split(MPI_Comm, int, int, MPI_Comm *) {}
static void MPI_Comm_free(MPI_Comm *) {}

static void MPI_Send(const void *, int, MPI_Datatype, int, int, MPI_Comm) {}
static void MPI_Recv(void *, int, MPI_Datatype, int, int,
                     MPI_Comm, MPI_Status *) {}

// one-process "reduction" is the identity: output = input
// only MPI_INT is handled; other datatypes leave *out untouched

static void MPI_Allreduce(const void *in, void *out, int, MPI_Datatype type,
                          MPI_Op op, MPI_Comm)
{
  if (type == MPI_INT) *((int *) out) = *((int *) in);
}

// inclusive scan over a single process = the process's own value
// only MPI_INT is handled, as above

static void MPI_Scan(const void *in, void *out, int, MPI_Datatype intype,
                     MPI_Op op,MPI_Comm)
{
  if (intype == MPI_INT) *((int *) out) = *((int *) in);
}

static void MPI_Bcast(void *, int, MPI_Datatype, int, MPI_Comm) {}

static void MPI_Allgather(const void *in, int incount, MPI_Datatype intype,
                          void *out, int, MPI_Datatype, MPI_Comm)
{
  // assuming incount = 1
  if (intype == MPI_INT) *((int *) out) = *((int *) in);
}

// gather from the only rank = a plain copy, sized by the datatype code

static void MPI_Allgatherv(const void *in, int incount, MPI_Datatype intype,
                           void *out, const int *, const int *,
                           MPI_Datatype, MPI_Comm)
{
  if (intype == MPI_INT) memcpy(out,in,incount*sizeof(int));
  else if (intype == MPI_LONG_LONG) memcpy(out,in,incount*sizeof(int64_t));
  else if (intype == MPI_FLOAT) memcpy(out,in,incount*sizeof(float));
  else if (intype == MPI_DOUBLE) memcpy(out,in,incount*sizeof(double));
  else if (intype == MPI_CHAR) memcpy(out,in,incount*sizeof(char));
}

// abort = immediate exit of the single process with status 1

static void MPI_Abort(MPI_Comm, int) {exit(1);}

static void MPI_Finalize() {}

}

#endif

View File

@ -0,0 +1,36 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
// ZMQ constants and dummy functions
#ifndef ZMQ_DUMMY_H
#define ZMQ_DUMMY_H
namespace CSLIB_NS {
#define ZMQ_REQ 0
#define ZMQ_REP 0
static void *zmq_ctx_new() {return NULL;}
static void *zmq_connect(void *, char *) {return NULL;}
static int zmq_bind(void *, char *) {return 0;}
static void *zmq_socket(void *,int) {return NULL;}
static void zmq_close(void *) {}
static void zmq_ctx_destroy(void *) {}
static void zmq_send(void *, void *, int, int) {}
static void zmq_recv(void *, void *, int, int) {}
};
#endif

View File

@ -0,0 +1,768 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#include <mpi.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include "cslib.h"
#include "msg_file.h"
#include "msg_zmq.h"
#include "msg_mpi_one.h"
#include "msg_mpi_two.h"
using namespace CSLIB_NS;
#define MAXTYPE 5 // # of defined field data types
/* ----------------------------------------------------------------------
   create a CSlib instance
   csflag = 0 for a client, 1 for a server
   mode = message transport: "file", "zmq", "mpi/one", or "mpi/two"
   ptr = transport-specific setup info, passed through to the Msg* class
   pcomm = pointer to the caller's MPI_Comm, or NULL for serial usage
------------------------------------------------------------------------- */

CSlib::CSlib(int csflag, const char *mode, const void *ptr, const void *pcomm)
{
  // cache the communicator as a 64-bit int so headers need no MPI types
  if (pcomm) myworld = (uint64_t) *((MPI_Comm *) pcomm);
  else myworld = 0;

#ifdef MPI_NO
  if (pcomm)
    error_all("constructor(): CSlib invoked with MPI_Comm "
              "but built w/out MPI support");
#endif
#ifdef MPI_YES    // NOTE: this could be OK to allow ??
                  // would allow a parallel app to invoke CSlib
                  //   in parallel and/or in serial
  if (!pcomm)
    error_all("constructor(): CSlib invoked w/out MPI_Comm "
              "but built with MPI support");
#endif

  client = server = 0;
  if (csflag == 0) client = 1;
  else if (csflag == 1) server = 1;
  else error_all("constructor(): Invalid client/server arg");

  // instantiate the requested transport
  // serial path: the mpi/one and mpi/two modes require a communicator

  if (pcomm == NULL) {
    me = 0;
    nprocs = 1;

    if (strcmp(mode,"file") == 0) msg = new MsgFile(csflag,ptr);
    else if (strcmp(mode,"zmq") == 0) msg = new MsgZMQ(csflag,ptr);
    else if (strcmp(mode,"mpi/one") == 0)
      error_all("constructor(): No mpi/one mode for serial lib usage");
    else if (strcmp(mode,"mpi/two") == 0)
      error_all("constructor(): No mpi/two mode for serial lib usage");
    else error_all("constructor(): Unknown mode");

  } else if (pcomm) {   // always true here; kept symmetric with branch above

    MPI_Comm world = (MPI_Comm) myworld;
    MPI_Comm_rank(world,&me);
    MPI_Comm_size(world,&nprocs);

    if (strcmp(mode,"file") == 0) msg = new MsgFile(csflag,ptr,world);
    else if (strcmp(mode,"zmq") == 0) msg = new MsgZMQ(csflag,ptr,world);
    else if (strcmp(mode,"mpi/one") == 0) msg = new MsgMPIOne(csflag,ptr,world);
    else if (strcmp(mode,"mpi/two") == 0) msg = new MsgMPITwo(csflag,ptr,world);
    else error_all("constructor(): Unknown mode");
  }

  // message buffers start empty/NULL; the max* counters appear to track
  // their allocated capacities, grown by later calls -- TODO confirm

  maxfield = 0;
  fieldID = fieldtype = fieldlen = fieldoffset = NULL;
  maxheader = 0;
  header = NULL;
  maxbuf = 0;
  buf = NULL;
  recvcounts = displs = NULL;
  maxglobal = 0;
  allids = NULL;
  maxfieldbytes = 0;
  fielddata = NULL;

  pad = "\0\0\0\0\0\0\0";    // just length 7 since will have trailing NULL

  nsend = nrecv = 0;
}
/* ----------------------------------------------------------------------
   free every buffer owned by this instance, then the transport object
------------------------------------------------------------------------- */

CSlib::~CSlib()
{
  deallocate_fields();
  sfree(header);
  sfree(buf);

  sfree(recvcounts);
  sfree(displs);
  sfree(allids);
  sfree(fielddata);

  // NOTE(review): several of these pointers may still be NULL (see the
  //   constructor), so sfree() is presumably NULL-safe -- confirm

  delete msg;
}
/* ----------------------------------------------------------------------
   start an outgoing message with ID msgID_caller and nfield_caller fields
   each field must then be supplied by one pack_*()/pack() call;
   the message goes out automatically once the final field is packed
------------------------------------------------------------------------- */

void CSlib::send(int msgID_caller, int nfield_caller)
{
  if (nfield_caller < 0) error_all("send(): Invalid nfield");

  nfield = nfield_caller;
  msgID = msgID_caller;
  allocate_fields();

  nbuf = 0;
  fieldcount = 0;

  // a zero-field message has no pending pack() calls, so ship it now
  if (fieldcount == nfield) send_message();
}
/* ---------------------------------------------------------------------- */
void CSlib::pack_int(int id, int value)
{
pack(id,1,1,&value);
}
/* ---------------------------------------------------------------------- */
void CSlib::pack_int64(int id, int64_t value)
{
pack(id,2,1,&value);
}
/* ---------------------------------------------------------------------- */
// pack a single float as one field (ftype 3) of the current message

void CSlib::pack_float(int id, float value)
{
  float copy = value;
  pack(id,3,1,&copy);
}
/* ---------------------------------------------------------------------- */
// pack a single double as one field (ftype 4) of the current message

void CSlib::pack_double(int id, double value)
{
  double copy = value;
  pack(id,4,1,&copy);
}
/* ---------------------------------------------------------------------- */
// pack a NULL-terminated string (ftype 5); field length includes the
// trailing NULL byte

void CSlib::pack_string(int id, char *value)
{
  int len = strlen(value) + 1;
  pack(id,5,len,value);
}
/* ---------------------------------------------------------------------- */
// pack one field of data into the current message
// id = unique field ID within this message
// ftype = data type: 1=int, 2=int64, 3=float, 4=double, 5=char string
// flen = # of datums (bytes for strings, including trailing NULL)
// data = the flen values, copied immediately into the message buffer

void CSlib::pack(int id, int ftype, int flen, void *data)
{
  if (find_field(id,fieldcount) >= 0)
    error_all("pack(): Reuse of field ID");
  if (ftype < 1 || ftype > MAXTYPE) error_all("pack(): Invalid ftype");
  if (flen < 0) error_all("pack(): Invalid flen");
  fieldID[fieldcount] = id;
  fieldtype[fieldcount] = ftype;
  fieldlen[fieldcount] = flen;
  int nbytes,nbytesround;
  // onefield() must run before the memcpy: it computes sizes AND grows buf
  onefield(ftype,flen,nbytes,nbytesround);
  memcpy(&buf[nbuf],data,nbytes);
  memcpy(&buf[nbuf+nbytes],pad,nbytesround-nbytes);  // pad to 8-byte boundary
  nbuf += nbytesround;
  fieldcount++;
  if (fieldcount == nfield) send_message();   // last field triggers the send
}
/* ---------------------------------------------------------------------- */
void CSlib::pack_parallel(int id, int ftype,
int nlocal, int *ids, int nper, void *data)
{
int i,j,k,m;
if (find_field(id,fieldcount) >= 0)
error_all("pack_parallel(): Reuse of field ID");
if (ftype < 1 || ftype > MAXTYPE) error_all("pack_parallel(): Invalid ftype");
if (nlocal < 0) error_all("pack_parallel(): Invalid nlocal");
if (nper < 1) error_all("pack_parallel(): Invalid nper");
MPI_Comm world = (MPI_Comm) myworld;
// NOTE: check for overflow of maxglobal and flen
int nglobal;
MPI_Allreduce(&nlocal,&nglobal,1,MPI_INT,MPI_SUM,world);
int flen = nper*nglobal;
fieldID[fieldcount] = id;
fieldtype[fieldcount] = ftype;
fieldlen[fieldcount] = flen;
// nlocal datums, each of nper length, from all procs
// final data in buf = datums for all natoms, ordered by ids
if (recvcounts == NULL) {
recvcounts = (int *) smalloc(nprocs*sizeof(int));
displs = (int *) smalloc(nprocs*sizeof(int));
}
MPI_Allgather(&nlocal,1,MPI_INT,recvcounts,1,MPI_INT,world);
displs[0] = 0;
for (int iproc = 1; iproc < nprocs; iproc++)
displs[iproc] = displs[iproc-1] + recvcounts[iproc-1];
if (ids && nglobal > maxglobal) {
sfree(allids);
maxglobal = nglobal;
// NOTE: maxglobal*sizeof(int) could overflow int
allids = (int *) smalloc(maxglobal*sizeof(int));
}
MPI_Allgatherv(ids,nlocal,MPI_INT,allids,
recvcounts,displs,MPI_INT,world);
int nlocalsize = nper*nlocal;
MPI_Allgather(&nlocalsize,1,MPI_INT,recvcounts,1,MPI_INT,world);
displs[0] = 0;
for (int iproc = 1; iproc < nprocs; iproc++)
displs[iproc] = displs[iproc-1] + recvcounts[iproc-1];
int nbytes,nbytesround;
onefield(ftype,flen,nbytes,nbytesround);
if (ftype == 1) {
int *alldata;
if (ids) {
if (nbytes > maxfieldbytes) {
sfree(fielddata);
maxfieldbytes = nbytes;
fielddata = (char *) smalloc(maxfieldbytes);
}
alldata = (int *) fielddata;
} else alldata = (int *) &buf[nbuf];
MPI_Allgatherv(data,nlocalsize,MPI_INT,alldata,
recvcounts,displs,MPI_INT,world);
if (ids) {
int *bufptr = (int *) &buf[nbuf];
m = 0;
for (i = 0; i < nglobal; i++) {
j = (allids[i]-1) * nper;
if (nper == 1) bufptr[j] = alldata[m++];
else
for (k = 0; k < nper; k++)
bufptr[j++] = alldata[m++];
}
}
} else if (ftype == 2) {
int64_t *alldata;
if (ids) {
if (nbytes > maxfieldbytes) {
sfree(fielddata);
maxfieldbytes = nbytes;
fielddata = (char *) smalloc(maxfieldbytes);
}
alldata = (int64_t *) fielddata;
} else alldata = (int64_t *) &buf[nbuf];
// NOTE: may be just MPI_LONG on some machines
MPI_Allgatherv(data,nlocalsize,MPI_LONG_LONG,alldata,
recvcounts,displs,MPI_LONG_LONG,world);
if (ids) {
int64_t *bufptr = (int64_t *) &buf[nbuf];
m = 0;
for (i = 0; i < nglobal; i++) {
j = (allids[i]-1) * nper;
if (nper == 1) bufptr[j] = alldata[m++];
else
for (k = 0; k < nper; k++)
bufptr[j++] = alldata[m++];
}
}
} else if (ftype == 3) {
float *alldata;
if (ids) {
if (nbytes > maxfieldbytes) {
sfree(fielddata);
maxfieldbytes = nbytes;
fielddata = (char *) smalloc(maxfieldbytes);
}
alldata = (float *) fielddata;
} else alldata = (float *) &buf[nbuf];
MPI_Allgatherv(data,nlocalsize,MPI_FLOAT,alldata,
recvcounts,displs,MPI_FLOAT,world);
if (ids) {
float *bufptr = (float *) &buf[nbuf];
m = 0;
for (i = 0; i < nglobal; i++) {
j = (allids[i]-1) * nper;
if (nper == 1) bufptr[j] = alldata[m++];
else
for (k = 0; k < nper; k++)
bufptr[j++] = alldata[m++];
}
}
} else if (ftype == 4) {
double *alldata;
if (ids) {
if (nbytes > maxfieldbytes) {
sfree(fielddata);
maxfieldbytes = nbytes;
fielddata = (char *) smalloc(maxfieldbytes);
}
alldata = (double *) fielddata;
} else alldata = (double *) &buf[nbuf];
MPI_Allgatherv(data,nlocalsize,MPI_DOUBLE,alldata,
recvcounts,displs,MPI_DOUBLE,world);
if (ids) {
double *bufptr = (double *) &buf[nbuf];
m = 0;
for (i = 0; i < nglobal; i++) {
j = (allids[i]-1) * nper;
if (nper == 1) bufptr[j] = alldata[m++];
else
for (k = 0; k < nper; k++)
bufptr[j++] = alldata[m++];
}
}
/* eventually ftype = BYTE, but not yet
} else if (ftype == 5) {
char *alldata;
if (ids) {
if (nbytes > maxfieldbytes) {
sfree(fielddata);
maxfieldbytes = nbytes;
fielddata = (char *) smalloc(maxfieldbytes);
}
alldata = (char *) fielddata;
} else alldata = (char *) &buf[nbuf];
MPI_Allgatherv(data,nlocalsize,MPI_CHAR,alldata,
recvcounts,displs,MPI_CHAR,world);
if (ids) {
char *bufptr = (char *) &buf[nbuf];
m = 0;
for (i = 0; i < nglobal; i++) {
j = (allids[i]-1) * nper;
memcpy(&bufptr[j],&alldata[m],nper);
m += nper;
}
}
*/
}
memcpy(&buf[nbuf+nbytes],pad,nbytesround-nbytes);
nbuf += nbytesround;
fieldcount++;
if (fieldcount == nfield) send_message();
}
/* ---------------------------------------------------------------------- */
// assemble the header (msgID, nfield, then one ID/type/len triplet per
// field) and hand header + data buffer to the messaging backend

void CSlib::send_message()
{
  header[0] = msgID;
  header[1] = nfield;
  int n = 2;
  for (int i = 0; i < nfield; i++) {
    header[n++] = fieldID[i];
    header[n++] = fieldtype[i];
    header[n++] = fieldlen[i];
  }

  msg->send(nheader,header,nbuf,buf);
  nsend++;
}
/* ---------------------------------------------------------------------- */
// receive one message (blocking)
// returns msgID; field metadata is returned via the reference args
// the returned arrays are owned by this class and remain valid until the
//   next recv() or destruction -- callers must not free them

int CSlib::recv(int &nfield_caller, int *&fieldID_caller,
                int *&fieldtype_caller, int *&fieldlen_caller)
{
  // NOTE(review): header/buf appear to be (re)grown by the Msg backend to
  // fit the incoming message -- confirm against the Msg::recv() signature
  msg->recv(maxheader,header,maxbuf,buf);
  nrecv++;

  // unpack header message

  int m = 0;
  msgID = header[m++];
  nfield = header[m++];
  allocate_fields();

  int nbytes,nbytesround;

  // record the byte offset of each field in buf; fields are 8-byte aligned

  nbuf = 0;
  for (int ifield = 0; ifield < nfield; ifield++) {
    fieldID[ifield] = header[m++];
    fieldtype[ifield] = header[m++];
    fieldlen[ifield] = header[m++];
    fieldoffset[ifield] = nbuf;
    onefield(fieldtype[ifield],fieldlen[ifield],nbytes,nbytesround);
    nbuf += nbytesround;
  }

  // return message parameters

  nfield_caller = nfield;
  fieldID_caller = fieldID;
  fieldtype_caller = fieldtype;
  fieldlen_caller = fieldlen;

  return msgID;
}
/* ---------------------------------------------------------------------- */
int CSlib::unpack_int(int id)
{
int ifield = find_field(id,nfield);
if (ifield < 0) error_all("unpack_int(): Unknown field ID");
if (fieldtype[ifield] != 1) error_all("unpack_int(): Mis-match of ftype");
if (fieldlen[ifield] != 1) error_all("unpack_int(): Flen is not 1");
int *ptr = (int *) unpack(id);
return *ptr;
}
/* ---------------------------------------------------------------------- */
int64_t CSlib::unpack_int64(int id)
{
int ifield = find_field(id,nfield);
if (ifield < 0) error_all("unpack_int64(): Unknown field ID");
if (fieldtype[ifield] != 2) error_all("unpack_int64(): Mis-match of ftype");
if (fieldlen[ifield] != 1) error_all("unpack_int64(): Flen is not 1");
int64_t *ptr = (int64_t *) unpack(id);
return *ptr;
}
/* ---------------------------------------------------------------------- */
float CSlib::unpack_float(int id)
{
int ifield = find_field(id,nfield);
if (ifield < 0) error_all("unpack_float(): Unknown field ID");
if (fieldtype[ifield] != 3) error_all("unpack_float(): Mis-match of ftype");
if (fieldlen[ifield] != 1) error_all("unpack_float(): Flen is not 1");
float *ptr = (float *) unpack(id);
return *ptr;
}
/* ---------------------------------------------------------------------- */
double CSlib::unpack_double(int id)
{
int ifield = find_field(id,nfield);
if (ifield < 0) error_all("unpack_double(): Unknown field ID");
if (fieldtype[ifield] != 4) error_all("unpack_double(): Mis-match of ftype");
if (fieldlen[ifield] != 1) error_all("unpack_double(): Flen is not 1");
double *ptr = (double *) unpack(id);
return *ptr;
}
/* ---------------------------------------------------------------------- */
char *CSlib::unpack_string(int id)
{
int ifield = find_field(id,nfield);
if (ifield < 0) error_all("unpack_string(): Unknown field ID");
if (fieldtype[ifield] != 5) error_all("unpack_string(): Mis-match of ftype");
char *ptr = (char *) unpack(id);
return ptr;
}
/* ---------------------------------------------------------------------- */
void *CSlib::unpack(int id)
{
int ifield = find_field(id,nfield);
if (ifield < 0) error_all("unpack(): Unknown field ID");
return &buf[fieldoffset[ifield]];
}
/* ---------------------------------------------------------------------- */
void CSlib::unpack(int id, void *data)
{
int ifield = find_field(id,nfield);
if (ifield < 0) error_all("unpack(): Unknown field ID");
int ftype = fieldtype[ifield];
int nbytes = fieldlen[ifield];
if (ftype == 1) nbytes *= sizeof(int);
else if (ftype == 2) nbytes *= sizeof(int64_t);
else if (ftype == 3) nbytes *= sizeof(float);
else if (ftype == 4) nbytes *= sizeof(double);
memcpy(data,&buf[fieldoffset[ifield]],nbytes);
}
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   unpack this proc's portion of a distributed per-datum field
   id = field ID, nlocal = # of datums this proc wants
   ids = global IDs (1 to Nglobal) of the desired datums, or NULL to take
         this proc's contiguous slice (offset computed via MPI_Scan)
   nper = # of values per datum
   data = caller-allocated space for nlocal*nper values (output)
   fix: the nper validation message previously read "pack_parallel():
   Invalid nper" (copy-paste error), misattributing the failure
------------------------------------------------------------------------- */

void CSlib::unpack_parallel(int id, int nlocal, int *ids, int nper, void *data)
{
  int i,j,k,m;

  int ifield = find_field(id,nfield);
  if (ifield < 0) error_all("unpack_parallel(): Unknown field ID");
  if (nlocal < 0) error_all("unpack_parallel(): Invalid nlocal");
  if (nper < 1) error_all("unpack_parallel(): Invalid nper");

  MPI_Comm world = (MPI_Comm) myworld;

  // without IDs, compute this proc's starting datum index via prefix sum

  int upto;
  if (!ids) {
    MPI_Scan(&nlocal,&upto,1,MPI_INT,MPI_SUM,world);
    upto -= nlocal;
  }

  if (fieldtype[ifield] == 1) {
    int *local = (int *) data;
    int *global = (int *) &buf[fieldoffset[ifield]];
    if (!ids) memcpy(local,&global[nper*upto],nper*nlocal*sizeof(int));
    else {
      m = 0;
      for (i = 0; i < nlocal; i++) {
        j = (ids[i]-1) * nper;
        if (nper == 1) local[m++] = global[j];
        else
          for (k = 0; k < nper; k++)
            local[m++] = global[j++];
      }
    }

  } else if (fieldtype[ifield] == 2) {
    int64_t *local = (int64_t *) data;
    int64_t *global = (int64_t *) &buf[fieldoffset[ifield]];
    if (!ids) memcpy(local,&global[nper*upto],nper*nlocal*sizeof(int64_t));
    else {
      m = 0;
      for (i = 0; i < nlocal; i++) {
        j = (ids[i]-1) * nper;
        if (nper == 1) local[m++] = global[j];
        else
          for (k = 0; k < nper; k++)
            local[m++] = global[j++];
      }
    }

  } else if (fieldtype[ifield] == 3) {
    float *local = (float *) data;
    float *global = (float *) &buf[fieldoffset[ifield]];
    if (!ids) memcpy(local,&global[nper*upto],nper*nlocal*sizeof(float));
    else {
      m = 0;
      for (i = 0; i < nlocal; i++) {
        j = (ids[i]-1) * nper;
        if (nper == 1) local[m++] = global[j];
        else
          for (k = 0; k < nper; k++)
            local[m++] = global[j++];
      }
    }

  } else if (fieldtype[ifield] == 4) {
    double *local = (double *) data;
    double *global = (double *) &buf[fieldoffset[ifield]];
    if (!ids) memcpy(local,&global[nper*upto],nper*nlocal*sizeof(double));
    else {
      m = 0;
      for (i = 0; i < nlocal; i++) {
        j = (ids[i]-1) * nper;
        if (nper == 1) local[m++] = global[j];
        else
          for (k = 0; k < nper; k++)
            local[m++] = global[j++];
      }
    }

  /* eventually ftype = BYTE, but not yet
  } else if (fieldtype[ifield] == 5) {
    char *local = (char *) data;
    char *global = (char *) &buf[fieldoffset[ifield]];
    if (!ids) memcpy(local,&global[nper*upto],nper*nlocal*sizeof(char));
    else {
      m = 0;
      for (i = 0; i < nlocal; i++) {
        j = (ids[i]-1) * nper;
        memcpy(&local[m],&global[j],nper);
        m += nper;
      }
    }
  */

  }
}
/* ---------------------------------------------------------------------- */
// return an internal counter: flag 1 = # of sends, flag 2 = # of receives

int CSlib::extract(int flag)
{
  switch (flag) {
  case 1: return nsend;
  case 2: return nrecv;
  }
  error_all("extract(): Invalid flag");
  return 0;
}
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   compute byte counts for one field and grow the message buffer to fit
   ftype,flen = field type and length
   nbytes (out) = exact field size in bytes
   nbytesround (out) = size rounded up to a multiple of 8 for alignment
   errors out if the total message would exceed a 32-bit size
   fix: an unrecognized ftype now triggers an explicit error; previously
   bigbytes was left uninitialized (UB) -- reachable via recv(), which
   passes wire-supplied field types here without validation
------------------------------------------------------------------------- */

void CSlib::onefield(int ftype, int flen, int &nbytes, int &nbytesround)
{
  int64_t bigbytes,bigbytesround;
  int64_t biglen = flen;

  if (ftype == 1) bigbytes = biglen * sizeof(int);
  else if (ftype == 2) bigbytes = biglen * sizeof(int64_t);
  else if (ftype == 3) bigbytes = biglen * sizeof(float);
  else if (ftype == 4) bigbytes = biglen * sizeof(double);
  else if (ftype == 5) bigbytes = biglen * sizeof(char);
  else {
    error_all("onefield(): Unknown ftype");
    bigbytes = 0;   // unreachable: error_all() aborts; silences warnings
  }
  bigbytesround = roundup(bigbytes,8);
  if (nbuf + bigbytesround > INT_MAX)
    error_all("pack(): Message size exceeds 32-bit integer limit");

  nbytes = (int) bigbytes;
  nbytesround = (int) bigbytesround;
  if (nbuf + nbytesround > maxbuf) {
    maxbuf = nbuf + nbytesround;
    buf = (char *) srealloc(buf,maxbuf);
  }
}
/* ---------------------------------------------------------------------- */
// return the index of the field with ID id among the first n fields,
// or -1 if no such field exists

int CSlib::find_field(int id, int n)
{
  for (int i = 0; i < n; i++)
    if (fieldID[i] == id) return i;
  return -1;
}
/* ---------------------------------------------------------------------- */
// size the per-field arrays and the header for nfield fields
// header layout = msgID + nfield + 3 ints (ID,type,len) per field
// arrays only grow, never shrink, so they are reused across messages

void CSlib::allocate_fields()
{
  // compute the header size in 64-bit first to detect 32-bit overflow

  int64_t bigbytes = (2 + 3*((int64_t) nfield)) * sizeof(int);
  if (bigbytes > INT_MAX)
    error_all("send(): Message header size exceeds 32-bit integer limit");

  nheader = 2;
  nheader += 3 * nfield;

  if (nfield > maxfield) {
    deallocate_fields();
    maxfield = nfield;
    fieldID = new int[maxfield];
    fieldtype = new int[maxfield];
    fieldlen = new int[maxfield];
    fieldoffset = new int[maxfield];
  }

  if (nheader > maxheader) {
    sfree(header);
    maxheader = nheader;
    header = (int *) smalloc(maxheader*sizeof(int));
  }
}
/* ---------------------------------------------------------------------- */
// free the per-field metadata arrays (delete [] of NULL is a no-op,
// so this is safe before the first allocate_fields())

void CSlib::deallocate_fields()
{
  delete [] fieldID;
  delete [] fieldtype;
  delete [] fieldlen;
  delete [] fieldoffset;
}
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   safe malloc: returns NULL for a zero-size request, aborts on failure
   fix: use bounded snprintf() instead of sprintf() when formatting the
   error message, so the 128-byte buffer can never be overrun
------------------------------------------------------------------------- */

void *CSlib::smalloc(int nbytes)
{
  if (nbytes == 0) return NULL;
  void *ptr = malloc(nbytes);
  if (ptr == NULL) {
    char str[128];
    snprintf(str,sizeof(str),"malloc(): Failed to allocate %d bytes",nbytes);
    error_one(str);
  }
  return ptr;
}
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   safe realloc: a zero-size request frees ptr and returns NULL,
   aborts on allocation failure
   fix: use bounded snprintf() instead of sprintf() when formatting the
   error message, so the 128-byte buffer can never be overrun
------------------------------------------------------------------------- */

void *CSlib::srealloc(void *ptr, int nbytes)
{
  if (nbytes == 0) {
    sfree(ptr);
    return NULL;
  }

  ptr = realloc(ptr,nbytes);
  if (ptr == NULL) {
    char str[128];
    snprintf(str,sizeof(str),"realloc(): Failed to reallocate %d bytes",
             nbytes);
    error_one(str);
  }
  return ptr;
}
/* ---------------------------------------------------------------------- */
// free() wrapper that tolerates a NULL pointer

void CSlib::sfree(void *ptr)
{
  if (!ptr) return;
  free(ptr);
}
/* ---------------------------------------------------------------------- */
// print an error message on rank 0 only, then abort all ranks
// used for errors that occur identically on every proc

void CSlib::error_all(const char *str)
{
  if (me == 0) printf("CSlib ERROR: %s\n",str);
  MPI_Comm world = (MPI_Comm) myworld;
  MPI_Abort(world,1);
}
/* ---------------------------------------------------------------------- */
// print an error message on this rank, then abort all ranks
// used for errors that may occur on a single proc (e.g. allocation failure)

void CSlib::error_one(const char *str)
{
  printf("CSlib ERROR: %s\n",str);
  MPI_Comm world = (MPI_Comm) myworld;
  MPI_Abort(world,1);
}
/* ----------------------------------------------------------------------
   round N up to multiple of nalign and return it
   assumes n >= 0 (truncating integer division would misbehave for n < 0;
   all callers pass non-negative byte counts)
   NOTE: see mapreduce/src/keyvalue.cpp for doing this as uint64_t
------------------------------------------------------------------------- */

int64_t CSlib::roundup(int64_t n, int nalign)
{
  if (n % nalign == 0) return n;
  n = (n/nalign + 1) * nalign;
  return n;
}

View File

@ -0,0 +1,87 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#ifndef CSLIB_H
#define CSLIB_H
#include <stdint.h>
namespace CSLIB_NS {
// CSlib: one client or server endpoint of a coupled-code message channel
// usage: send() declares msgID + field count, then one pack_*() per field
// (the last pack triggers transmission); recv() blocks for a message,
// then one unpack_*() per field retrieves the data

class CSlib {
 public:
  int nsend,nrecv;                // running counts of messages sent/received

  CSlib(int, const char *, const void *, const void *);
  ~CSlib();

  // sending side

  void send(int, int);
  void pack_int(int, int);
  void pack_int64(int, int64_t);
  void pack_float(int, float);
  void pack_double(int, double);
  void pack_string(int, char *);
  void pack(int, int, int, void *);
  void pack_parallel(int, int, int, int *, int, void *);

  // receiving side

  int recv(int &, int *&, int *&, int *&);
  int unpack_int(int);
  int64_t unpack_int64(int);
  float unpack_float(int);
  double unpack_double(int);
  char *unpack_string(int);
  void *unpack(int);
  void unpack(int, void *);
  void unpack_parallel(int, int, int *, int, void *);
  int extract(int);

 private:
  uint64_t myworld;          // really MPI_Comm, but avoids use of mpi.h
                             // in this file so apps can include this
                             // file w/ no MPI on system
  int me,nprocs;             // rank and size within myworld
  int client,server;         // role flags for this endpoint
  int nfield,maxfield;       // fields in current message / allocated capacity
  int msgID,fieldcount;      // current message ID, # of fields packed so far
  int nheader,maxheader;     // header length in ints / allocated capacity
  int nbuf,maxbuf;           // data buffer length in bytes / capacity
  int maxglobal,maxfieldbytes;   // capacities of allids and fielddata

  int *fieldID,*fieldtype,*fieldlen,*fieldoffset;  // per-field metadata
  int *header;               // assembled/received message header
  int *recvcounts,*displs;   // nprocs size for Allgathers
  int *allids;               // nglobal size for pack_parallel()
  char *buf;                 // maxbuf size for msg with all fields
  char *fielddata;           // maxfieldbytes size for one global field
  const char *pad;           // zero bytes used for 8-byte field alignment
  class Msg *msg;            // messaging backend (file, zmq, or MPI)

  void send_message();
  void onefield(int, int, int &, int &);
  int find_field(int, int);
  void allocate_fields();
  void deallocate_fields();
  int64_t roundup(int64_t, int);
  void *smalloc(int);
  void *srealloc(void *, int);
  void sfree(void *);
  void error_all(const char *);
  void error_one(const char *);
};
}
#endif

View File

@ -0,0 +1,362 @@
# ------------------------------------------------------------------------
# CSlib - Client/server library for code coupling
# http://cslib.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright 2018 National Technology & Engineering Solutions of
# Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
# NTESS, the U.S. Government retains certain rights in this software.
# This software is distributed under the modified Berkeley Software
# Distribution (BSD) License.
#
# See the README file in the top-level CSlib directory.
# -------------------------------------------------------------------------
# Python wrapper on CSlib library via ctypes
# ctypes and Numpy data types:
# 32-bit int = c_int = np.intc = np.int32
# 64-bit int = c_longlong = np.int64
# 32-bit floating point = c_float = np.float32
# 64-bit floating point = c_double = np.float = np.float64
import sys,traceback
from ctypes import *
# Numpy and mpi4py packages may not exist
try:
import numpy as np
numpyflag = 1
except:
numpyflag = 0
try:
from mpi4py import MPI
mpi4pyflag = 1
except:
mpi4pyflag = 0
# wrapper class
class CSlib:
    """Python wrapper on the CSlib client/server library via ctypes.

    Loads libcsmpi.so (MPI build) or libcsnompi.so (serial build) and
    mirrors the C interface in cslib_wrap.h.  Field data type codes used
    throughout: 1 = 32-bit int, 2 = 64-bit int, 3 = float, 4 = double
    (5 = string in the C library).

    NOTE(review): this module uses Python-2-only syntax (print statements,
    "raise Exc,msg") and will not run under Python 3 as-is.
    """

    # instantiate CSlib thru its C-interface

    def __init__(self,csflag,mode,ptr,comm):
        # csflag/mode/ptr are forwarded to the C cslib_open();
        # comm = mpi4py communicator, or None/0 for the serial library

        # load libcslib.so

        try:
            if comm: self.lib = CDLL("libcsmpi.so",RTLD_GLOBAL)
            else: self.lib = CDLL("libcsnompi.so",RTLD_GLOBAL)
        except:
            etype,value,tb = sys.exc_info()
            traceback.print_exception(etype,value,tb)
            raise OSError,"Could not load CSlib dynamic library"

        # define ctypes API for each library method

        self.lib.cslib_open.argtypes = [c_int,c_char_p,c_void_p,c_void_p,
                                        POINTER(c_void_p)]
        self.lib.cslib_open.restype = None

        self.lib.cslib_close.argtypes = [c_void_p]
        self.lib.cslib_close.restype = None

        self.lib.cslib_send.argtypes = [c_void_p,c_int,c_int]
        self.lib.cslib_send.restype = None

        self.lib.cslib_pack_int.argtypes = [c_void_p,c_int,c_int]
        self.lib.cslib_pack_int.restype = None

        self.lib.cslib_pack_int64.argtypes = [c_void_p,c_int,c_longlong]
        self.lib.cslib_pack_int64.restype = None

        self.lib.cslib_pack_float.argtypes = [c_void_p,c_int,c_float]
        self.lib.cslib_pack_float.restype = None

        self.lib.cslib_pack_double.argtypes = [c_void_p,c_int,c_double]
        self.lib.cslib_pack_double.restype = None

        self.lib.cslib_pack_string.argtypes = [c_void_p,c_int,c_char_p]
        self.lib.cslib_pack_string.restype = None

        self.lib.cslib_pack.argtypes = [c_void_p,c_int,c_int,c_int,c_void_p]
        self.lib.cslib_pack.restype = None

        self.lib.cslib_pack_parallel.argtypes = [c_void_p,c_int,c_int,c_int,
                                                 POINTER(c_int),c_int,c_void_p]
        self.lib.cslib_pack_parallel.restype = None

        self.lib.cslib_recv.argtypes = [c_void_p,POINTER(c_int),
                                        POINTER(POINTER(c_int)),
                                        POINTER(POINTER(c_int)),
                                        POINTER(POINTER(c_int))]
        self.lib.cslib_recv.restype = c_int

        self.lib.cslib_unpack_int.argtypes = [c_void_p,c_int]
        self.lib.cslib_unpack_int.restype = c_int

        self.lib.cslib_unpack_int64.argtypes = [c_void_p,c_int]
        self.lib.cslib_unpack_int64.restype = c_longlong

        self.lib.cslib_unpack_float.argtypes = [c_void_p,c_int]
        self.lib.cslib_unpack_float.restype = c_float

        self.lib.cslib_unpack_double.argtypes = [c_void_p,c_int]
        self.lib.cslib_unpack_double.restype = c_double

        self.lib.cslib_unpack_string.argtypes = [c_void_p,c_int]
        self.lib.cslib_unpack_string.restype = c_char_p

        # override return in unpack()
        self.lib.cslib_unpack.argtypes = [c_void_p,c_int]
        self.lib.cslib_unpack.restype = c_void_p

        self.lib.cslib_unpack_data.argtypes = [c_void_p,c_int,c_void_p]
        self.lib.cslib_unpack_data.restype = None

        # override last arg in unpack_parallel()
        self.lib.cslib_unpack_parallel.argtypes = [c_void_p,c_int,c_int,
                                                   POINTER(c_int),c_int,c_void_p]
        self.lib.cslib_unpack_parallel.restype = None

        self.lib.cslib_extract.argtypes = [c_void_p,c_int]
        self.lib.cslib_extract.restype = c_int

        # create an instance of CSlib with or w/out MPI communicator

        self.cs = c_void_p()

        if not comm:
            self.lib.cslib_open(csflag,mode,ptr,None,byref(self.cs))
        elif not mpi4pyflag:
            print "Cannot pass MPI communicator to CSlib w/out mpi4py package"
            sys.exit()
        else:
            address = MPI._addressof(comm)
            comm_ptr = c_void_p(address)
            if mode == "mpi/one":
                # mpi/one mode also needs the "both" communicator passed as ptr
                address = MPI._addressof(ptr)
                ptrcopy = c_void_p(address)
            else: ptrcopy = ptr
            self.lib.cslib_open(csflag,mode,ptrcopy,comm_ptr,byref(self.cs))

    # destroy instance of CSlib
    # NOTE(review): if close() was called first, self.lib is None and this
    #   raises (suppressed) inside __del__ -- confirm intended

    def __del__(self):
        if self.cs: self.lib.cslib_close(self.cs)

    def close(self):
        # explicit close; drops the library handle so no further calls work
        self.lib.cslib_close(self.cs)
        self.lib = None

    # send a message

    def send(self,msgID,nfield):
        # declare message ID and # of fields; pack_*() calls must follow
        self.nfield = nfield
        self.lib.cslib_send(self.cs,msgID,nfield)

    # pack one field of message

    def pack_int(self,id,value):
        self.lib.cslib_pack_int(self.cs,id,value)

    def pack_int64(self,id,value):
        self.lib.cslib_pack_int64(self.cs,id,value)

    def pack_float(self,id,value):
        self.lib.cslib_pack_float(self.cs,id,value)

    def pack_double(self,id,value):
        self.lib.cslib_pack_double(self.cs,id,value)

    def pack_string(self,id,value):
        self.lib.cslib_pack_string(self.cs,id,value)

    def pack(self,id,ftype,flen,data):
        # data may be list/tuple, Numpy array, or ctypes vector
        cdata = self.data_convert(ftype,flen,data)
        self.lib.cslib_pack(self.cs,id,ftype,flen,cdata)

    def pack_parallel(self,id,ftype,nlocal,ids,nper,data):
        # collective: ids = global IDs of this proc's nlocal datums
        cids = self.data_convert(1,nlocal,ids)
        cdata = self.data_convert(ftype,nper*nlocal,data)
        self.lib.cslib_pack_parallel(self.cs,id,ftype,nlocal,cids,nper,cdata)

    # convert input data to a ctypes vector to pass to CSlib

    def data_convert(self,ftype,flen,data):

        # tflag = type of data
        # tflag = 1 if data is list or tuple
        # tflag = 2 if data is Numpy array
        # tflag = 3 if data is ctypes vector
        # same usage of tflag as in unpack function

        txttype = str(type(data))
        if "numpy" in txttype: tflag = 2
        elif "c_" in txttype: tflag = 3
        else: tflag = 1

        # create ctypes vector out of data to pass to lib
        # cdata = ctypes vector to return
        # NOTE: error check on ftype and tflag everywhere, also flen

        if ftype == 1:
            if tflag == 1: cdata = (flen * c_int)(*data)
            elif tflag == 2: cdata = data.ctypes.data_as(POINTER(c_int))
            elif tflag == 3: cdata = data
        elif ftype == 2:
            if tflag == 1: cdata = (flen * c_longlong)(*data)
            elif tflag == 2: cdata = data.ctypes.data_as(POINTER(c_longlong))
            elif tflag == 3: cdata = data
        elif ftype == 3:
            if tflag == 1: cdata = (flen * c_float)(*data)
            elif tflag == 2: cdata = data.ctypes.data_as(POINTER(c_float))
            elif tflag == 3: cdata = data
        elif ftype == 4:
            if tflag == 1: cdata = (flen * c_double)(*data)
            elif tflag == 2: cdata = data.ctypes.data_as(POINTER(c_double))
            elif tflag == 3: cdata = data

        return cdata

    # receive a message

    def recv(self):
        # blocks until a message arrives; returns
        # (msgID, nfield, fieldID list, fieldtype list, fieldlen list)
        self.lib.cslib_recv.restype = c_int
        nfield = c_int()
        fieldID = POINTER(c_int)()
        fieldtype = POINTER(c_int)()
        fieldlen = POINTER(c_int)()
        msgID = self.lib.cslib_recv(self.cs,byref(nfield),
                                    byref(fieldID),byref(fieldtype),byref(fieldlen))

        # copy returned C args to native Python int and lists
        # store them in class so unpack() methods can access the info

        self.nfield = nfield = nfield.value
        self.fieldID = fieldID[:nfield]
        self.fieldtype = fieldtype[:nfield]
        self.fieldlen = fieldlen[:nfield]

        return msgID,self.nfield,self.fieldID,self.fieldtype,self.fieldlen

    # unpack one field of message
    # tflag = type of data to return
    # 3 = ctypes vector is default, since no conversion required

    def unpack_int(self,id):
        return self.lib.cslib_unpack_int(self.cs,id)

    def unpack_int64(self,id):
        return self.lib.cslib_unpack_int64(self.cs,id)

    def unpack_float(self,id):
        return self.lib.cslib_unpack_float(self.cs,id)

    def unpack_double(self,id):
        return self.lib.cslib_unpack_double(self.cs,id)

    def unpack_string(self,id):
        return self.lib.cslib_unpack_string(self.cs,id)

    def unpack(self,id,tflag=3):
        # returns the field's data as list (tflag=1), Numpy array (tflag=2),
        # or ctypes vector (tflag=3); requires a prior recv()
        index = self.fieldID.index(id)

        # reset data type of return so can morph by tflag
        # cannot do this for the generic c_void_p returned by CSlib

        if self.fieldtype[index] == 1:
            self.lib.cslib_unpack.restype = POINTER(c_int)
        elif self.fieldtype[index] == 2:
            self.lib.cslib_unpack.restype = POINTER(c_longlong)
        elif self.fieldtype[index] == 3:
            self.lib.cslib_unpack.restype = POINTER(c_float)
        elif self.fieldtype[index] == 4:
            self.lib.cslib_unpack.restype = POINTER(c_double)
        #elif self.fieldtype[index] == 5:
        #  self.lib.cslib_unpack.restype = POINTER(c_char)

        cdata = self.lib.cslib_unpack(self.cs,id)

        # tflag = user-requested type of data to return
        # tflag = 1 to return data as list
        # tflag = 2 to return data as Numpy array
        # tflag = 3 to return data as ctypes vector
        # same usage of tflag as in pack functions
        # tflag = 2,3 should NOT perform a data copy

        if tflag == 1:
            data = cdata[:self.fieldlen[index]]
        elif tflag == 2:
            if numpyflag == 0:
                print "Cannot return Numpy array w/out numpy package"
                sys.exit()
            data = np.ctypeslib.as_array(cdata,shape=(self.fieldlen[index],))
        elif tflag == 3:
            data = cdata

        return data

    # handle data array like pack() or unpack_parallel() ??
    # NOTE(review): stub -- computes the field index but returns nothing

    def unpack_data(self,id,tflag=3):
        index = self.fieldID.index(id)

    # unpack one field of message in parallel
    # tflag = type of data to return
    # 3 = ctypes vector is default, since no conversion required
    # NOTE: allow direct use of user array (e.g. Numpy), if user provides data arg?
    #       as opposed to creating this cdata
    #       does that make any performance difference ?
    #       e.g. should we allow CSlib to populate an existing Numpy array's memory

    def unpack_parallel(self,id,nlocal,ids,nper,tflag=3):
        # collective: ids = global IDs of the nlocal datums this proc wants
        cids = self.data_convert(1,nlocal,ids)

        # allocate memory for the returned data
        # pass cdata ptr to the memory to CSlib unpack_parallel()
        # this resets data type of last unpack_parallel() arg

        index = self.fieldID.index(id)
        if self.fieldtype[index] == 1: cdata = (nper*nlocal * c_int)()
        elif self.fieldtype[index] == 2: cdata = (nlocal*nper * c_longlong)()
        elif self.fieldtype[index] == 3: cdata = (nlocal*nper * c_float)()
        elif self.fieldtype[index] == 4: cdata = (nlocal*nper * c_double)()
        #elif self.fieldtype[index] == 5: cdata = (nlocal*nper * c_char)()

        self.lib.cslib_unpack_parallel(self.cs,id,nlocal,cids,nper,cdata)

        # tflag = user-requested type of data to return
        # tflag = 1 to return data as list
        # tflag = 2 to return data as Numpy array
        # tflag = 3 to return data as ctypes vector
        # same usage of tflag as in pack functions

        if tflag == 1:
            data = cdata[:nper*nlocal]
        elif tflag == 2:
            if numpyflag == 0:
                print "Cannot return Numpy array w/out numpy package"
                sys.exit()
            # NOTE: next line gives ctypes warning for fieldtype = 2 = 64-bit int
            #   not sure why, reported as bug between ctypes and Numpy here:
            #   https://stackoverflow.com/questions/4964101/pep-3118-
            #     warning-when-using-ctypes-array-as-numpy-array
            #   but why not same warning when just using unpack() ??
            #   in Python these lines give same warning:
            #   >>> import ctypes,numpy
            #   >>> a = (10 * ctypes.c_longlong)()
            #   >>> b = numpy.ctypeslib.as_array(a)
            data = np.ctypeslib.as_array(cdata,shape=(nlocal*nper,))
        elif tflag == 3:
            data = cdata

        return data

    # extract a library value: flag 1 = # of sends, flag 2 = # of receives

    def extract(self,flag):
        return self.lib.cslib_extract(self.cs,flag)

View File

@ -0,0 +1,239 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
// C style library interface to CSlib class
#include <mpi.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cslib_wrap.h"
#include "cslib.h"
using namespace CSLIB_NS;
// ----------------------------------------------------------------------
// create a CSlib instance and return it as an opaque handle in *csptr

void cslib_open(int csflag, const char *mode, const void *ptr,
                const void *pcomm, void **csptr)
{
  *csptr = (void *) new CSlib(csflag,mode,ptr,pcomm);
}
// ----------------------------------------------------------------------
// Fortran-callable open: convert the Fortran MPI communicator handle
// (MPI_Fint) to a C MPI_Comm before constructing the CSlib instance
// pcomm = NULL selects the serial (no-MPI) code path

void cslib_open_fortran(int csflag, const char *mode, const char *str,
                        const void *pcomm, void **csptr)
{
  MPI_Comm ccomm;
  void *pccomm = NULL;

  if (pcomm) {
    MPI_Fint *fcomm = (MPI_Fint *) pcomm;
    ccomm = MPI_Comm_f2c(*fcomm);
    pccomm = &ccomm;
  }

  CSlib *cs = new CSlib(csflag,mode,str,pccomm);
  *csptr = (void *) cs;
}
// ----------------------------------------------------------------------
// Fortran-callable open for mpi/one mode: converts both the "both"
// communicator (shared by client+server) and this side's communicator
// from Fortran handles to C MPI_Comm before constructing CSlib

void cslib_open_fortran_mpi_one(int csflag, const char *mode,
                                const void *pboth, const void *pcomm,
                                void **csptr)
{
  MPI_Comm ccomm,cboth;
  void *pccomm,*pcboth;

  MPI_Fint *fcomm = (MPI_Fint *) pcomm;
  ccomm = MPI_Comm_f2c(*fcomm);
  pccomm = &ccomm;

  MPI_Fint *fboth = (MPI_Fint *) pboth;
  cboth = MPI_Comm_f2c(*fboth);
  pcboth = &cboth;

  CSlib *cs = new CSlib(csflag,mode,pcboth,pccomm);
  *csptr = (void *) cs;
}
// ----------------------------------------------------------------------
// destroy a CSlib instance created by one of the cslib_open() variants

void cslib_close(void *ptr)
{
  delete (CSlib *) ptr;
}
// ----------------------------------------------------------------------
void cslib_send(void *ptr, int msgID, int nfield)
{
CSlib *cs = (CSlib *) ptr;
cs->send(msgID,nfield);
}
// ----------------------------------------------------------------------
void cslib_pack_int(void *ptr, int id, int value)
{
CSlib *cs = (CSlib *) ptr;
cs->pack_int(id,value);
}
// ----------------------------------------------------------------------
void cslib_pack_int64(void *ptr, int id, int64_t value)
{
CSlib *cs = (CSlib *) ptr;
cs->pack_int64(id,value);
}
// ----------------------------------------------------------------------
void cslib_pack_float(void *ptr, int id, float value)
{
CSlib *cs = (CSlib *) ptr;
cs->pack_float(id,value);
}
// ----------------------------------------------------------------------
void cslib_pack_double(void *ptr, int id, double value)
{
CSlib *cs = (CSlib *) ptr;
cs->pack_double(id,value);
}
// ----------------------------------------------------------------------
void cslib_pack_string(void *ptr, int id, char *value)
{
CSlib *cs = (CSlib *) ptr;
cs->pack_string(id,value);
}
// ----------------------------------------------------------------------
void cslib_pack(void *ptr, int id, int ftype, int flen, void *data)
{
CSlib *cs = (CSlib *) ptr;
cs->pack(id,ftype,flen,data);
}
// ----------------------------------------------------------------------
void cslib_pack_parallel(void *ptr, int id, int ftype,
int nlocal, int *ids, int nper, void *data)
{
CSlib *cs = (CSlib *) ptr;
cs->pack_parallel(id,ftype,nlocal,ids,nper,data);
}
// ----------------------------------------------------------------------
int cslib_recv(void *ptr, int *nfield_caller,
int **fieldID_caller, int **fieldtype_caller,
int **fieldlen_caller)
{
CSlib *cs = (CSlib *) ptr;
int nfield;
int *fieldID,*fieldtype,*fieldlen;
int msgID = cs->recv(nfield,fieldID,fieldtype,fieldlen);
*nfield_caller = nfield;
*fieldID_caller = fieldID;
*fieldtype_caller = fieldtype;
*fieldlen_caller = fieldlen;
return msgID;
}
// ----------------------------------------------------------------------
int cslib_unpack_int(void *ptr, int id)
{
CSlib *cs = (CSlib *) ptr;
return cs->unpack_int(id);
}
// ----------------------------------------------------------------------
int64_t cslib_unpack_int64(void *ptr, int id)
{
CSlib *cs = (CSlib *) ptr;
return cs->unpack_int64(id);
}
// ----------------------------------------------------------------------
float cslib_unpack_float(void *ptr, int id)
{
CSlib *cs = (CSlib *) ptr;
return cs->unpack_float(id);
}
// ----------------------------------------------------------------------
double cslib_unpack_double(void *ptr, int id)
{
CSlib *cs = (CSlib *) ptr;
return cs->unpack_double(id);
}
// ----------------------------------------------------------------------
char *cslib_unpack_string(void *ptr, int id)
{
CSlib *cs = (CSlib *) ptr;
return cs->unpack_string(id);
}
// ----------------------------------------------------------------------
void *cslib_unpack(void *ptr, int id)
{
CSlib *cs = (CSlib *) ptr;
return cs->unpack(id);
}
// ----------------------------------------------------------------------
void cslib_unpack_data(void *ptr, int id, void *data)
{
CSlib *cs = (CSlib *) ptr;
cs->unpack(id,data);
}
// ----------------------------------------------------------------------
void cslib_unpack_parallel(void *ptr, int id, int nlocal, int *ids,
int nper, void *data)
{
CSlib *cs = (CSlib *) ptr;
cs->unpack_parallel(id,nlocal,ids,nper,data);
}
// ----------------------------------------------------------------------
int cslib_extract(void *ptr, int flag)
{
CSlib *cs = (CSlib *) ptr;
return cs->extract(flag);
}

View File

@ -0,0 +1,147 @@
! ISO_C_binding wrapper on CSlib C interface
module cslib_wrap

! Fortran 2003 ISO_C_binding interface block mirroring the CSlib C API
! declared in cslib.h.  Scalars are passed by value; arrays/strings as
! assumed-size character/c_ptr arguments; opaque CSlib handles as c_ptr.

interface
  ! open a CSlib instance (mode string + mode-specific string argument)
  subroutine cslib_open_fortran(csflag,mode,str,pcomm,ptr) bind(c)
    use iso_c_binding
    integer(c_int), value :: csflag
    character(c_char) :: mode(*),str(*)
    type(c_ptr), value :: pcomm
    type(c_ptr) :: ptr
  end subroutine cslib_open_fortran

  ! open variant for mpi/one mode: pboth/pcomm are Fortran MPI handles
  subroutine cslib_open_fortran_mpi_one(csflag,mode,pboth,pcomm,ptr) bind(c)
    use iso_c_binding
    integer(c_int), value :: csflag
    character(c_char) :: mode(*)
    type(c_ptr), value :: pboth,pcomm
    type(c_ptr) :: ptr
  end subroutine cslib_open_fortran_mpi_one

  subroutine cslib_close(ptr) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
  end subroutine cslib_close

  subroutine cslib_send(ptr,msgID,nfield) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
    integer(c_int), value :: msgID,nfield
  end subroutine cslib_send

  ! pack_* family: add one field of the named type to the outgoing message
  subroutine cslib_pack_int(ptr,id,value) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
    integer(c_int), value :: value
  end subroutine cslib_pack_int

  subroutine cslib_pack_int64(ptr,id,value) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
    integer(c_int64_t), value :: value
  end subroutine cslib_pack_int64

  subroutine cslib_pack_float(ptr,id,value) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
    real(c_float), value :: value
  end subroutine cslib_pack_float

  subroutine cslib_pack_double(ptr,id,value) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
    real(c_double), value :: value
  end subroutine cslib_pack_double

  subroutine cslib_pack_string(ptr,id,value) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
    character(c_char) :: value(*)
  end subroutine cslib_pack_string

  subroutine cslib_pack(ptr,id,ftype,flen,data) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
    integer(c_int), value :: id,ftype,flen
    type(c_ptr), value :: data
  end subroutine cslib_pack

  subroutine cslib_pack_parallel(ptr,id,ftype,nlocal,ids,nper,data) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
    integer(c_int), value :: id,ftype,nlocal,nper
    type(c_ptr), value :: ids,data
  end subroutine cslib_pack_parallel

  ! receive a message; returns msgID, outputs field metadata pointers
  function cslib_recv(ptr,nfield,fieldID,fieldtype,fieldlen) bind(c)
    use iso_c_binding
    integer(c_int) :: cslib_recv
    type(c_ptr), value :: ptr
    integer(c_int) :: nfield
    type(c_ptr) :: fieldID,fieldtype,fieldlen
  end function cslib_recv

  ! unpack_* family: extract one field of the named type from the message
  function cslib_unpack_int(ptr,id) bind(c)
    use iso_c_binding
    integer(c_int) :: cslib_unpack_int
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
  end function cslib_unpack_int

  function cslib_unpack_int64(ptr,id) bind(c)
    use iso_c_binding
    integer(c_int64_t) :: cslib_unpack_int64
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
  end function cslib_unpack_int64

  function cslib_unpack_float(ptr,id) bind(c)
    use iso_c_binding
    real(c_float) :: cslib_unpack_float
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
  end function cslib_unpack_float

  function cslib_unpack_double(ptr,id) bind(c)
    use iso_c_binding
    real(c_double) :: cslib_unpack_double
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
  end function cslib_unpack_double

  function cslib_unpack_string(ptr,id) bind(c)
    use iso_c_binding
    type(c_ptr) :: cslib_unpack_string
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
  end function cslib_unpack_string

  function cslib_unpack(ptr,id) bind(c)
    use iso_c_binding
    type(c_ptr) :: cslib_unpack
    type(c_ptr), value :: ptr
    integer(c_int), value :: id
  end function cslib_unpack

  subroutine cslib_unpack_parallel(ptr,id,nlocal,ids,nper,data) bind(c)
    use iso_c_binding
    type(c_ptr), value :: ptr
    integer(c_int), value :: id,nlocal,nper
    type(c_ptr), value :: ids,data
  end subroutine cslib_unpack_parallel

  function cslib_extract(ptr,flag) bind(c)
    use iso_c_binding
    integer(c_int) :: cslib_extract
    type(c_ptr), value :: ptr
    integer(c_int), value :: flag
  end function cslib_extract
end interface

end module cslib_wrap

View File

@ -0,0 +1,54 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
/* C style library interface to CSlib class
   ifdefs allow this file to be included in a C program
*/

#ifdef __cplusplus
extern "C" {
#endif

/* create / destroy an opaque CSlib instance (returned via last arg) */
void cslib_open(int, const char *, const void *, const void *, void **);
void cslib_open_fortran(int, const char *, const char *, const void *, void **);
void cslib_open_fortran_mpi_one(int, const char *, const void *,
                                const void *, void **);
void cslib_close(void *);

/* message assembly and transmission */
void cslib_send(void *, int, int);
void cslib_pack_int(void *, int, int);
void cslib_pack_int64(void *, int, int64_t);
void cslib_pack_float(void *, int, float);
void cslib_pack_double(void *, int, double);
void cslib_pack_string(void *, int, char *);
void cslib_pack(void *, int, int, int, void *);
void cslib_pack_parallel(void *, int, int, int, int *, int, void *);

/* message reception and field extraction */
int cslib_recv(void *, int *, int **, int **, int **);
int cslib_unpack_int(void *, int);
int64_t cslib_unpack_int64(void *, int);
float cslib_unpack_float(void *, int);
double cslib_unpack_double(void *, int);
char *cslib_unpack_string(void *, int);
void *cslib_unpack(void *, int);
void cslib_unpack_data(void *, int, void *);
void cslib_unpack_parallel(void *, int, int, int *, int, void *);
int cslib_extract(void *, int);

#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,110 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#include <mpi.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "msg.h"
using namespace CSLIB_NS;
/* ---------------------------------------------------------------------- */
// parallel constructor: cache rank/size of the caller's communicator,
// then record whether this side is the client or the server

Msg::Msg(int csflag, const void *ptr, MPI_Comm cworld)
{
  world = cworld;
  MPI_Comm_rank(world,&me);
  MPI_Comm_size(world,&nprocs);
  init(csflag);
}

/* ---------------------------------------------------------------------- */

// serial constructor: no MPI; world is set to 0 (not a valid communicator)
// NOTE(review): error_all()/error_one() call MPI_Abort(world,...) -- confirm
// the error paths are never reached for serial-constructed Msg objects

Msg::Msg(int csflag, const void *ptr)
{
  world = 0;
  me = 0;
  nprocs = 1;
  init(csflag);
}

/* ---------------------------------------------------------------------- */

// csflag = 0 -> client, csflag = 1 -> server, anything else -> neither;
// also zero the send/recv counters

void Msg::init(int csflag)
{
  client = server = 0;
  if (csflag == 0) client = 1;
  else if (csflag == 1) server = 1;

  nsend = nrecv = 0;
}

/* ---------------------------------------------------------------------- */

// grow-only (re)allocation of the header and data buffers:
// each buffer is freed and re-malloc'ed only if the requested size
// exceeds its current capacity (maxheader/maxbuf track capacity)

void Msg::allocate(int nheader, int &maxheader, int *&header,
                   int nbuf, int &maxbuf, char *&buf)
{
  if (nheader > maxheader) {
    sfree(header);
    maxheader = nheader;
    header = (int *) smalloc(maxheader*sizeof(int));
  }

  if (nbuf > maxbuf) {
    sfree(buf);
    maxbuf = nbuf;
    buf = (char *) smalloc(maxbuf*sizeof(char));
  }
}
/* ---------------------------------------------------------------------- */
void *Msg::smalloc(int nbytes)
{
if (nbytes == 0) return NULL;
void *ptr = (void *) malloc(nbytes);
if (ptr == NULL) {
char str[128];
sprintf(str,"Failed to allocate %d bytes",nbytes);
}
return ptr;
}
/* ---------------------------------------------------------------------- */
// free wrapper tolerant of NULL (mirrors smalloc returning NULL for 0 bytes)

void Msg::sfree(void *ptr)
{
  if (ptr == NULL) return;
  free(ptr);
}

/* ---------------------------------------------------------------------- */

// collective error: only rank 0 prints, then all ranks abort

void Msg::error_all(const char *str)
{
  if (me == 0) printf("CSlib ERROR: %s\n",str);
  MPI_Abort(world,1);
}

/* ---------------------------------------------------------------------- */

// per-rank error: the calling rank prints, then aborts everyone

void Msg::error_one(const char *str)
{
  printf("CSlib ERROR: %s\n",str);
  MPI_Abort(world,1);
}

View File

@ -0,0 +1,52 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#ifndef MSG_H
#define MSG_H

#include <mpi.h>

namespace CSLIB_NS {

// Msg: abstract base class for one client/server message transport.
// Concrete transports (file, MPI one-world, MPI two-world, ZMQ) implement
// send() and recv() over a header array plus a raw byte buffer.

class Msg {
 public:
  int nsend,nrecv;         // zeroed by init(); usage is up to subclasses
  MPI_Comm world;          // communicator of this side (0 in serial mode)

  Msg(int, const void *, MPI_Comm);
  Msg(int, const void *);
  virtual ~Msg() {}
  virtual void send(int, int *, int, char *) = 0;
  virtual void recv(int &, int *&, int &, char *&) = 0;

 protected:
  int me,nprocs;           // rank and size within world
  int client,server;       // exactly one set to 1 by init() for csflag 0/1
  int nfield;
  int *fieldID,*fieldtype,*fieldlen;
  int lengths[2];          // scratch: [0]=header length, [1]=buffer length

  void init(int);
  void allocate(int, int &, int *&, int, int &, char *&);
  void *smalloc(int);
  void sfree(void *);
  void error_all(const char *);
  void error_one(const char *);
};

}

#endif

View File

@ -0,0 +1,143 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#include <mpi.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include "msg_file.h"
using namespace CSLIB_NS;
#define MAXLINE 256
#define SLEEP 0.1 // delay in CPU secs to check for message file
/* ---------------------------------------------------------------------- */
// parallel constructor: ptr is the root name for the exchange files

MsgFile::MsgFile(int csflag, const void *ptr, MPI_Comm cworld) :
  Msg(csflag, ptr, cworld)
{
  char *filename = (char *) ptr;
  init(filename);
}

/* ---------------------------------------------------------------------- */

// serial constructor

MsgFile::MsgFile(int csflag, const void *ptr) : Msg(csflag, ptr)
{
  char *filename = (char *) ptr;
  init(filename);
}

/* ---------------------------------------------------------------------- */

MsgFile::~MsgFile()
{
  delete [] fileroot;
}

/* ---------------------------------------------------------------------- */

// keep a private copy of the file root name; message files are named
// fileroot.client / fileroot.server (plus .signal variants) in send()/recv()

void MsgFile::init(char *filename)
{
  int n = strlen(filename) + 1;
  fileroot = new char[n];
  strcpy(fileroot,filename);
}
/* ---------------------------------------------------------------------- */
// write one message as a binary file (lengths, header, data), then create
// an empty "signal" file to tell the other side the message is complete.
// Only rank 0 performs file I/O.
// BUGFIX: the signal-file fopen() was unchecked, so a failed open led to
// fclose(NULL), which is undefined behavior.

void MsgFile::send(int nheader, int *header, int nbuf, char *buf)
{
  char filename[MAXLINE];

  lengths[0] = nheader;
  lengths[1] = nbuf;

  // write message file

  if (me == 0) {
    if (client) sprintf(filename,"%s.%s",fileroot,"client");
    else if (server) sprintf(filename,"%s.%s",fileroot,"server");

    fp = fopen(filename,"wb");
    if (!fp) error_one("send(): Could not open send message file");
    fwrite(lengths,sizeof(int),2,fp);
    fwrite(header,sizeof(int),nheader,fp);
    fwrite(buf,1,nbuf,fp);
    fclose(fp);
  }

  // create empty signal file

  if (me == 0) {
    if (client) sprintf(filename,"%s.%s",fileroot,"client.signal");
    else if (server) sprintf(filename,"%s.%s",fileroot,"server.signal");

    fp = fopen(filename,"w");
    if (!fp) error_one("send(): Could not open send signal file");
    fclose(fp);
  }
}
/* ---------------------------------------------------------------------- */
void MsgFile::recv(int &maxheader, int *&header, int &maxbuf, char *&buf)
{
char filename[MAXLINE];
// wait until signal file exists to open message file
if (me == 0) {
if (client) sprintf(filename,"%s.%s",fileroot,"server.signal");
else if (server) sprintf(filename,"%s.%s",fileroot,"client.signal");
int delay = (int) (1000000 * SLEEP);
while (1) {
fp = fopen(filename,"r");
if (fp) break;
usleep(delay);
}
fclose(fp);
if (client) sprintf(filename,"%s.%s",fileroot,"server");
else if (server) sprintf(filename,"%s.%s",fileroot,"client");
fp = fopen(filename,"rb");
if (!fp) error_one("recv(): Could not open recv message file");
}
// read and broadcast data
if (me == 0) fread(lengths,sizeof(int),2,fp);
if (nprocs > 1) MPI_Bcast(lengths,2,MPI_INT,0,world);
int nheader = lengths[0];
int nbuf = lengths[1];
allocate(nheader,maxheader,header,nbuf,maxbuf,buf);
if (me == 0) fread(header,sizeof(int),nheader,fp);
if (nprocs > 1) MPI_Bcast(header,nheader,MPI_INT,0,world);
if (me == 0) fread(buf,1,nbuf,fp);
if (nprocs > 1) MPI_Bcast(buf,nbuf,MPI_CHAR,0,world);
// delete both message and signal file
if (me == 0) {
fclose(fp);
unlink(filename);
if (client) sprintf(filename,"%s.%s",fileroot,"server.signal");
else if (server) sprintf(filename,"%s.%s",fileroot,"client.signal");
unlink(filename);
}
}

View File

@ -0,0 +1,40 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#ifndef MSG_FILE_H
#define MSG_FILE_H

#include <stdio.h>
#include "msg.h"

namespace CSLIB_NS {

// MsgFile: Msg transport that exchanges messages via files on a shared
// filesystem, using empty ".signal" files as completion flags

class MsgFile : public Msg {
 public:
  MsgFile(int, const void *, MPI_Comm);
  MsgFile(int, const void *);
  ~MsgFile();
  void send(int, int *, int, char *);
  void recv(int &, int *&, int &, char *&);

 private:
  char *fileroot;   // copy of the file root name passed to the constructor
  FILE *fp;
  void init(char *);
};

}

#endif

View File

@ -0,0 +1,82 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#include <mpi.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include "msg_mpi_one.h"
using namespace CSLIB_NS;
/* ---------------------------------------------------------------------- */
// ptr is a pointer to the MPI communicator spanning both client and server

MsgMPIOne::MsgMPIOne(int csflag, const void *ptr, MPI_Comm cworld) :
  Msg(csflag, ptr, cworld)
{
  // NOTE: ideally would skip this call if mpi/two
  init(ptr);
}

/* ---------------------------------------------------------------------- */

// record the spanning communicator and the rank of the partner's root
// within it

void MsgMPIOne::init(const void *ptr)
{
  MPI_Comm *pbothcomm = (MPI_Comm *) ptr;
  bothcomm = *pbothcomm;

  if (client) {
    MPI_Comm_size(world,&nprocs);
    // assumes the server's root follows the nprocs client ranks in
    // bothcomm -- TODO confirm against how bothcomm is constructed
    otherroot = nprocs;
  } else if (server) {
    otherroot = 0;   // client root is rank 0 of bothcomm
  }
}

/* ---------------------------------------------------------------------- */

// rank 0 sends lengths, header, data as three point-to-point messages;
// the three sends use the same tag, so MPI ordering keeps them matched

void MsgMPIOne::send(int nheader, int *header, int nbuf, char *buf)
{
  lengths[0] = nheader;
  lengths[1] = nbuf;

  if (me == 0) {
    MPI_Send(lengths,2,MPI_INT,otherroot,0,bothcomm);
    MPI_Send(header,nheader,MPI_INT,otherroot,0,bothcomm);
    MPI_Send(buf,nbuf,MPI_CHAR,otherroot,0,bothcomm);
  }
}

/* ---------------------------------------------------------------------- */

// rank 0 receives the three messages in the same order send() posts them,
// broadcasting each within world; buffers are grown via allocate()

void MsgMPIOne::recv(int &maxheader, int *&header, int &maxbuf, char *&buf)
{
  MPI_Status status;

  if (me == 0) MPI_Recv(lengths,2,MPI_INT,otherroot,0,bothcomm,&status);
  if (nprocs > 1) MPI_Bcast(lengths,2,MPI_INT,0,world);

  int nheader = lengths[0];
  int nbuf = lengths[1];
  allocate(nheader,maxheader,header,nbuf,maxbuf,buf);

  if (me == 0) MPI_Recv(header,nheader,MPI_INT,otherroot,0,bothcomm,&status);
  if (nprocs > 1) MPI_Bcast(header,nheader,MPI_INT,0,world);

  if (me == 0) MPI_Recv(buf,nbuf,MPI_CHAR,otherroot,0,bothcomm,&status);
  if (nprocs > 1) MPI_Bcast(buf,nbuf,MPI_CHAR,0,world);
}

View File

@ -0,0 +1,38 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#ifndef MSG_MPI_ONE_H
#define MSG_MPI_ONE_H

#include "msg.h"

namespace CSLIB_NS {

// MsgMPIOne: Msg transport where client and server share one MPI world;
// also serves as the base class for the two-world variant (MsgMPITwo)

class MsgMPIOne : public Msg {
 public:
  MsgMPIOne(int, const void *, MPI_Comm);
  virtual ~MsgMPIOne() {}
  void send(int, int *, int, char *);
  void recv(int &, int *&, int &, char *&);

 protected:
  MPI_Comm bothcomm;   // communicator spanning both client and server
  int otherroot;       // rank of the partner's root within bothcomm
  void init(const void *);
};

}

#endif

View File

@ -0,0 +1,81 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#include <mpi.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include "msg_mpi_two.h"
using namespace CSLIB_NS;
/* ---------------------------------------------------------------------- */
// here ptr is a port-exchange filename, not a communicator
// NOTE(review): the MsgMPIOne base constructor has already run its own
// init(ptr) on this pointer (see the NOTE there) -- confirm that is benign

MsgMPITwo::MsgMPITwo(int csflag, const void *ptr, MPI_Comm cworld) :
  MsgMPIOne(csflag, ptr, cworld)
{
  char *filename = (char *) ptr;
  init(filename);
}

/* ---------------------------------------------------------------------- */

MsgMPITwo::~MsgMPITwo()
{
  // free the inter comm that spans both client and server

  MPI_Comm_free(&bothcomm);
  MPI_Close_port(port);
}

/* ---------------------------------------------------------------------- */

// establish the client/server inter-communicator via MPI dynamic process
// management: the server opens a port and publishes its name through a
// file; the client polls for that file, reads the port name, and connects

void MsgMPITwo::init(char *filename)
{
  if (client) {
    if (me == 0) {
      FILE *fp = NULL;
      while (!fp) {
        fp = fopen(filename,"r");
        if (!fp) sleep(1);   // wait for the server to write the port file
      }
      fgets(port,MPI_MAX_PORT_NAME,fp);
      //printf("Client port: %s\n",port);
      fclose(fp);
    }

    MPI_Bcast(port,MPI_MAX_PORT_NAME,MPI_CHAR,0,world);
    MPI_Comm_connect(port,MPI_INFO_NULL,0,world,&bothcomm);
    //if (me == 0) printf("CLIENT comm connect\n");
    if (me == 0) unlink(filename);   // port file is no longer needed

  } else if (server) {
    MPI_Open_port(MPI_INFO_NULL,port);

    if (me == 0) {
      //printf("Server name: %s\n",port);
      FILE *fp = fopen(filename,"w");
      fprintf(fp,"%s",port);
      fclose(fp);
    }

    MPI_Comm_accept(port,MPI_INFO_NULL,0,world,&bothcomm);
    //if (me == 0) printf("SERVER comm accept\n");
  }

  otherroot = 0;   // partner's root is rank 0 of the inter-communicator
}

View File

@ -0,0 +1,35 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#ifndef MSG_MPI_TWO_H
#define MSG_MPI_TWO_H

#include "msg_mpi_one.h"

namespace CSLIB_NS {

// MsgMPITwo: Msg transport where client and server run as separate MPI
// jobs, joined at runtime via MPI_Open_port/MPI_Comm_connect; the port
// name is exchanged through a file

class MsgMPITwo : public MsgMPIOne {
 public:
  MsgMPITwo(int, const void *, MPI_Comm);
  ~MsgMPITwo();

 private:
  char port[MPI_MAX_PORT_NAME];   // MPI port name opened by the server
  void init(char *);
};

}

#endif

View File

@ -0,0 +1,140 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#include <mpi.h>
#include <zmq.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include "msg_zmq.h"
using namespace CSLIB_NS;
/* ---------------------------------------------------------------------- */
// parallel constructor: ptr is the "host:port" string for the ZMQ socket

MsgZMQ::MsgZMQ(int csflag, const void *ptr, MPI_Comm cworld) :
  Msg(csflag, ptr, cworld)
{
  char *port = (char *) ptr;
  init(port);
}

// serial constructor

MsgZMQ::MsgZMQ(int csflag, const void *ptr) : Msg(csflag, ptr)
{
  char *port = (char *) ptr;
  init(port);
}

/* ---------------------------------------------------------------------- */

MsgZMQ::~MsgZMQ()
{
  // only rank 0 created the socket/context in init()
  if (me == 0) {
    zmq_close(socket);
    zmq_ctx_destroy(context);
  }
}
/* ---------------------------------------------------------------------- */
// create the ZMQ context and socket on rank 0 only (other ranks receive
// data via MPI_Bcast in recv()): client = REQ socket that connects,
// server = REP socket that binds, both on "tcp://<port>".
// BUGFIX: zmq_connect() was unchecked while zmq_bind() was checked;
// a failed connect now aborts, consistent with the server branch.

void MsgZMQ::init(char *port)
{
#ifdef ZMQ_NO
  error_all("constructor(): Library not built with ZMQ support");
#endif

  if (me == 0) {
    int n = strlen(port) + 8;
    char *socket_name = new char[n];
    strcpy(socket_name,"tcp://");
    strcat(socket_name,port);

    if (client) {
      context = zmq_ctx_new();
      socket = zmq_socket(context,ZMQ_REQ);
      int rc = zmq_connect(socket,socket_name);
      if (rc) error_one("constructor(): Client could not connect to socket");
    } else if (server) {
      context = zmq_ctx_new();
      socket = zmq_socket(context,ZMQ_REP);
      int rc = zmq_bind(socket,socket_name);
      if (rc) error_one("constructor(): Server could not make socket connection");
    }

    delete [] socket_name;
  }
}
/* ----------------------------------------------------------------------
client/server sockets (REQ/REP) must follow this protocol:
client sends request (REQ) which server receives
server sends response (REP) which client receives
every exchange is of this form, server cannot initiate a send
thus each ZMQ send below has a following ZMQ recv, except last one
if client calls send(), it will next call recv()
if server calls send(), it will next call recv() from its wait loop
in either case, recv() issues a ZMQ recv to match last ZMQ send here
------------------------------------------------------------------------- */
void MsgZMQ::send(int nheader, int *header, int nbuf, char *buf)
{
  lengths[0] = nheader;
  lengths[1] = nbuf;

  // each zmq_send is followed by an empty zmq_recv ack, except the last:
  // REQ/REP sockets require strict alternation (see comment block above)

  if (me == 0) {
    zmq_send(socket,lengths,2*sizeof(int),0);
    zmq_recv(socket,NULL,0,0);
  }

  if (me == 0) {
    zmq_send(socket,header,nheader*sizeof(int),0);
    zmq_recv(socket,NULL,0,0);
  }

  if (me == 0) zmq_send(socket,buf,nbuf,0);   // matched by the peer's recv()
}
/* ----------------------------------------------------------------------
client/server sockets (REQ/REP) must follow this protocol:
client sends request (REQ) which server receives
server sends response (REP) which client receives
every exchange is of this form, server cannot initiate a send
thus each ZMQ recv below has a following ZMQ send, except last one
if client calls recv(), it will next call send() to ping server again,
if server calls recv(), it will next call send() to respond to client
in either case, send() issues a ZMQ send to match last ZMQ recv here
------------------------------------------------------------------------- */
void MsgZMQ::recv(int &maxheader, int *&header, int &maxbuf, char *&buf)
{
  // each zmq_recv is followed by an empty zmq_send ack, except the last:
  // REQ/REP sockets require strict alternation (see comment block above);
  // rank 0 does the socket I/O, results are broadcast within world

  if (me == 0) {
    zmq_recv(socket,lengths,2*sizeof(int),0);
    zmq_send(socket,NULL,0,0);
  }
  if (nprocs > 1) MPI_Bcast(lengths,2,MPI_INT,0,world);

  int nheader = lengths[0];
  int nbuf = lengths[1];
  allocate(nheader,maxheader,header,nbuf,maxbuf,buf);

  if (me == 0) {
    zmq_recv(socket,header,nheader*sizeof(int),0);
    zmq_send(socket,NULL,0,0);
  }
  if (nprocs > 1) MPI_Bcast(header,nheader,MPI_INT,0,world);

  if (me == 0) zmq_recv(socket,buf,nbuf,0);   // matches the peer's final send
  if (nprocs > 1) MPI_Bcast(buf,nbuf,MPI_CHAR,0,world);
}

View File

@ -0,0 +1,38 @@
/* ----------------------------------------------------------------------
CSlib - Client/server library for code coupling
http://cslib.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright 2018 National Technology & Engineering Solutions of
Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
NTESS, the U.S. Government retains certain rights in this software.
This software is distributed under the modified Berkeley Software
Distribution (BSD) License.
See the README file in the top-level CSlib directory.
------------------------------------------------------------------------- */
#ifndef MSG_ZMQ_H
#define MSG_ZMQ_H

#include "msg.h"

namespace CSLIB_NS {

// MsgZMQ: Msg transport over a ZeroMQ REQ/REP socket pair;
// only rank 0 owns the socket, other ranks get data via MPI_Bcast

class MsgZMQ : public Msg {
 public:
  MsgZMQ(int, const void *, MPI_Comm);
  MsgZMQ(int, const void *);
  ~MsgZMQ();
  void send(int, int *, int, char *);
  void recv(int &, int *&, int &, char *&);

 private:
  void *context,*socket;   // ZMQ context and socket handles (rank 0 only)
  void init(char *);
};

}

#endif

165
lib/scafacos/Install.py Normal file
View File

@ -0,0 +1,165 @@
#!/usr/bin/env python
# Install.py tool to download, unpack, build, and link to the Scafacos library
# used to automate the steps described in the README file in this dir
from __future__ import print_function
import sys,os,re,subprocess
# help message
help = """
Syntax from src dir: make lib-scafacos args="-b"
or: make lib-scafacos args="-p /usr/local/scafacos"
Syntax from lib dir: python Install.py -b
or: python Install.py -p /usr/local/scafacos
specify zero or more options, order does not matter
-b = download and build the Scafacos library
-p = specify folder of existing Scafacos installation
always creates includelink, liblink to Scafacos dirs
Example:
make lib-scafacos args="-b" # download/build in lib/scafacos/scafacos
make lib-scafacos args="-p $HOME/scafacos" # use existing Scafacos installation in $HOME
"""
# settings
version = "scafacos-1.0.1"
url = "https://github.com/scafacos/scafacos/releases/download/v1.0.1/scafacos-1.0.1.tar.gz"
#url = "https://gigamove.rz.rwth-aachen.de/d/id/CTzyApN76MXMJ6/dd/100" % version
# print error message or help
def error(str=None):
  # With no argument print the generic help text, otherwise print the
  # specific error message; either way the script terminates here.
  if str:
    print("ERROR",str)
  else:
    print(help)
  sys.exit()
# expand to full path name
# process leading '~' or relative path
def fullpath(path):
  """Return *path* as an absolute path, with any leading '~' expanded."""
  expanded = os.path.expanduser(path)
  return os.path.abspath(expanded)
def which(program):
  """Locate *program* like the shell 'which' command.

  If *program* contains a directory component, test it directly;
  otherwise search each entry of $PATH. Returns the full path of an
  executable match, or None if nothing is found.
  """
  def is_exe(candidate):
    return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

  head, _tail = os.path.split(program)
  if head:
    if is_exe(program):
      return program
  else:
    for directory in os.environ["PATH"].split(os.pathsep):
      candidate = os.path.join(directory.strip('"'), program)
      if is_exe(candidate):
        return candidate
  return None
def geturl(url,fname):
  """Download *url* to the local file *fname*, trying curl then wget.

  Calls error() (which exits the script) if neither tool succeeds.
  """
  success = False

  # first choice: curl, with -L to follow redirects
  if which('curl') != None:
    cmd = 'curl -L -o "%s" %s' % (fname,url)
    try:
      subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
      success = True
    except subprocess.CalledProcessError as e:
      print("Calling curl failed with: %s" % e.output.decode('UTF-8'))

  # fallback: wget, only if curl was missing or failed
  if not success and which('wget') != None:
    cmd = 'wget -O "%s" %s' % (fname,url)
    print("Wget command: %s" % cmd)
    try:
      subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
      success = True
    except subprocess.CalledProcessError as e:
      print("Calling wget failed with: %s" % e.output.decode('UTF-8'))

  if not success:
    error("Failed to download source code with 'curl' or 'wget'")
  return
# parse args
args = sys.argv[1:]
nargs = len(args)

homepath = "."

# default action is to download and build (matches previous behavior when
# the script is run with no arguments)
buildflag = True
explicitbuild = False   # True only if the user passed -b themselves
pathflag = False
linkflag = True

iarg = 0
while iarg < nargs:
  if args[iarg] == "-v":
    if iarg+2 > nargs: error()
    version = args[iarg+1]
    iarg += 2
  elif args[iarg] == "-p":
    if iarg+2 > nargs: error()
    scafacospath = fullpath(args[iarg+1])
    pathflag = True
    iarg += 2
  elif args[iarg] == "-b":
    buildflag = True
    explicitbuild = True
    iarg += 1
  else: error()

# BUGFIX: "-p" alone used to trip the -b/-p conflict check below, because
# buildflag defaults to True; treat it as a conflict only for an explicit -b

if pathflag and not explicitbuild: buildflag = False

homepath = fullpath(homepath)
homedir = "%s/%s" % (homepath,version)

if (pathflag):
  if not os.path.isdir(scafacospath): error("Scafacos path does not exist")
  homedir = scafacospath

if (buildflag and pathflag):
  error("Cannot use -b and -p flag at the same time")

# download and unpack Scafacos tarball

if buildflag:
  print("Downloading Scafacos ...")
  geturl(url,"%s/%s.tar.gz" % (homepath,version))

  print("Unpacking Scafacos tarball ...")
  if os.path.exists("%s/%s" % (homepath,version)):
    cmd = 'rm -rf "%s/%s"' % (homepath,version)
    subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
  cmd = 'cd "%s"; tar -xzvf %s.tar.gz' % (homepath,version)
  subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
  os.remove("%s/%s.tar.gz" % (homepath,version))
  # if an alternate homedir was requested, move the unpacked tree there
  if os.path.basename(homedir) != version:
    if os.path.exists(homedir):
      cmd = 'rm -rf "%s"' % homedir
      subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
    os.rename("%s/%s" % (homepath,version),homedir)

# build Scafacos

if buildflag:
  print("Building Scafacos ...")
  cmd = 'cd "%s"; ./configure --prefix="`pwd`/build" --disable-doc --enable-fcs-solvers=fmm,p2nfft,direct,ewald,p3m --with-internal-fftw --with-internal-pfft --with-internal-pnfft CC=mpicc FC=mpif90 CXX=mpicxx F77= > log.txt; make -j; make install' % homedir
  txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
  print(txt.decode('UTF-8'))

# create 2 links in lib/scafacos to Scafacos include/lib dirs

if linkflag:
  print("Creating links to Scafacos include and lib files")
  if os.path.isfile("includelink") or os.path.islink("includelink"):
    os.remove("includelink")
  if os.path.isfile("liblink") or os.path.islink("liblink"):
    os.remove("liblink")
  cmd = 'ln -s "%s/build/include" includelink' % homedir
  subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
  cmd = 'ln -s "%s/build/lib" liblink' % homedir
  subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)

76
lib/scafacos/README Normal file
View File

@ -0,0 +1,76 @@
This directory contains links to the ScaFaCoS library which
is required to use the scafacos KSPACE solver via the
kspace_style scafacos command in a LAMMPS input script.
The ScaFaCoS library is available at http://scafacos.de or
on github at https://github.com/scafacos. The library was
developed by a consortium of different universities in
Germany (Bonn, Chemnitz, Stuttgart, Wuppertal) and
the Research Centre Juelich (Juelich Supercomputing Centre).
-----------------
Instructions:
1.) Download ScaFaCoS at http://scafacos.de or directly from github
https://github.com/scafacos where you can either clone the
repository or download the latest stable release.
NOTE: For the P2NFFT solver, you require an installation of the
GNU Scientific Library (GSL). Also to ensure the correct
linker-flags are used, ScaFaCoS employs the pkg-config
tool, which is also required.
If you cloned the repository, please refer to 2.), else continue
with 3.)
2.) If you cloned the git repository, you require autotools to setup
the library. For that the following packages are required:
m4
autotools
automake
libtool
In the build_aux folder of the scafacos folder, you can find the
get_autotools.sh script, that downloads and installs the tools
to ${HOME}/local. To change the target folder, please change the
value of 'myprefix' in that script.
To start the auto-configuration process, please run the './bootstrap'
command in the scafacos base-folder.
3.) If you downloaded the library as a tarball, please extract the file
to somewhere in your file system, or if you finished running
'./bootstrap', please run './configure' in the base folder.
Important flags for './configure' are:
--prefix=<install_dir>: sets the directory the compiled files will
be installed to [default: /usr/local]
--enable-fcs-solvers=<list>: sets the list of solvers that are going to
be built. By default all solvers will be
built. Currently supported by the kspace in LAMMPS
are: direct, ewald, fmm, p2nfft
The other solvers might work, but support
is purely experimental at the moment. To
give a list of solvers, use a comma separated
list.
--disable-doc: disables the compilation of the documentation,
e.g. if no Latex is available on the system.
4.) To build the library after configuration, run 'make' from the base folder.
5.) To install the library in the designated installation folder, run 'make install'.
Installation is required, as ScaFaCoS does not support an in-source build!
6.) Create two soft links in this directory (lib/scafacos) to where the library
is installed. E.g. if you built ScaFaCoS in the default install directory:
% ln -s /usr/local/include includelink
% ln -s /usr/local/lib liblink
for any custom directory <custom_dir>:
% ln -s <custom_dir>/include includelink
% ln -s <custom_dir>/lib liblink
7.) ScaFaCoS uses the pkg-config tool to supply the correct compiler and linker
    flags, so you need to setup your
PKG_CONFIG_PATH environment variable to include the lib/pkgconfig folder in the
installation directory.
Depending on the shell you use, this can be done either by:
% export PKG_CONFIG_PATH=<custom_dir>/lib/pkgconfig:${PKG_CONFIG_PATH}
or
% setenv PKG_CONFIG_PATH <custom_dir>/lib/pkgconfig:${PKG_CONFIG_PATH}
-----------------