Merge branch 'develop' of https://github.com/lammps/lammps into kk_update_3.7

Stan Moore committed 2022-10-10 13:44:02 -07:00
133 changed files with 6313 additions and 1175 deletions

View File

@ -1,5 +1,5 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for OpenCL
# /* ----------------------------------------------------------------------
# Linux Makefile for Intel oneAPI - Mixed precision
# ------------------------------------------------------------------------- */
# which file will be copied to Makefile.lammps
@ -11,11 +11,14 @@ EXTRAMAKE = Makefile.lammps.opencl
LMP_INC = -DLAMMPS_SMALLBIG
OCL_INC =
OCL_CPP = mpiicpc -std=c++11 -xHost -O2 -qopenmp -qopenmp-simd -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
OCL_LINK = -lOpenCL
OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -fp-model fast=2 -no-prec-div \
-qoverride-limits
OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \
$(LMP_INC) $(OCL_INC) $(CPP_OPT)
OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -fp-model fast=2 -no-prec-div
OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
BIN_DIR = ./
OBJ_DIR = ./

View File

@ -264,6 +264,20 @@ GERYON_KERNEL_DUMP Dump all compiled OpenCL programs with compiler
flags and build logs
GPU_CAST Casting performed on GPU, untested recently
THREE_CONCURRENT Concurrent 3-body calcs in separate queues, untested
LAL_SERIALIZE_INIT Force serialization of initialization and compilation
for multiple MPI tasks sharing the same accelerator.
Some accelerator API implementations have had issues
with temporary file conflicts in the past.
GERYON_FORCE_SHARED_MAIN_MEM_ON Should only be used for builds where the
accelerator is guaranteed to share physical
main memory with the host (e.g. integrated
GPU or CPU device). Default behavior is to
auto-detect. Impacts OpenCL only.
GERYON_FORCE_SHARED_MAIN_MEM_OFF Should only be used for builds where the
accelerator is guaranteed to have discrete
physical main memory separate from the host
(e.g. a discrete GPU card). Default behavior is to
auto-detect. Impacts OpenCL only.
------------------------------------------------------------------------------
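To make the effect of the two GERYON_FORCE_SHARED_MAIN_MEM_* switches concrete, here is a minimal compile-time sketch. It is not the library's actual code: detect_shared_memory() is a hypothetical stand-in for Geryon's auto-detection, and only the macro names come from the table above.

    // Hedged sketch: how a build define can override run-time detection.
    static bool detect_shared_memory() { return false; }  // hypothetical stub

    static bool shared_main_memory() {
    #if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON)
      return true;                    // build guarantees host-shared memory
    #elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF)
      return false;                   // build guarantees discrete device memory
    #else
      return detect_shared_memory();  // default: auto-detect at run time
    #endif
    }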

View File

@ -126,10 +126,13 @@ class UCL_Device {
/// Return the number of devices that support OpenCL
inline int num_devices() { return _num_devices; }
/// Specify whether profiling (device timers) will be used for the device (yes=true)
/// Specify whether profiling (device timers) will be used (yes=true)
/** No-op for CUDA and HIP **/
inline void configure_profiling(const bool profiling_on)
{ _cq_profiling = profiling_on; }
inline void configure_profiling(const bool profiling_on) {
#ifndef GERYON_NO_OCL_MARKERS
_cq_profiling = profiling_on;
#endif
}
/// Set the OpenCL device to the specified device number
/** A context and default command queue will be created for the device *
@ -176,8 +179,8 @@ class UCL_Device {
#ifdef CL_VERSION_2_0
if (_cq_profiling) {
cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
0};
cl_queue_properties props[] = {CL_QUEUE_PROPERTIES,
CL_QUEUE_PROFILING_ENABLE, 0};
_cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props,
&errorv);
} else {
@ -187,8 +190,8 @@ class UCL_Device {
}
#else
if (_cq_profiling)
_cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE,
&errorv);
_cq.back()=clCreateCommandQueue(_context, _cl_device,
CL_QUEUE_PROFILING_ENABLE, &errorv);
else
_cq.back()=clCreateCommandQueue(_context, _cl_device, 0, &errorv);
#endif
@ -403,7 +406,11 @@ class UCL_Device {
// Grabs the properties for all devices
UCL_Device::UCL_Device() {
_device=-1;
#ifndef GERYON_NO_OCL_MARKERS
_cq_profiling=true;
#else
_cq_profiling=false;
#endif
// --- Get Number of Platforms
cl_uint nplatforms;
@ -482,6 +489,7 @@ int UCL_Device::set_platform(int pid) {
_num_devices = 0;
for (int i=0; i<num_unpart; i++) {
cl_uint num_subdevices = 1;
cl_device_id *subdevice_list = device_list + i;
#ifdef CL_VERSION_1_2
cl_device_affinity_domain adomain;
@ -494,25 +502,29 @@ int UCL_Device::set_platform(int pid) {
props[0]=CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
props[1]=CL_DEVICE_AFFINITY_DOMAIN_NUMA;
props[2]=0;
cl_int err = CL_SUCCESS;
if (adomain & CL_DEVICE_AFFINITY_DOMAIN_NUMA)
CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, 0, NULL,
&num_subdevices));
if (num_subdevices > 1) {
cl_device_id *subdevice_list = new cl_device_id[num_subdevices];
CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices,
subdevice_list, &num_subdevices));
for (cl_uint j=0; j<num_subdevices; j++) {
_cl_devices.push_back(device_list[i]);
add_properties(device_list[i]);
_num_devices++;
err = clCreateSubDevices(device_list[i], props, 0, NULL,
&num_subdevices);
if (err == CL_SUCCESS && num_subdevices > 1) {
subdevice_list = new cl_device_id[num_subdevices];
err = clCreateSubDevices(device_list[i], props, num_subdevices,
subdevice_list, &num_subdevices);
if (err != CL_SUCCESS) {
delete[] subdevice_list;
num_subdevices = 1;
subdevice_list = device_list + i;
}
delete[] subdevice_list;
} else {
_cl_devices.push_back(device_list[i]);
add_properties(device_list[i]);
_num_devices++;
}
#endif
for (cl_uint j=0; j<num_subdevices; j++) {
_num_devices++;
_cl_devices.push_back(subdevice_list[j]);
add_properties(subdevice_list[j]);
}
if (num_subdevices > 1) delete[] subdevice_list;
} // for i
#endif
@ -686,10 +698,10 @@ void UCL_Device::add_properties(cl_device_id device_list) {
double arch = static_cast<double>(minor)/10+major;
if (arch >= 3.0)
op.has_shuffle_support=true;
op.shared_main_memory=_shared_mem_device(device_list);
}
delete[] buffer2;
#endif
op.shared_main_memory=_shared_mem_device(device_list);
_properties.push_back(op);
}
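For context, a minimal hedged usage sketch of the UCL_Device API touched above. The include path, the return convention of set_platform(), and the existence of a first platform are assumptions, not shown in this diff.

    #include "geryon/ocl_device.h"   // header path assumed
    #include <cstdio>
    using namespace ucl_opencl;

    int main() {
      UCL_Device dev;                    // constructor queries available platforms
      if (dev.set_platform(0) != UCL_SUCCESS)  // assumed return convention
        return 1;
      // set_platform() builds the device list, splitting by NUMA domain
      // where sub-device fission succeeds (see the change above)
      std::printf("OpenCL devices: %d\n", dev.num_devices());
      dev.configure_profiling(true);     // request device timers; ignored when
                                         // GERYON_NO_OCL_MARKERS is defined
      return 0;
    }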

View File

@ -27,11 +27,15 @@
#include "ocl_macros.h"
#include "ocl_device.h"
#ifndef GERYON_NO_OCL_MARKERS
#ifdef CL_VERSION_1_2
#define UCL_OCL_MARKER(cq,event) clEnqueueMarkerWithWaitList(cq,0,nullptr,event)
#else
#define UCL_OCL_MARKER clEnqueueMarker
#endif
#else
#define UCL_OCL_MARKER(cq,event)
#endif
namespace ucl_opencl {
@ -51,8 +55,10 @@ class UCL_Timer {
inline void clear() {
if (_initialized) {
if (has_measured_time) {
#ifndef GERYON_NO_OCL_MARKERS
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
#endif
has_measured_time = false;
}
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
@ -76,8 +82,10 @@ class UCL_Timer {
/// Start timing on default command queue
inline void start() {
if (has_measured_time) {
#ifndef GERYON_NO_OCL_MARKERS
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
#endif
has_measured_time = false;
}
UCL_OCL_MARKER(_cq,&start_event);
@ -91,17 +99,26 @@ class UCL_Timer {
/// Block until the start event has been reached on device
inline void sync_start() {
#ifndef GERYON_NO_OCL_MARKERS
CL_SAFE_CALL(clWaitForEvents(1,&start_event));
if (has_measured_time) {
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
has_measured_time = false;
}
CL_SAFE_CALL(clWaitForEvents(1,&start_event));
#else
CL_SAFE_CALL(clFinish(_cq));
has_measured_time = false;
#endif
}
/// Block until the stop event has been reached on device
inline void sync_stop() {
#ifndef GERYON_NO_OCL_MARKERS
CL_SAFE_CALL(clWaitForEvents(1,&stop_event));
#else
CL_SAFE_CALL(clFinish(_cq));
#endif
has_measured_time = true;
}
@ -126,6 +143,7 @@ class UCL_Timer {
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
if(!has_measured_time) return 0.0;
#ifndef GERYON_NO_OCL_MARKERS
cl_ulong tstart,tend;
CL_SAFE_CALL(clWaitForEvents(1,&stop_event));
CL_SAFE_CALL(clGetEventProfilingInfo(stop_event,
@ -138,6 +156,11 @@ class UCL_Timer {
clReleaseEvent(stop_event);
has_measured_time = false;
return (tend-tstart)*1e-6;
#else
CL_SAFE_CALL(clFinish(_cq));
has_measured_time = false;
return 0.0;
#endif
}
/// Return the time (s) of last start to stop - Forces synchronization
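A hedged sketch of the timing pattern these markers support. The header path is assumed, and a stop() method that records stop_event is assumed to exist; only start(), sync_start(), sync_stop(), and time() appear in this diff.

    #include "geryon/ocl_timer.h"   // header path assumed

    // Times a region of device work submitted on the timer's command queue.
    static double time_region_ms(ucl_opencl::UCL_Timer &t) {
      t.start();        // enqueue start marker (skipped with GERYON_NO_OCL_MARKERS)
      // ... enqueue kernels / memory transfers on t's queue here ...
      t.stop();         // assumed API: enqueue the matching stop marker
      t.sync_stop();    // wait for the stop marker, or clFinish() without markers
      return t.time();  // elapsed ms between markers, or 0.0 without markers
    }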

View File

@ -76,7 +76,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
BLMF.device->gpu_barrier();
BLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -84,7 +84,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
BCLCSMF.device->gpu_barrier();
BCLCSMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -84,7 +84,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
BORNCLMF.device->gpu_barrier();
BORNCLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -86,7 +86,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
BornCWCST.device->gpu_barrier();
BornCWCST.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -86,7 +86,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
BORNCWMF.device->gpu_barrier();
BORNCWMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -80,7 +80,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);
BORNMF.device->gpu_barrier();
BORNMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -114,7 +114,7 @@ void born_gpu_reinit(const int ntypes, double **host_rhoinv,
BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, offset);
BORNMF.device->gpu_barrier();
BORNMF.device->serialize_init();
}
}

View File

@ -83,7 +83,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e);
BUCKCMF.device->gpu_barrier();
BUCKCMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -82,7 +82,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
BUCKCLMF.device->gpu_barrier();
BUCKCLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -77,7 +77,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);
BUCKMF.device->gpu_barrier();
BUCKMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -110,7 +110,7 @@ void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv,
BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset);
BUCKMF.device->gpu_barrier();
BUCKMF.device->serialize_init();
}
}

View File

@ -88,7 +88,7 @@ int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
qqrd2e, cut_lj_innersq, cut_coul_innersq, denom_lj,
denom_coul, epsilon, sigma, mix_arithmetic);
CRMMF.device->gpu_barrier();
CRMMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -86,7 +86,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
sigma, mix_arithmetic);
CRMLMF.device->gpu_barrier();
CRMLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -83,7 +83,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
COLLMF.device->gpu_barrier();
COLLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -74,7 +74,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq,
init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa);
CDEMF.device->gpu_barrier();
CDEMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -103,7 +103,7 @@ void cdebye_gpu_reinit(const int ntypes, double **host_scale) {
if (gpu_rank==i && world_me!=0)
CDEMF.reinit(ntypes, host_scale);
CDEMF.device->gpu_barrier();
CDEMF.device->serialize_init();
}
}

View File

@ -77,7 +77,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
gpu_split, screen, host_cut_coulsq, host_special_coul,
qqrd2e, e_shift, f_shift, alpha);
CDMF.device->gpu_barrier();
CDMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -74,7 +74,7 @@ int coul_gpu_init(const int ntypes, double **host_scale,
init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, qqrd2e);
COULMF.device->gpu_barrier();
COULMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -103,7 +103,7 @@ void coul_gpu_reinit(const int ntypes, double **host_scale) {
if (gpu_rank==i && world_me!=0)
COULMF.reinit(ntypes, host_scale);
COULMF.device->gpu_barrier();
COULMF.device->serialize_init();
}
}

View File

@ -76,7 +76,7 @@ int clcs_gpu_init(const int ntypes, double **host_scale,
cell_size, gpu_split, screen, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
CLCSMF.device->gpu_barrier();
CLCSMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -105,7 +105,7 @@ void clcs_gpu_reinit(const int ntypes, double **host_scale) {
if (gpu_rank==i && world_me!=0)
CLCSMF.reinit(ntypes, host_scale);
CLCSMF.device->gpu_barrier();
CLCSMF.device->serialize_init();
}
}

View File

@ -76,7 +76,7 @@ int cl_gpu_init(const int ntypes, double **host_scale,
cell_size, gpu_split, screen, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
CLMF.device->gpu_barrier();
CLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -105,7 +105,7 @@ void cl_gpu_reinit(const int ntypes, double **host_scale) {
if (gpu_rank==i && world_me!=0)
CLMF.reinit(ntypes, host_scale);
CLMF.device->gpu_barrier();
CLMF.device->serialize_init();
}
}

View File

@ -328,7 +328,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
for (int i=0; i<_procs_per_gpu; i++) {
if (_gpu_rank==i)
flag=compile_kernels();
gpu_barrier();
serialize_init();
}
// check if double precision support is available
@ -609,6 +609,10 @@ void DeviceT::init_message(FILE *screen, const char *name,
int last=last_gpu+1;
if (last>gpu->num_devices())
last=gpu->num_devices();
if (gpu->num_platforms()>1) {
std::string pname=gpu->platform_name();
fprintf(screen,"Platform: %s\n",pname.c_str());
}
for (int i=first_gpu; i<last; i++) {
std::string sname;
if (i==first_gpu)

View File

@ -217,6 +217,12 @@ class Device {
inline int gpu_rank() const { return _gpu_rank; }
/// MPI Barrier for gpu
inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
/// Serialize GPU initialization and JIT for unsafe platforms
inline void serialize_init() {
#ifdef LAL_SERIALIZE_INIT
gpu_barrier();
#endif
}
/// Return the 'mode' for acceleration: GPU_FORCE, GPU_NEIGH or GPU_HYB_NEIGH
inline int gpu_mode() const { return _gpu_mode; }
/// Index of first device used by a node
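This is the change the rest of the commit repeats across the lib/gpu wrappers: gpu_barrier() calls that only enforced round-robin kernel compilation become serialize_init(), which compiles to a barrier only when LAL_SERIALIZE_INIT is defined. A self-contained sketch of the idea, with hypothetical names (DeviceLike, staged_init) standing in for the real Device class:

    #include <mpi.h>

    struct DeviceLike {                       // hypothetical stand-in for Device
      MPI_Comm comm_gpu;
      void gpu_barrier() { MPI_Barrier(comm_gpu); }
      void serialize_init() {
    #ifdef LAL_SERIALIZE_INIT
        gpu_barrier();                        // serialize JIT/init across ranks
    #endif
      }
    };

    static void staged_init(DeviceLike &dev, int procs_per_gpu, int gpu_rank) {
      for (int i = 0; i < procs_per_gpu; i++) {
        if (gpu_rank == i) {
          // compile kernels / initialize this rank's share of the device here
        }
        dev.serialize_init();   // ordering enforced only if LAL_SERIALIZE_INIT is set
      }
    }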

View File

@ -80,7 +80,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLMF.device->gpu_barrier();
DPLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -80,7 +80,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLSFMF.device->gpu_barrier();
DPLSFMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -81,7 +81,7 @@ int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
DPLJMF.device->gpu_barrier();
DPLJMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -76,7 +76,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0,
host_cut, special_lj, false, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);
DPDMF.device->gpu_barrier();
DPDMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -76,7 +76,7 @@ int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0,
host_cut, special_lj, true, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
DPDTMF.device->gpu_barrier();
DPDTMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -90,7 +90,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq,
nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
EAMALMF.device->gpu_barrier();
EAMALMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -90,7 +90,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
EAMMF.device->gpu_barrier();
EAMMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -90,7 +90,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq,
nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
EAMFSMF.device->gpu_barrier();
EAMFSMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -76,7 +76,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
offset, special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
GLMF.device->gpu_barrier();
GLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -106,7 +106,7 @@ void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a,
if (gpu_rank==i && world_me!=0)
GLMF.reinit(ntypes, cutsq, host_a, host_b, offset);
GLMF.device->gpu_barrier();
GLMF.device->serialize_init();
}
}

View File

@ -83,7 +83,7 @@ int gb_gpu_init(const int ntypes, const double gamma,
host_lj3, host_lj4, offset, special_lj, inum, nall,
max_nbors, maxspecial, cell_size, gpu_split, screen);
GBMF.device->gpu_barrier();
GBMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -76,7 +76,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
offset, special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
LJ96MF.device->gpu_barrier();
LJ96MF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -81,7 +81,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
C2CLMF.device->gpu_barrier();
C2CLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -81,7 +81,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, kappa);
LJCDMF.device->gpu_barrier();
LJCDMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -80,7 +80,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->gpu_barrier();
LJCMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -81,7 +81,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->gpu_barrier();
LJCLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -112,7 +112,7 @@ void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
if (gpu_rank==i && world_me!=0)
LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, host_cut_ljsq);
LJCLMF.device->gpu_barrier();
LJCLMF.device->serialize_init();
}
}

View File

@ -83,7 +83,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, order, qqrd2e);
LJCMLMF.device->gpu_barrier();
LJCMLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -80,7 +80,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq,
special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
LJCubicLMF.device->gpu_barrier();
LJCubicLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -84,7 +84,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
host_cut_coulsq, host_special_coul, qqrd2e, e_shift,
f_shift, alpha);
LJDMF.device->gpu_barrier();
LJDMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -81,7 +81,7 @@ int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJECLMF.device->gpu_barrier();
LJECLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -112,7 +112,7 @@ void ljecl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
if (gpu_rank==i && world_me!=0)
LJECLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, shift, host_cut_ljsq);
LJECLMF.device->gpu_barrier();
LJECLMF.device->serialize_init();
}
}

View File

@ -108,7 +108,7 @@ void lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
if (gpu_rank==i && world_me!=0)
LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, shift);
LJEMF.device->gpu_barrier();
LJEMF.device->serialize_init();
}
}

View File

@ -76,7 +76,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
offset, special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
LJLMF.device->gpu_barrier();
LJLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -105,7 +105,7 @@ void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0)
LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);
LJLMF.device->gpu_barrier();
LJLMF.device->serialize_init();
}
}

View File

@ -81,7 +81,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3,
host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq);
LJGRMMF.device->gpu_barrier();
LJGRMMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -80,7 +80,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3,
host_ljsw4, cut_inner, cut_inner_sq);
LJSMTMF.device->gpu_barrier();
LJSMTMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -110,7 +110,7 @@ void ljsmt_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0)
LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
LJSMTMF.device->gpu_barrier();
LJSMTMF.device->serialize_init();
}
}

View File

@ -77,7 +77,7 @@ int spica_gpu_init(const int ntypes, double **cutsq, int **cg_types,
host_lj4, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->gpu_barrier();
CMMMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -81,7 +81,7 @@ int spical_gpu_init(const int ntypes, double **cutsq, int **cg_type,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CMMLMF.device->gpu_barrier();
CMMLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -89,7 +89,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
host_special_coul, qqrd2e,
g_ewald, map_size, max_same);
LJTIP4PLMF.device->gpu_barrier();
LJTIP4PLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -80,7 +80,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
offset, special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
MLMF.device->gpu_barrier();
MLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -77,7 +77,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
offset, special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
MORMF.device->gpu_barrier();
MORMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -81,7 +81,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
split,success);
pppm.device->gpu_barrier();
pppm.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -80,7 +80,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
host_lj4, offset, special_lj, inum, nall,
max_nbors, maxspecial, cell_size, gpu_split, screen);
REMF.device->gpu_barrier();
REMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -76,7 +76,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
SLMF.device->gpu_barrier();
SLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -106,7 +106,7 @@ void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor,
if (gpu_rank==i && world_me!=0)
SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut);
SLMF.device->gpu_barrier();
SLMF.device->serialize_init();
}
}

View File

@ -84,7 +84,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall,
sigma_gamma, c1, c2, c3, c4, c5, c6, lambda_epsilon,
costheta, map, e2param);
SWMF.device->gpu_barrier();
SWMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -76,7 +76,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, tabstyle, ntables, tablength);
TBMF.device->gpu_barrier();
TBMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -91,7 +91,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int
ts_c1, ts_c2, ts_c3, ts_c4, ts_c, ts_d, ts_h,
ts_gamma, ts_beta, ts_powern, ts_cutsq);
TSMF.device->gpu_barrier();
TSMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -91,7 +91,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall,
ts_c3, ts_c4, ts_c5, ts_h, ts_beta, ts_powern,
ts_powern_del, ts_ca1, ts_cutsq);
TSMMF.device->gpu_barrier();
TSMMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -102,7 +102,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall,
ts_ZBLcut, ts_ZBLexpscale, global_e, global_a_0,
global_epsilon_0, ts_cutsq);
TSZMF.device->gpu_barrier();
TSZMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -78,7 +78,7 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
offset, special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);
UFMLMF.device->gpu_barrier();
UFMLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}
@ -106,7 +106,7 @@ void ufml_gpu_reinit(const int ntypes, double **cutsq, double **host_uf1,
for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0)
UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, offset);
UFMLMF.device->gpu_barrier();
UFMLMF.device->serialize_init();
}
}

View File

@ -89,7 +89,7 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i
lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw,
c0, costheta, bigb, big2b, bigc);
VashishtaMF.device->gpu_barrier();
VashishtaMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -76,7 +76,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
inum, nall, max_nbors, maxspecial, cell_size, gpu_split,
screen, kappa);
YKCOLLMF.device->gpu_barrier();
YKCOLLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -76,7 +76,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen);
YKMF.device->gpu_barrier();
YKMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -79,7 +79,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
cut_globalsq, cut_innersq, cut_inner,
inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen);
ZBLMF.device->gpu_barrier();
ZBLMF.device->serialize_init();
if (message)
fprintf(screen,"Done.\n");
}

View File

@ -1,4 +1,4 @@
# TODO#!/usr/bin/env python
#!/usr/bin/env python
"""
Install.py tool to download, compile, and set up the pace library
@ -6,7 +6,10 @@ used to automate the steps described in the README file in this dir
"""
from __future__ import print_function
import sys, subprocess
import shutil
import subprocess
import sys
from argparse import ArgumentParser
sys.path.append('..')
@ -15,23 +18,16 @@ from install_helpers import fullpath, geturl, checkmd5sum
# settings
thisdir = fullpath('.')
version = 'v.2021.10.25.fix2'
version ='v.2022.09.27.fix10Oct'
# known checksums for different PACE versions. used to validate the download.
checksums = { \
'v.2021.2.3.upd2' : '8fd1162724d349b930e474927197f20d',
'v.2021.4.9' : '4db54962fbd6adcf8c18d46e1798ceb5',
'v.2021.9.28' : 'f98363bb98adc7295ea63974738c2a1b',
'v.2021.10.25' : 'a2ac3315c41a1a4a5c912bcb1bc9c5cc',
'v.2021.10.25.fix': 'e0572de57039d4afedefb25707b6ceae',
'v.2021.10.25.fix2': '32394d799bc282bb57696c78c456e64f'
}
'v.2022.09.27.fix10Oct': '766cebcc0e5c4b8430c2f3cd202d9905'
}
parser = ArgumentParser(prog='Install.py',
description="LAMMPS library build wrapper script")
# help message
HELP = """
@ -55,55 +51,68 @@ parser.add_argument("-v", "--version", default=version, choices=checksums.keys()
help="set version of PACE library to download and build (default: %s)" % version)
parser.add_argument("-vv", "--verbose", action="store_true",
help="be more verbose about is happening while this script runs")
parser.add_argument("-l", "--local", default=None,
help="use local version of PACE library build")
args = parser.parse_args()
# print help message and exit, if neither build nor path options are given
if not args.build:
parser.print_help()
sys.exit(HELP)
parser.print_help()
sys.exit(HELP)
buildflag = args.build
verboseflag = args.verbose
version = args.version
local = args.local
archive_extension = "tar.gz"
url = "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/%s.%s" % (version, archive_extension)
unarchived_folder_name = "lammps-user-pace-%s"%(version)
unarchived_folder_name = "lammps-user-pace-%s" % (version)
# download PACE tarball, unpack, build PACE
if buildflag:
if not local:
# download entire tarball
print("Downloading pace tarball ...")
archive_filename = "%s.%s" % (version, archive_extension)
download_filename = "%s/%s" % (thisdir, archive_filename)
print("Downloading from ", url, " to ", download_filename, end=" ")
geturl(url, download_filename)
print(" done")
# download entire tarball
# verify downloaded archive integrity via md5 checksum, if known.
if version in checksums:
if not checkmd5sum(checksums[version], archive_filename):
sys.exit("Checksum for pace library does not match")
print("Downloading pace tarball ...")
archive_filename = "%s.%s" % (version, archive_extension)
download_filename = "%s/%s" % (thisdir, archive_filename)
print("Downloading from ",url," to ",download_filename, end=" ")
geturl(url, download_filename)
print(" done")
print("Unpacking pace tarball ...")
src_folder = thisdir + "/src"
cmd = 'cd "%s"; rm -rf "%s"; tar -xvf %s; mv %s %s' % (
thisdir, src_folder, archive_filename, unarchived_folder_name, src_folder)
subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
else:
# copy from local version of library PACE
print("Copy pace from ", local)
src_folder = thisdir + "/src"
shutil.copytree(local, src_folder,
# ignore=lambda (s1,s2): ('.git' in s1 or '.git' in s2),
dirs_exist_ok=True)
# verify downloaded archive integrity via md5 checksum, if known.
if version in checksums:
if not checkmd5sum(checksums[version], archive_filename):
sys.exit("Checksum for pace library does not match")
print("Unpacking pace tarball ...")
src_folder = thisdir+"/src"
cmd = 'cd "%s"; rm -rf "%s"; tar -xvf %s; mv %s %s' % (thisdir, src_folder, archive_filename, unarchived_folder_name, src_folder)
subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
# build
print("Building libpace ...")
cmd = 'make lib -j2'
txt = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
if verboseflag:
print(txt.decode("UTF-8"))
# build
print("Building libpace ...")
cmd = 'make lib -j2'
txt = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
if verboseflag:
print(txt.decode("UTF-8"))
# remove source files
# remove source files
print("Removing pace build files and archive ...")
cmd = 'make clean-build'
if not local:
cmd = ('rm %s;' % (download_filename))+cmd
subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
print("Removing pace build files and archive ...")
cmd = 'rm %s; make clean-build' % (download_filename)
subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
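With the new -l/--local option, the library can be built from an existing PACE source tree instead of a downloaded tarball; assuming the script's usual -b build flag, an invocation would look like "python Install.py -b -l /path/to/lammps-user-pace" (path hypothetical), while "python Install.py -b" keeps the download-and-verify path shown above.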

View File

@ -5,8 +5,14 @@ SHELL = /bin/sh
YAML_CPP_PATH = src/yaml-cpp
YAML_CPP_INC = $(YAML_CPP_PATH)/include
SRC_FILES = $(wildcard src/ML-PACE/*.cpp)
SRC = $(filter-out src/ML-PACE/pair_pace.cpp, $(SRC_FILES))
WIGNER_CPP_INC = src/wigner-cpp/include/wigner
CNPY_CPP_PATH = src/cnpy
CNPY_CPP_INC = $(CNPY_CPP_PATH)
CNPY_SRC_FILES = $(CNPY_CPP_PATH)/cnpy.cpp
SRC_FILES = $(wildcard src/ML-PACE/ace/*.cpp) $(wildcard src/ML-PACE/ace-evaluator/*.cpp)
SRC = $(filter-out src/ML-PACE/pair_pace.cpp, $(SRC_FILES)) $(CNPY_SRC_FILES)
# ------ DEFINITIONS ------
@ -15,7 +21,7 @@ OBJ = $(SRC:.cpp=.o)
# ------ SETTINGS ------
CXXFLAGS = -O3 -fPIC -Isrc/ML-PACE -I$(YAML_CPP_INC)
CXXFLAGS = -O3 -fPIC -Isrc/ML-PACE/ace -Isrc/ML-PACE/ace-evaluator -I$(YAML_CPP_INC) -I$(WIGNER_CPP_INC) -I$(CNPY_CPP_INC) -DEXTRA_C_PROJECTIONS
ARCHIVE = ar
ARCHFLAG = -rc

View File

@ -1,3 +1,3 @@
pace_SYSINC =-I../../lib/pace/src/ML-PACE -I../../lib/pace/src/yaml-cpp/include
pace_SYSINC =-I../../lib/pace/src/ML-PACE/ace -I../../lib/pace/src/ML-PACE/ace-evaluator -I../../lib/pace/src/yaml-cpp/include -I../../lib/pace/src/wigner-cpp/include/wigner -DEXTRA_C_PROJECTIONS
pace_SYSLIB = -L../../lib/pace/ -lpace -L../../lib/pace/src/yaml-cpp/ -lyaml-cpp
pace_SYSPATH =