git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12851 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2014-12-22 22:12:30 +00:00
parent 6429e05b78
commit baccbaeda8
9 changed files with 336 additions and 79 deletions

View File

@ -6,6 +6,8 @@
W. Michael Brown (Intel)
michael.w.brown at intel.com
Anupama Kurpad (Intel)
-----------------------------------------------------------------------------
This package is based on the USER-OMP package and provides LAMMPS styles that:
@ -41,3 +43,10 @@ Intel compilers.
For portability reasons, vectorization directives are currently only enabled
for Intel compilers. Using other compilers may result in significantly
lower performance.
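As an illustration only (not copied from the package sources), the guards referred to
above look roughly like the following; __INTEL_COMPILER is the macro defined by the
Intel compilers, #pragma simd is an Intel-specific vectorization hint, and the loop
variables are placeholders:

  #if defined(__INTEL_COMPILER)
  #pragma simd
  #endif
  for (int i = 0; i < nlocal; i++)   // placeholder loop; directive ignored elsewhere
    f[i] += dtf * v[i];

With a non-Intel compiler the directive is simply not emitted, so the loop still
compiles but may not vectorize as aggressively.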
-----------------------------------------------------------------------------
By default, when running with offload to Intel(R) coprocessors, affinity
for host MPI tasks and OpenMP threads is set automatically within the code.
This currently requires the use of system calls. To disable at build time,
compile with -DINTEL_OFFLOAD_NOAFFINITY.
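A minimal sketch of what that build flag gates, in simplified form. This is not the
package's actual routine (FixIntel::set_host_affinity also affinitizes the offload
runtime's threads); pin_to_core and core_id are illustrative names:

  #ifndef INTEL_OFFLOAD_NOAFFINITY
  #define _GNU_SOURCE
  #include <sched.h>
  // Pin the calling MPI task to a single logical core chosen for it.
  static int pin_to_core(int core_id) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(core_id, &mask);
    // 0 = calling process; returns -1 if the kernel rejects the request,
    // in which case pinning is left to the MPI runtime.
    return sched_setaffinity(0, sizeof(cpu_set_t), &mask);
  }
  #endif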

View File

@ -13,6 +13,7 @@
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Anupama Kurpad (Intel) - Host Affinitization
------------------------------------------------------------------------- */
#include "comm.h"
@ -59,7 +60,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
_off_overflow_flag[LMP_OVERFLOW] = 0;
_offload_affinity_balanced = 0;
_offload_threads = 1;
_offload_threads = 0;
_offload_tpc = 4;
#ifdef _LMP_INTEL_OFFLOAD
@ -71,25 +72,12 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
_off_ev_array_s = 0;
_off_ev_array_d = 0;
_balance_fixed = 0.0;
_cop = 0;
int max_offload_threads, offload_cores;
#pragma offload target(mic:_cop) mandatory \
out(max_offload_threads,offload_cores)
{
offload_cores = omp_get_num_procs();
omp_set_num_threads(offload_cores);
max_offload_threads = omp_get_max_threads();
}
_max_offload_threads = max_offload_threads;
_offload_cores = offload_cores;
_offload_threads = offload_cores;
#endif
// optional keywords
int nomp = 0;
int nomp = 0, no_affinity = 0;
_allow_separate_buffers = 1;
_offload_ghost = -1;
@ -127,6 +115,9 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
_offload_threads = atoi(arg[iarg+1]);
iarg += 2;
} else if (strcmp(arg[iarg],"no_affinity") == 0) {
no_affinity = 1;
iarg++;
}
// undocumented options
@ -143,10 +134,34 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
// error check
if (_offload_balance > 1.0 || _offload_threads <= 0 ||
_offload_tpc <= 0 || _offload_tpc > 4)
if (_offload_balance > 1.0 || _offload_threads < 0 ||
_offload_tpc <= 0 || _offload_tpc > 4 || nomp < 0)
error->all(FLERR,"Illegal package intel command");
#ifdef _LMP_INTEL_OFFLOAD
_ncops = ncops;
if (_offload_balance != 0.0) {
_real_space_comm = MPI_COMM_WORLD;
if (no_affinity == 0)
if (set_host_affinity(nomp) != 0)
error->all(FLERR,"Could not set host affinity for offload tasks");
}
int max_offload_threads = 0, offload_cores = 0;
if (_offload_balance != 0.0) {
#pragma offload target(mic:_cop) mandatory \
out(max_offload_threads,offload_cores)
{
offload_cores = omp_get_num_procs();
omp_set_num_threads(offload_cores);
max_offload_threads = omp_get_max_threads();
}
_max_offload_threads = max_offload_threads;
_offload_cores = offload_cores;
if (_offload_threads == 0) _offload_threads = offload_cores;
}
#endif
// set OpenMP threads
// nomp is user setting, default = 0
@ -154,13 +169,17 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
if (nomp != 0) {
omp_set_num_threads(nomp);
comm->nthreads = nomp;
} else {
int nthreads;
#pragma omp parallel default(none) shared(nthreads)
nthreads = omp_get_num_threads();
comm->nthreads = nthreads;
}
#endif
// set offload params
#ifdef _LMP_INTEL_OFFLOAD
_ncops = ncops;
if (_offload_balance < 0.0) {
_balance_neighbor = 0.9;
_balance_pair = 0.9;
@ -173,6 +192,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
zero_timers();
_setup_time_cleared = false;
_timers_allocated = false;
#else
_offload_balance = 0.0;
#endif
@ -197,7 +217,8 @@ FixIntel::~FixIntel()
double *time1 = off_watch_pair();
double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag();
if (time1 != NULL && time2 != NULL && overflow != NULL) {
if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) {
#pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
}
@ -239,7 +260,8 @@ void FixIntel::init()
double *time1 = off_watch_pair();
double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag();
if (time1 != NULL && time2 != NULL && overflow != NULL) {
if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) {
#pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
in(overflow:length(5) alloc_if(1) free_if(0))
@ -247,25 +269,6 @@ void FixIntel::init()
_timers_allocated = true;
}
char kmode[80];
if (_precision_mode == PREC_MODE_SINGLE)
strcpy(kmode, "single");
else if (_precision_mode == PREC_MODE_MIXED)
strcpy(kmode, "mixed");
else
strcpy(kmode, "double");
// print summary of settings
if (comm->me == 0) {
if (screen) {
#ifdef _LMP_INTEL_OFFLOAD
if (_offload_balance != 0.0) {
fprintf(screen,"using offload with %d threads per core, ",_offload_tpc);
fprintf(screen,"%d threads per task\n",_offload_threads);
}
#endif
}
}
if (update->whichflag == 2 && _offload_balance != 0.0) {
if (_offload_balance == 1.0 && _offload_noghost == 0)
_sync_at_pair = 1;
@ -279,12 +282,6 @@ void FixIntel::init()
}
#endif
if (neighbor->style != BIN)
error->all(FLERR,
"Currently, neighbor style BIN must be used with Intel package.");
if (neighbor->exclude_setting() != 0)
error->all(FLERR,
"Currently, cannot use neigh_modify exclude with Intel package.");
int nstyles = 0;
if (force->pair_match("hybrid", 1) != NULL) {
PairHybrid *hybrid = (PairHybrid *) force->pair;
@ -315,6 +312,71 @@ void FixIntel::init()
_double_buffers->zero_ev();
}
/* ---------------------------------------------------------------------- */
void FixIntel::setup(int vflag)
{
if (neighbor->style != BIN)
error->all(FLERR,
"Currently, neighbor style BIN must be used with Intel package.");
if (neighbor->exclude_setting() != 0)
error->all(FLERR,
"Currently, cannot use neigh_modify exclude with Intel package.");
}
/* ---------------------------------------------------------------------- */
void FixIntel::pair_init_check()
{
if (_offload_balance != 0.0 && comm->me == 0) {
#ifndef __INTEL_COMPILER_BUILD_DATE
error->warning(FLERR, "Unknown Intel Compiler Version\n");
#else
if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
__INTEL_COMPILER_BUILD_DATE < 20141023)
error->warning(FLERR, "Unsupported Intel Compiler.");
#endif
#if !defined(__INTEL_COMPILER)
error->warning(FLERR, "Unsupported Intel Compiler.");
#endif
}
// Clear buffers used for pair style
char kmode[80];
if (_precision_mode == PREC_MODE_SINGLE) {
strcpy(kmode, "single");
get_single_buffers()->free_all_nbor_buffers();
} else if (_precision_mode == PREC_MODE_MIXED) {
strcpy(kmode, "mixed");
get_mixed_buffers()->free_all_nbor_buffers();
} else {
strcpy(kmode, "double");
get_double_buffers()->free_all_nbor_buffers();
}
#ifdef _LMP_INTEL_OFFLOAD
set_offload_affinity();
#endif
if (comm->me == 0) {
if (screen) {
fprintf(screen,
"----------------------------------------------------------\n");
if (_offload_balance != 0.0) {
fprintf(screen,"Using Intel Coprocessor with %d threads per core, ",
_offload_tpc);
fprintf(screen,"%d threads per task\n",_offload_threads);
} else {
fprintf(screen,"Using Intel Package without Coprocessor.\n");
}
fprintf(screen,"Precision: %s\n",kmode);
fprintf(screen,
"----------------------------------------------------------\n");
}
}
}
/* ---------------------------------------------------------------------- */
void FixIntel::check_neighbor_intel()
@ -440,6 +502,14 @@ void FixIntel::output_timing_data() {
fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
fprintf(_tscreen, "\n");
#endif
double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
timers[TIME_OFFLOAD_WAIT];
double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
timers[TIME_OFFLOAD_PAIR];
double tt = MAX(ht,ct);
if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
error->warning(FLERR,
"Leaving a core free can improve performance for offload");
}
fprintf(_tscreen, "------------------------------------------------\n");
}
@ -549,4 +619,157 @@ void FixIntel::set_offload_affinity()
_double_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
}
/* ---------------------------------------------------------------------- */
int FixIntel::set_host_affinity(const int nomp)
{
#ifndef INTEL_OFFLOAD_NOAFFINITY
_separate_coi = 1;
int rank = comm->me;
int node_rank;
int ppn = get_ppn(node_rank);
int cop = node_rank / (ppn / _ncops);
// Get a sorted list of logical cores
int proc_list[INTEL_MAX_HOST_CORE_COUNT];
int ncores;
FILE *p;
char cmd[512];
char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
sprintf(cmd, "lscpu -p=cpu,core,socket | grep -v '#' |"
"sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
p = popen(cmd, "r");
if (p == NULL) return -1;
ncores = 0;
while(fgets(readbuf, 512, p)) {
proc_list[ncores] = atoi(readbuf);
ncores++;
}
pclose(p);
// Sanity checks for core list
if (ncores < 2) return -1;
int nzero = 0;
for (int i = 0; i < ncores; i++) {
if (proc_list[i] == 0) nzero++;
if (proc_list[i] < 0 || proc_list[i] >= ncores) return -1;
}
if (nzero > 1) return -1;
// Determine the OpenMP/MPI configuration
char *estring;
int nthreads = nomp;
if (nthreads == 0) {
estring = getenv("OMP_NUM_THREADS");
if (estring != NULL) {
nthreads = atoi(estring);
if (nthreads < 2) nthreads = 1;
} else
nthreads = 1;
}
// Determine how many logical cores for COI and MPI tasks
int coi_cores = 0, mpi_cores;
int subscription = nthreads * ppn;
if (subscription > ncores) {
if (rank == 0)
error->warning(FLERR,
"More MPI tasks/OpenMP threads than available cores");
return 0;
}
if (subscription == ncores)
_separate_coi = 0;
if (subscription > ncores / 2) {
coi_cores = ncores - subscription;
if (coi_cores > INTEL_MAX_COI_CORES) coi_cores = INTEL_MAX_COI_CORES;
}
mpi_cores = (ncores - coi_cores) / ppn;
// Get ids of all LWPs that COI spawned and affinitize
int lwp = 0, plwp = 0, nlwp = 0, mlwp = 0, fail = 0;
cpu_set_t cpuset;
pid_t pid = getpid();
if (coi_cores) {
sprintf(cmd, "ps -Lp %d -o lwp | awk ' (NR > 2) {print}'", pid);
p = popen(cmd, "r");
if (p == NULL) return -1;
while(fgets(readbuf, 512, p)) {
lwp = atoi(readbuf);
int first = coi_cores + node_rank * mpi_cores;
CPU_ZERO(&cpuset);
for (int i = first; i < first + mpi_cores; i++)
CPU_SET(proc_list[i], &cpuset);
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
fail = 1;
break;
}
plwp++;
}
pclose(p);
// Do async offload to create COI threads
int sig1, sig2;
float *buf1;
int pragma_size = 1024;
buf1 = (float*) malloc(sizeof(float)*pragma_size);
#pragma offload target (mic:0) \
in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \
signal(&sig1)
{}
#pragma offload_wait target(mic:0) wait(&sig1)
#pragma offload target (mic:0) \
out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \
signal(&sig2)
{}
#pragma offload_wait target(mic:0) wait(&sig2)
free(buf1);
p = popen(cmd, "r");
if (p == NULL) return -1;
while(fgets(readbuf, 512, p)) {
lwp = atoi(readbuf);
nlwp++;
if (nlwp <= plwp) continue;
CPU_ZERO(&cpuset);
for(int i=0; i<coi_cores; i++)
CPU_SET(proc_list[i], &cpuset);
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
fail = 1;
break;
}
}
pclose(p);
nlwp -= plwp;
// Get stats on the number of LWPs per process
MPI_Reduce(&nlwp, &mlwp, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
}
if (screen && rank == 0) {
if (coi_cores)
fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n",
mlwp, coi_cores);
fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores);
}
if (fail) return -1;
// Affinitize MPI Ranks
CPU_ZERO(&cpuset);
int first = coi_cores + node_rank * mpi_cores;
for (int i = first; i < first+mpi_cores; i++)
CPU_SET(proc_list[i], &cpuset);
if (sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset))
return -1;
#endif
return 0;
}
#endif

View File

@ -38,6 +38,8 @@ class FixIntel : public Fix {
virtual ~FixIntel();
virtual int setmask();
virtual void init();
virtual void setup(int);
void pair_init_check();
// Get all forces, calculation results from coprocessor
void sync_coprocessor();
@ -135,7 +137,7 @@ class FixIntel : public Fix {
int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
int _host_min_local, _host_min_ghost, _host_nall;
int _host_used_local, _host_used_ghost;
int _separate_buffers, _offload_noghost, _sync_at_pair;
int _separate_buffers, _offload_noghost, _sync_at_pair, _separate_coi;
bool _setup_time_cleared, _timers_allocated;
void output_timing_data();
FILE *_tscreen;
@ -149,6 +151,7 @@ class FixIntel : public Fix {
int _full_host_list, _cop, _ncops;
int get_ppn(int &);
int set_host_affinity(const int);
#endif
void check_neighbor_intel();
@ -540,6 +543,14 @@ E: The 'package intel' command is required for /intel styles
Self-explanatory.
W: Could not set host affinity for offload tasks
When using offload to a coprocessor, the application will try to set affinity
for host MPI tasks and OpenMP threads, and it generates this warning if it is
unable to do so. In that case, you might wish to set affinity outside of the
application; note that performance might suffer if hyperthreading is disabled
on the CPU.
E: Neighbor list overflow, boost neigh_modify one
Increase the value for neigh_modify one to allow for larger allocations for
@ -578,6 +589,17 @@ E: Currently, cannot use neigh_modify exclude with Intel package.
This is a current restriction of the Intel package.
W: Unknown Intel Compiler Version
The compiler version used to build LAMMPS has not been tested with
offload to a coprocessor.
W: Unsupported Intel Compiler
The compiler version used to build LAMMPS is not supported when using
offload to a coprocessor. There could be performance or correctness
issues. Please use 14.0.1.106 or 15.1.133 or later.
E: Currently, cannot use more than one intel style with hybrid.
Currently, hybrid pair styles can only use the intel suffix for one of the
@ -589,9 +611,21 @@ The hybrid pair style configuration is not yet supported by the Intel
package. Support is limited to hybrid/overlay or a hybrid style that does
not require a skip list.
W: Leaving a core/node free can improve performance for offload
When each CPU is fully subscribed with MPI tasks and OpenMP threads,
context switching with threads used for offload can sometimes decrease
performance. If you see this warning, try using fewer MPI tasks/OpenMP threads
per node to leave a physical CPU core free on each node.
E: MPI tasks per node must be multiple of offload_cards
For offload to multiple coprocessors on a single node, the Intel package
requires that each coprocessor is used by the same number of MPI tasks.
W: More MPI tasks/OpenMP threads than available cores
Using more MPI tasks/OpenMP threads than available cores will typically
decrease performance.
*/

View File

@ -385,7 +385,7 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD
flt_t * cutneighsqo = _cutneighsq[0];
if (cutneighsqo != 0) {
if (_off_threads > 0 && cutneighsqo != 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(cutneighsqo:alloc_if(0) free_if(1))
}
@ -396,7 +396,7 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
#ifdef _LMP_INTEL_OFFLOAD
flt_t * cutneighsqo = _cutneighsq[0];
if (cutneighsqo != NULL) {
if (_off_threads > 0 && cutneighsqo != NULL) {
#pragma offload_transfer target(mic:_cop) \
nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
}

View File

@ -61,6 +61,8 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
#define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
#define INTEL_LB_MEAN_WEIGHT 0.1
#define INTEL_BIGP 1e15
#define INTEL_MAX_HOST_CORE_COUNT 512
#define INTEL_MAX_COI_CORES 2
#define IP_PRE_get_stride(stride, n, datasize, torque) \
{ \

View File

@ -893,20 +893,17 @@ void PairGayBerneIntel::init_style()
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
fix->set_offload_affinity();
if (force->newton_pair) fix->set_offload_noghost(1);
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fix->get_mixed_buffers()->free_all_nbor_buffers();
fix->pair_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->free_all_nbor_buffers();
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
fix->get_single_buffers()->free_all_nbor_buffers();
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
}
/* ---------------------------------------------------------------------- */

View File

@ -489,19 +489,16 @@ void PairLJCharmmCoulLongIntel::init_style()
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
fix->set_offload_affinity();
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fix->get_mixed_buffers()->free_all_nbor_buffers();
fix->pair_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->free_all_nbor_buffers();
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
fix->get_single_buffers()->free_all_nbor_buffers();
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
}
template <class flt_t, class acc_t>

View File

@ -456,19 +456,16 @@ void PairLJCutCoulLongIntel::init_style()
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
fix->set_offload_affinity();
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fix->get_mixed_buffers()->free_all_nbor_buffers();
fix->pair_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->free_all_nbor_buffers();
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
fix->get_single_buffers()->free_all_nbor_buffers();
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
}
template <class flt_t, class acc_t>

View File

@ -336,16 +336,14 @@ void PairLJCutIntel::init_style()
error->all(FLERR,
"Offload for lj/cut/intel is not yet available. Set balance to 0.");
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fix->get_mixed_buffers()->free_all_nbor_buffers();
fix->pair_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->free_all_nbor_buffers();
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
fix->get_single_buffers()->free_all_nbor_buffers();
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
}
/* ---------------------------------------------------------------------- */