lammps/src/INTEL/fix_intel.cpp

// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
                        Anupama Kurpad (Intel) - Host Affinitization
------------------------------------------------------------------------- */

#include "fix_intel.h"

#include "comm.h"
#include "domain.h"
#include "error.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_request.h"
#include "pair.h"
#include "pair_hybrid.h"
#include "update.h"

#include <cstring>

#ifdef _LMP_INTEL_OFFLOAD
#ifndef INTEL_OFFLOAD_NOAFFINITY
#include <unistd.h>
#endif
#endif

#include "suffix.h"

using namespace LAMMPS_NS;
using namespace FixConst;

#ifdef __INTEL_OFFLOAD
#ifndef _LMP_INTEL_OFFLOAD
#warning "Not building INTEL package with Xeon Phi offload support."
#endif
#endif

/* ---------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Parse the arguments of the "package intel" command.

   arg[3] is the number of coprocessors (ncops); values below 1 disable
   offload. The remaining arguments are optional keyword/value pairs.
   All integer-valued keywords are parsed with utils::inumeric() so that
   malformed input is reported as an error instead of silently becoming 0.
   Any invalid keyword or value raises "Illegal package intel command".
   On success, the IntelBuffers object matching the selected precision
   mode is allocated.
------------------------------------------------------------------------- */

FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
{
  if (narg < 4) error->all(FLERR,"Illegal package intel command");

  int ncops = utils::inumeric(FLERR,arg[3],false,lmp);

  // defaults for state shared with the intel pair/bond/kspace styles

  _nbor_pack_width = 1;
  _three_body_neighbor = 0;
  _pair_intel_count = 0;
  _hybrid_nonpair = 0;
  _print_pkg_info = 1;
  _nthreads = comm->nthreads;
  _torque_flag = 0;

  _precision_mode = PREC_MODE_MIXED;
  _offload_balance = -1.0;
  _overflow_flag[LMP_OVERFLOW] = 0;
  _off_overflow_flag[LMP_OVERFLOW] = 0;

  _offload_affinity_balanced = 0;
  _offload_threads = 0;
  _offload_tpc = 4;

  _force_array_s = nullptr;
  _force_array_m = nullptr;
  _force_array_d = nullptr;
  _ev_array_s = nullptr;
  _ev_array_d = nullptr;

  #ifdef _LMP_INTEL_OFFLOAD
  if (ncops < 0) error->all(FLERR,"Illegal package intel command");
  _offload_affinity_set = 0;
  _off_force_array_s = 0;
  _off_force_array_m = 0;
  _off_force_array_d = 0;
  _off_ev_array_s = 0;
  _off_ev_array_d = 0;
  _balance_fixed = 0.0;
  _cop = 0;
  #endif

  // optional keywords

  int nomp = 0, no_affinity = 0;
  _allow_separate_buffers = 1;
  _offload_ghost = -1;
  _lrt = 0;
  _p3m_table = 1;

  int iarg = 4;
  while (iarg < narg) {
    if (strcmp(arg[iarg],"omp") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      nomp = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
    } else if (strcmp(arg[iarg],"mode") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      if (strcmp(arg[iarg+1],"single") == 0)
        _precision_mode = PREC_MODE_SINGLE;
      else if (strcmp(arg[iarg+1],"mixed") == 0)
        _precision_mode = PREC_MODE_MIXED;
      else if (strcmp(arg[iarg+1],"double") == 0)
        _precision_mode = PREC_MODE_DOUBLE;
      else error->all(FLERR,"Illegal package intel command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"balance") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      _offload_balance = utils::numeric(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
    } else if (strcmp(arg[iarg], "ghost") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      _offload_ghost = utils::logical(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
    } else if (strcmp(arg[iarg], "tpc") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      _offload_tpc = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
    } else if (strcmp(arg[iarg],"tptask") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      _offload_threads = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
    } else if (strcmp(arg[iarg],"no_affinity") == 0) {
      no_affinity = 1;
      iarg++;
    } else if (strcmp(arg[iarg], "lrt") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      _lrt = utils::logical(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
    } else if (strcmp(arg[iarg], "pppm_table") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      _p3m_table = utils::logical(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;

    // undocumented options

    } else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) {
      _offload_affinity_balanced = 1;
      iarg++;
    } else if (strcmp(arg[iarg],"buffers") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      _allow_separate_buffers = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
    } else error->all(FLERR,"Illegal package intel command");
  }

  // if ncops is zero, just run on the cpu
  if (ncops < 1) {
    ncops = -1;
    _offload_balance = 0.0;
  }

  // if using LRT mode, create the integrate style
  if (_lrt) {
    char *cmd[1];
    cmd[0] = (char *) "verlet/lrt/intel";
    update->create_integrate(1,cmd,0);
  }

  // error check

  if (_offload_balance > 1.0 || _offload_threads < 0 ||
      _offload_tpc <= 0 || _offload_tpc > 4 || nomp < 0)
    error->all(FLERR,"Illegal package intel command");

  #ifdef _LMP_INTEL_OFFLOAD
  _ncops = ncops;
  if (_offload_balance != 0.0) {
    _real_space_comm = MPI_COMM_WORLD;
    if (no_affinity == 0)
      if (set_host_affinity(nomp) != 0)
        error->all(FLERR,"Could not set host affinity for offload tasks");
  }

  // query the coprocessor for its core and thread counts

  int max_offload_threads = 0, offload_cores = 0;
  if (_offload_balance != 0.0) {
    #pragma offload target(mic:_cop) mandatory \
      out(max_offload_threads,offload_cores)
    {
      offload_cores = omp_get_num_procs();
      omp_set_num_threads(offload_cores);
      max_offload_threads = omp_get_max_threads();
      #ifdef __AVX512F__
      if ( (offload_cores / 4) % 2 == 1) {
        offload_cores += 4;
        max_offload_threads += 4;
      }
      #endif
    }
    _max_offload_threads = max_offload_threads;
    _offload_cores = offload_cores;
    if (_offload_threads == 0) _offload_threads = offload_cores;
    if (_offload_cores > 244 && _offload_tpc > 2)
      _offload_tpc = 2;
  }
  #endif

  // set OpenMP threads
  // nomp is user setting, default = 0

  #if defined(_OPENMP)
  #if defined(__INTEL_COMPILER)
  kmp_set_blocktime(0);
  #endif
  if (nomp != 0) {
    omp_set_num_threads(nomp);
    _nthreads = comm->nthreads = nomp;
  }
  #endif

  // set offload params
  // a negative balance starts both split fractions at 0.9

  #ifdef _LMP_INTEL_OFFLOAD
  if (_offload_balance < 0.0) {
    _balance_neighbor = 0.9;
    _balance_pair = 0.9;
  } else {
    _balance_neighbor = _offload_balance;
    _balance_pair = _offload_balance;
  }

  _tscreen = screen;
  zero_timers();
  _setup_time_cleared = false;
  _timers_allocated = false;

  #else
  _offload_balance = 0.0;
  #endif

  // set precision: only the buffer object for the chosen mode is allocated

  if (_precision_mode == PREC_MODE_SINGLE)
    _single_buffers = new IntelBuffers<float,float>(lmp);
  else if (_precision_mode == PREC_MODE_MIXED)
    _mixed_buffers = new IntelBuffers<float,double>(lmp);
  else
    _double_buffers = new IntelBuffers<double,double>(lmp);
}

/* ---------------------------------------------------------------------- */

// Release the buffer object that the constructor allocated for the active
// precision mode. In offload builds, first print the timing data and free
// the coprocessor-side timer/overflow storage.
FixIntel::~FixIntel()
{
  #ifdef _LMP_INTEL_OFFLOAD
  output_timing_data();
  if (_timers_allocated) {
    double *time1 = off_watch_pair();
    double *time2 = off_watch_neighbor();
    int *overflow = get_off_overflow_flag();
    const bool offloading = (_offload_balance != 0.0);
    if (offloading) {
      #pragma offload_transfer target(mic:_cop) \
        nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
    }
  }
  #endif

  // exactly one of the three buffer pointers was set in the constructor
  const auto mode = _precision_mode;
  if (mode == PREC_MODE_SINGLE) {
    delete _single_buffers;
  } else if (mode == PREC_MODE_MIXED) {
    delete _mixed_buffers;
  } else {
    delete _double_buffers;
  }
}

/* ---------------------------------------------------------------------- */

// Report the integrator hooks this fix implements: pre_reverse and
// post_force (for both dynamics and minimization) plus post_run.
int FixIntel::setmask()
{
  return PRE_REVERSE | MIN_PRE_REVERSE | POST_FORCE | MIN_POST_FORCE |
         POST_RUN;
}

/* ---------------------------------------------------------------------- */

// Per-run initialization: choose the coprocessor sync mode (offload builds),
// detect torque and hybrid pair-style usage, reset the per-run counters that
// the intel styles increment, and prepare the buffers of the active
// precision mode.
void FixIntel::init()
{
  #ifdef _LMP_INTEL_OFFLOAD
  output_timing_data();
  // _sync_mode: 0 = no offload, 1 = sync in _sync_main_arrays(),
  // 2 = sync in post_force()
  _sync_mode = 0;
  if (offload_balance() != 0.0) {
    if (offload_noghost() || force->newton_pair == 0)
      _sync_mode = 2;
    else
      _sync_mode = 1;
    // whichflag == 2 always forces mode 1
    if (update->whichflag == 2) _sync_mode = 1;
  }
  #endif

  _torque_flag = 0;
  if (force->pair_match("gayberne/intel$", 0)) _torque_flag = 1;

  // capture the number of intel pair styles counted by pair_init_check()
  // before the counter is reset further below
  const int nstyles = _pair_intel_count;
  if (force->pair_match("^hybrid", 0)) {
    _pair_hybrid_flag = 1;

    // Check if need to handle torque
    auto hybrid = dynamic_cast<PairHybrid *>(force->pair);
    if (hybrid) {
      for (int i = 0; i < hybrid->nstyles; i++)
        if (utils::strmatch(hybrid->keywords[i],"/intel$") &&
            utils::strmatch(hybrid->keywords[i],"gayberne"))
          _torque_flag = 1;
    }
    if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute)
      error->all(FLERR,"INTEL package requires fdotr virial with newton on.");
  } else
    _pair_hybrid_flag = 0;

  if (_torque_flag && nstyles > 1)
    error->all(FLERR,"gayberne/intel style cannot yet be used with other "
               "intel pair styles.");

  // flag value 2 marks a hybrid style with more than one intel sub-style
  if (nstyles > 1 && _pair_hybrid_flag) _pair_hybrid_flag = 2;
  else if (force->newton_pair == 0) _pair_hybrid_flag = 0;

  _pair_hybrid_zero = 0;
  _zero_master = 0;

  // _hybrid_nonpair is set by bond_init_check()/kspace_init_check()
  if (_pair_hybrid_flag && _hybrid_nonpair)
    _pair_hybrid_zero = 1;
  _hybrid_nonpair = 0;

  _pair_intel_count = 0;

  #ifdef _LMP_INTEL_OFFLOAD
  if (offload_balance() != 0.0) {
    _pair_hybrid_zero = 0;
    if (force->newton_pair == 0) _pair_hybrid_flag = 0;
    if (nstyles > 1)
      error->all(FLERR,"Currently, cannot offload more than one intel style with hybrid.");
  }
  #endif

  check_neighbor_intel();

  // same buffer preparation for whichever precision mode is active
  int off_mode = 0;
  if (_offload_balance != 0.0) off_mode = 1;
  if (_precision_mode == PREC_MODE_SINGLE) {
    _single_buffers->set_torque_flag(_torque_flag);
    _single_buffers->zero_ev();
    _single_buffers->grow_ncache(off_mode, comm->nthreads);
    _single_buffers->free_list_ptrs();
  } else if (_precision_mode == PREC_MODE_MIXED) {
    _mixed_buffers->set_torque_flag(_torque_flag);
    _mixed_buffers->zero_ev();
    _mixed_buffers->grow_ncache(off_mode, comm->nthreads);
    _mixed_buffers->free_list_ptrs();
  } else {
    _double_buffers->set_torque_flag(_torque_flag);
    _double_buffers->zero_ev();
    _double_buffers->grow_ncache(off_mode, comm->nthreads);
    _double_buffers->free_list_ptrs();
  }

  _need_reduce = 0;
}

/* ---------------------------------------------------------------------- */

// Validate settings the INTEL package cannot handle before a run starts:
// the neighbor style must be BIN and vflag must not exceed 3. Offload builds
// additionally reject neigh_modify exclude and then call post_force().
void FixIntel::setup(int vflag)
{
  const bool binned = (neighbor->style == Neighbor::BIN);
  if (!binned) {
    error->all(FLERR,"Currently, neighbor style BIN must be used with INTEL package.");
  }

  const bool peratom_virial = (vflag > 3);
  if (peratom_virial) {
    error->all(FLERR,"Cannot currently get per-atom virials with INTEL package.");
  }

  #ifdef _LMP_INTEL_OFFLOAD
  if (neighbor->exclude_setting() != 0) {
    error->all(FLERR,"Currently, cannot use neigh_modify exclude with INTEL package offload.");
  }
  post_force(vflag);
  #endif
}

/* ---------------------------------------------------------------------- */

// Setup-stage counterpart of pre_reverse(): forwards both flags unchanged.
void FixIntel::setup_pre_reverse(int eflag, int vflag)
{
  pre_reverse(eflag,vflag);
}

/* ---------------------------------------------------------------------- */

// Return true if the active pair style is a PairHybrid containing at least
// one sub-style whose suffix_flag has the Suffix::INTEL bit set.
// Returns false when force->pair is not a PairHybrid (the cast yields
// nullptr) instead of dereferencing a null pointer.
bool FixIntel::pair_hybrid_check()
{
  auto ph = dynamic_cast<PairHybrid *>(force->pair);
  if (ph == nullptr) return false;

  const int nstyles = ph->nstyles;
  for (int i = 0; i < nstyles; ++i)
    if (ph->styles[i]->suffix_flag & Suffix::INTEL) return true;

  return false;
}

/* ---------------------------------------------------------------------- */

void FixIntel::pair_init_check(const bool cdmessage)
{
  #ifdef INTEL_VMASK
  if (atom->sortfreq) atom->sortfreq = 1;
  #endif

  _nbor_pack_width = 1;

  #ifdef _LMP_INTEL_OFFLOAD
  if (_offload_balance != 0.0) atom->sortfreq = 1;

  _offload_noghost = 0;
  if (force->newton_pair && _offload_ghost == 0)
    _offload_noghost = 1;

  set_offload_affinity();

  if (!_timers_allocated) {
    double *time1 = off_watch_pair();
    double *time2 = off_watch_neighbor();
    int *overflow = get_off_overflow_flag();
    if (_offload_balance != 0.0) {
      #pragma offload_transfer target(mic:_cop)  \
        nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
        in(overflow:length(5) alloc_if(1) free_if(0))
    }
    _timers_allocated = true;
  }
  #endif
  _nthreads = comm->nthreads;

  if (_offload_balance != 0.0 && comm->me == 0) {
    #ifndef __INTEL_COMPILER_BUILD_DATE
    error->warning(FLERR,"Unknown Intel Compiler Version\n");
    #else
    if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
        __INTEL_COMPILER_BUILD_DATE < 20141023)
      error->warning(FLERR,"Unsupported Intel Compiler.");
    #endif
    #if !defined(__INTEL_COMPILER)
    error->warning(FLERR,"Unsupported Intel Compiler.");
    #endif
  }

  #ifndef LMP_INTEL_NBOR_COMPAT
  if (force->pair->manybody_flag && (atom->molecular != Atom::ATOMIC)) {
    int flag = 0;
    if (atom->nbonds > 0 && force->special_lj[1] == 0.0 &&
        force->special_coul[1] == 0.0) flag = 1;
    if (atom->nangles > 0 && force->special_lj[2] == 0.0 &&
        force->special_coul[2] == 0.0) flag = 1;
    if (atom->ndihedrals > 0 && force->special_lj[3] == 0.0 &&
        force->special_coul[3] == 0.0) flag = 1;
    if (flag)
      error->all(FLERR,"Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond"
                 " exclusions with Intel");
  }
  #endif

  int need_tag = 0;
  if (atom->molecular != Atom::ATOMIC || three_body_neighbor()) need_tag = 1;
  if (domain->triclinic && force->newton_pair) need_tag = 1;

  // Clear buffers used for pair style
  char kmode[80];
  if (_precision_mode == PREC_MODE_SINGLE) {
    strcpy(kmode, "single");
    get_single_buffers()->need_tag(need_tag);
  } else if (_precision_mode == PREC_MODE_MIXED) {
    strcpy(kmode, "mixed");
    get_mixed_buffers()->need_tag(need_tag);
  } else {
    strcpy(kmode, "double");
    get_double_buffers()->need_tag(need_tag);
  }

  _pair_intel_count++;

  #ifdef _LMP_INTEL_OFFLOAD
  set_offload_affinity();
  #endif

  if (_print_pkg_info && comm->me == 0) {
    utils::logmesg(lmp, "----------------------------------------------------------\n");
    if (_offload_balance != 0.0) {
      utils::logmesg(lmp,"Using Intel Coprocessor with {} threads per core, "
                     "{} threads per task\n",_offload_tpc, _offload_threads);
    } else {
      utils::logmesg(lmp,"Using INTEL Package without Coprocessor.\n");
    }
    utils::logmesg(lmp,"Compiler: {}\n",platform::compiler_info());
    #ifdef LMP_SIMD_COMPILER
    utils::logmesg(lmp,"SIMD compiler directives: Enabled\n");
    #else
    utils::logmesg(lmp,"SIMD compiler directives: Disabled\n");
    #endif
    utils::logmesg(lmp,"Precision: {}\n",kmode);
    if (cdmessage) {
      #ifdef LMP_USE_AVXCD
      utils::logmesg(lmp,"AVX512 CD Optimizations: Enabled\n");
      #else
      utils::logmesg(lmp,"AVX512 CD Optimizations: Disabled\n");
      #endif
    }
    utils::logmesg(lmp, "----------------------------------------------------------\n");
  }
  _print_pkg_info = 0;
}

/* ---------------------------------------------------------------------- */

// Called by intel bond/angle/dihedral/improper styles. When offloading a
// molecular system, newton pair and newton bond must match. An intel pair
// style must also be present, either directly or inside a hybrid style;
// finding a hybrid pair style sets _hybrid_nonpair.
void FixIntel::bond_init_check()
{
  const bool offloading = (_offload_balance != 0.0);
  const bool molecular = (atom->molecular != Atom::ATOMIC);
  if (offloading && molecular && (force->newton_pair != force->newton_bond)) {
    error->all(FLERR,"INTEL package requires same setting for newton bond and non-bond.");
  }

  bool have_intel_pair = false;
  if (force->pair_match("/intel$", 0) != nullptr) {
    have_intel_pair = true;
  } else if (force->pair_match("^hybrid", 0) != nullptr) {
    _hybrid_nonpair = 1;
    have_intel_pair = pair_hybrid_check();
  }

  if (!have_intel_pair) {
    error->all(FLERR,"Intel styles for bond/angle/dihedral/improper require intel pair style.");
  }
}

/* ---------------------------------------------------------------------- */

// Called by intel kspace styles. Requires an intel pair style, either
// directly or as a sub-style of a hybrid pair style; finding a hybrid pair
// style sets _hybrid_nonpair.
void FixIntel::kspace_init_check()
{
  bool have_intel_pair = false;
  if (force->pair_match("/intel$", 0) != nullptr) {
    have_intel_pair = true;
  } else if (force->pair_match("^hybrid", 0) != nullptr) {
    _hybrid_nonpair = 1;
    have_intel_pair = pair_hybrid_check();
  }

  if (!have_intel_pair) {
    error->all(FLERR,"Intel styles for kspace require intel pair style.");
  }
}

/* ---------------------------------------------------------------------- */

// Offload builds only: scan the neighbor requests. While offloading, any
// request without the intel flag switches on the full host list and turns
// off the no-ghost setting, and any skip request is an error.
void FixIntel::check_neighbor_intel()
{
  #ifdef _LMP_INTEL_OFFLOAD
  _full_host_list = 0;

  // both checks below apply only while offloading
  if (_offload_balance == 0.0) return;

  const int nreq = neighbor->nrequest;
  for (int ireq = 0; ireq < nreq; ++ireq) {
    const auto &req = neighbor->requests[ireq];
    if (req->intel == 0) {
      _full_host_list = 1;
      _offload_noghost = 0;
    }
    if (req->skip) {
      error->all(FLERR,"Cannot yet use hybrid styles with Intel offload.");
    }
  }
  #endif
}

/* ---------------------------------------------------------------------- */

// Add the pending intel force/energy results to the native arrays. The
// first non-null force array (mixed, then double, then single) is reduced
// across threads if needed and added via add_results(). The force pointer
// is cleared when prereverse is non-zero or _pair_hybrid_zero is 0;
// otherwise only the energy/virial pointer is cleared.
void FixIntel::_sync_main_arrays(const int prereverse)
{
  if (prereverse == 0) _zero_master = 1;

  const bool release_forces = (_pair_hybrid_zero == 0) || (prereverse != 0);

  if (_force_array_m != nullptr) {
    // mixed precision
    if (_need_reduce) {
      reduce_results(&_force_array_m[0].x);
      _need_reduce = 0;
    }
    add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0);
    if (release_forces) {
      _force_array_m = nullptr;
    } else {
      _ev_array_d = nullptr;
    }
  } else if (_force_array_d != nullptr) {
    // double precision
    if (_need_reduce) {
      reduce_results(&_force_array_d[0].x);
      _need_reduce = 0;
    }
    add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0);
    if (release_forces) {
      _force_array_d = nullptr;
    } else {
      _ev_array_d = nullptr;
    }
  } else if (_force_array_s != nullptr) {
    // single precision
    if (_need_reduce) {
      reduce_results(&_force_array_s[0].x);
      _need_reduce = 0;
    }
    add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0);
    if (release_forces) {
      _force_array_s = nullptr;
    } else {
      _ev_array_s = nullptr;
    }
  }

  #ifdef _LMP_INTEL_OFFLOAD
  if (_sync_mode == 1) sync_coprocessor();
  #endif
}

/* ---------------------------------------------------------------------- */

// Pre-reverse hook: sync the intel results into the native arrays with
// prereverse = 1. Both flags are unused.
void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
{
  _sync_main_arrays(1);
}

/* ---------------------------------------------------------------------- */

// Post-force hook. In offload builds with _sync_mode == 2, the coprocessor
// results are also collected here. The guard is _LMP_INTEL_OFFLOAD, the same
// macro that guards the assignment of _sync_mode in init() and the body of
// sync_coprocessor(). vflag is unused.
void FixIntel::post_force(int /*vflag*/)
{
  // Redundant call to sync Intel data structs with native for methods that
  // call force compute but do not call prereverse
  _sync_main_arrays(1);

  #ifdef _LMP_INTEL_OFFLOAD
  if (_sync_mode == 2) sync_coprocessor();
  #endif
}

/* ---------------------------------------------------------------------- */

// Sum the per-thread copies of the force accumulator into the first copy.
// f_scalar points to the first copy; each further copy starts f_stride4
// scalars after the previous one. o_range scalars (4 per atom) are reduced,
// covering local plus ghost atoms with newton_pair on and local atoms
// otherwise.
template <class acc_t>
void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
{
  int o_range, f_stride;
  if (force->newton_pair)
    o_range = atom->nlocal + atom->nghost;
  else
    o_range = atom->nlocal;
  IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), _torque_flag);

  // switch from atom counts to scalar counts (4 scalars per entry)
  o_range *= 4;
  const int f_stride4 = f_stride * 4;

  if (_nthreads <= INTEL_HTHREADS) {
    // small thread counts: one serial, vectorized pass with unrolled sums
    acc_t *f_scalar2 = f_scalar + f_stride4;
    if (_nthreads == 4) {
      acc_t *f_scalar3 = f_scalar2 + f_stride4;
      acc_t *f_scalar4 = f_scalar3 + f_stride4;
      #if defined(USE_OMP_SIMD)
      #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64)
      #elif defined(LMP_SIMD_COMPILER)
      #pragma vector aligned
      #pragma simd
      #endif
      for (int n = 0; n < o_range; n++)
        f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
    } else if (_nthreads == 2) {
      #if defined(USE_OMP_SIMD)
      #pragma omp simd aligned(f_scalar,f_scalar2:64)
      #elif defined(LMP_SIMD_COMPILER)
      #pragma vector aligned
      #pragma simd
      #endif
      for (int n = 0; n < o_range; n++)
        f_scalar[n] += f_scalar2[n];
    } else {
      // NOTE(review): this branch sums three copies, so it presumably
      // expects _nthreads == 3 -- confirm callers never reduce with 1 thread
      acc_t *f_scalar3 = f_scalar2 + f_stride4;
      #if defined(USE_OMP_SIMD)
      #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64)
      #elif defined(LMP_SIMD_COMPILER)
      #pragma vector aligned
      #pragma simd
      #endif
      for (int n = 0; n < o_range; n++)
        f_scalar[n] += f_scalar2[n] + f_scalar3[n];
    }
  } else {
    // many threads: each thread reduces its own [iifrom,iito) slice of the
    // output range over all the other copies
    #if defined(_OPENMP)
    #pragma omp parallel
    #endif
    {
      int iifrom, iito, tid;
      IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
                                sizeof(acc_t));

      acc_t *f_scalar2 = f_scalar + f_stride4;
      for (int t = 1; t < _nthreads; t++) {
        #if defined(USE_OMP_SIMD)
        #pragma omp simd aligned(f_scalar,f_scalar2:64)
        #elif defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma simd
        #endif
        for (int n = iifrom; n < iito; n++)
          f_scalar[n] += f_scalar2[n];
        f_scalar2 += f_stride4;
      }
    }
  }
}

/* ---------------------------------------------------------------------- */

// Offload builds only: while offloading, hand the first pending offload
// force array (mixed, then double, then single) to add_off_results() and
// clear its pointer. Does nothing in non-offload builds.
void FixIntel::sync_coprocessor()
{
  #ifdef _LMP_INTEL_OFFLOAD
  if (_offload_balance == 0.0) return;

  if (_off_force_array_m != 0) {
    add_off_results(_off_force_array_m, _off_ev_array_d);
    _off_force_array_m = 0;
    return;
  }
  if (_off_force_array_d != 0) {
    add_off_results(_off_force_array_d, _off_ev_array_d);
    _off_force_array_d = 0;
    return;
  }
  if (_off_force_array_s != 0) {
    add_off_results(_off_force_array_s, _off_ev_array_s);
    _off_force_array_s = 0;
  }
  #endif
}

/* ---------------------------------------------------------------------- */

// Work out which range of atoms f_in covers and pass it to add_oresults().
// Non-offload builds add local (plus ghost with newton_pair) atoms starting
// at offset 0. Offload builds pick the range from the offload flag, the
// newton setting and whether separate host/offload buffers are in use.
// The whole call is timed under TIME_PACK.
template <class ft, class acc_t>
void FixIntel::add_results(const ft * _noalias const f_in,
                           const acc_t * _noalias const ev_global,
                           const int eatom, const int vatom,
                           const int offload) {
  start_watch(TIME_PACK);
  int f_length;
  #ifdef _LMP_INTEL_OFFLOAD
  if (_separate_buffers) {
    // separate buffers: local and ghost sections are added in two calls;
    // the second call passes a null ev pointer so that global energy and
    // virial are accumulated only once
    if (offload) {
      if (force->newton_pair) {
        add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
        const acc_t * _noalias const enull = 0;
        int offset = _offload_nlocal;
        // with torque, each atom occupies two entries of f_in
        if (_torque_flag) offset *= 2;
        add_oresults(f_in + offset, enull, eatom, vatom,
                     _offload_min_ghost, _offload_nghost);
      } else
        add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
    } else {
      if (force->newton_pair) {
        add_oresults(f_in, ev_global, eatom, vatom,
                     _host_min_local, _host_used_local);
        const acc_t * _noalias const enull = 0;
        int offset = _host_used_local;
        // with torque, each atom occupies two entries of f_in
        if (_torque_flag) offset *= 2;
        add_oresults(f_in + offset, enull, eatom,
                     vatom, _host_min_ghost, _host_used_ghost);
      } else {
        int start = host_start_pair();
        add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
      }
    }
    stop_watch(TIME_PACK);
    return;
  }
  // shared buffer: a single contiguous range [start, start + f_length)
  int start;
  if (offload) {
    start = 0;
    if (force->newton_pair) {
      if (_offload_noghost == 0)
        f_length = atom->nlocal + atom->nghost;
      else
        f_length = atom->nlocal;
    } else
      f_length = offload_end_pair();
  } else {
    if (force->newton_pair) {
      start = 0;
      f_length = atom->nlocal + atom->nghost;
    } else {
      start = host_start_pair();
      f_length = atom->nlocal - start;
    }
  }
  add_oresults(f_in, ev_global, eatom, vatom, start, f_length);
  #else
  if (force->newton_pair)
    f_length = atom->nlocal + atom->nghost;
  else
    f_length = atom->nlocal;
  add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
  #endif
  stop_watch(TIME_PACK);
}

/* ---------------------------------------------------------------------- */

// Add nall entries of f_in to the native force array, starting at atom
// index out_offset. With _torque_flag set, f_in holds two entries per atom
// (force, then torque) and the torque array is updated too. With eatom set,
// the w component of each force entry is added to the pair style's per-atom
// energy. If ev_global is non-null, its eight values are added to eng_vdwl,
// eng_coul and the six virial components. vatom is unused.
template <class ft, class acc_t>
void FixIntel::add_oresults(const ft * _noalias const f_in,
                            const acc_t * _noalias const ev_global,
                            const int eatom, const int /*vatom*/,
                            const int out_offset, const int nall) {
  lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
  if (_torque_flag) {
    // f_in[1].w is checked as an error code: 1 and any other non-zero value
    // produce different messages
    if (f_in[1].w)
    {
      if (f_in[1].w == 1)
        error->all(FLERR,"Bad matrix inversion in mldivide3");
      else
        error->all(FLERR,"Sphere particles not yet supported for gayberne/intel");
    }
  }

  // only thread the copy when more than INTEL_HTHREADS threads are in use
  int packthreads;
  if (_nthreads > INTEL_HTHREADS) packthreads = _nthreads;
  else packthreads = 1;
  #if defined(_OPENMP)
  #pragma omp parallel if (packthreads > 1)
  #endif
  {
    #if defined(_OPENMP)
    const int tid = omp_get_thread_num();
    #else
    const int tid = 0;
    #endif
    // [ifrom, ito) is this thread's share of the nall entries
    int ifrom, ito;
    IP_PRE_omp_range_align(ifrom, ito, tid, nall, packthreads, sizeof(acc_t));
    if (_torque_flag) {
      // ii indexes f_in: entry ii is the force and ii+1 the torque of atom i
      int ii = ifrom * 2;
      lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
        out_offset;
      if (eatom) {
        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
        #if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[ii].x;
          f[i].y += f_in[ii].y;
          f[i].z += f_in[ii].z;
          lmp_eatom[i] += f_in[ii].w;
          tor[i].x += f_in[ii+1].x;
          tor[i].y += f_in[ii+1].y;
          tor[i].z += f_in[ii+1].z;
          ii += 2;
        }
      } else {
        #if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[ii].x;
          f[i].y += f_in[ii].y;
          f[i].z += f_in[ii].z;
          tor[i].x += f_in[ii+1].x;
          tor[i].y += f_in[ii+1].y;
          tor[i].z += f_in[ii+1].z;
          ii += 2;
        }
      }
    } else {
      if (eatom) {
        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
        #if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[i].x;
          f[i].y += f_in[i].y;
          f[i].z += f_in[i].z;
          lmp_eatom[i] += f_in[i].w;
        }
      } else {
        #if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[i].x;
          f[i].y += f_in[i].y;
          f[i].z += f_in[i].z;
        }
      }
    }
  }

  // global accumulators: [0] vdwl energy, [1] coulomb energy, [2..7] virial
  if (ev_global != nullptr) {
    force->pair->eng_vdwl += ev_global[0];
    force->pair->eng_coul += ev_global[1];
    force->pair->virial[0] += ev_global[2];
    force->pair->virial[1] += ev_global[3];
    force->pair->virial[2] += ev_global[4];
    force->pair->virial[3] += ev_global[5];
    force->pair->virial[4] += ev_global[6];
    force->pair->virial[5] += ev_global[7];
  }
}

/* ---------------------------------------------------------------------- */

// Return the memory usage reported by the buffer object of the active
// precision mode, queried for _nthreads threads.
double FixIntel::memory_usage()
{
  if (_precision_mode == PREC_MODE_SINGLE) {
    const double bytes = _single_buffers->memory_usage(_nthreads);
    return bytes;
  }
  if (_precision_mode == PREC_MODE_MIXED) {
    const double bytes = _mixed_buffers->memory_usage(_nthreads);
    return bytes;
  }
  const double bytes = _double_buffers->memory_usage(_nthreads);
  return bytes;
}

/* ---------------------------------------------------------------------- */

#ifdef _LMP_INTEL_OFFLOAD

/* ---------------------------------------------------------------------- */

// Wait for the coprocessor to finish writing f_in, add those results to the
// native arrays via add_results(), and update the load-balance estimate when
// automatic balancing is active (_offload_balance < 0). On reneighboring
// steps (neighbor->ago == 0) the offloaded atom ranges are refreshed from
// the values in _off_overflow_flag.
template <class ft, class acc_t>
void FixIntel::add_off_results(const ft * _noalias const f_in,
                               const acc_t * _noalias const ev_global) {
  if (_offload_balance < 0.0)
    _balance_other_time = platform::walltime() - _balance_other_time;

  // on reneighboring steps also wait on atom->tag
  start_watch(TIME_OFFLOAD_WAIT);
  if (neighbor->ago == 0) {
    #pragma offload_wait target(mic:_cop) wait(atom->tag,f_in)
  } else {
    #pragma offload_wait target(mic:_cop) wait(f_in)
  }
  double wait_time = stop_watch(TIME_OFFLOAD_WAIT);

  if (neighbor->ago == 0) {
    if (_off_overflow_flag[LMP_OVERFLOW])
      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
    _offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1;
    _offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN];
    _offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 -
      _offload_min_ghost;
    if (_offload_nghost < 0) _offload_nghost = 0;
    _offload_nall = _offload_nlocal + _offload_nghost;
  }

  if (_torque_flag)
    if (f_in[1].w < 0.0)
      error->all(FLERR,"Bad matrix inversion in mldivide3");
  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);

  // Load balance?
  if (_offload_balance < 0.0) {
    if (neighbor->ago == 0)
      _balance_pair = _balance_neighbor;
    double mic_time;
    mic_time = *_stopwatch_offload_pair;
    // when host pair + other time is below the coprocessor time, fold the
    // difference into a running mean weighted by INTEL_LB_MEAN_WEIGHT
    if (_balance_pair_time + _balance_other_time < mic_time) {
      double ft = _balance_pair_time + _balance_other_time + wait_time -
          mic_time;
      _balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed +
          INTEL_LB_MEAN_WEIGHT * ft;
    }

    // scale both times by their share of the current split (ctps host,
    // otps coprocessor) to get the new balance
    double ctps = _balance_pair_time / (1.0-_balance_pair);
    double otps = mic_time / _balance_pair;
    double new_balance = (ctps + _balance_other_time - _balance_fixed) /
        (otps + ctps);
    _balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor +
        INTEL_LB_MEAN_WEIGHT * new_balance;
  }

  #ifdef TIME_BALANCE
  start_watch(TIME_IMBALANCE);
  MPI_Barrier(_real_space_comm);
  stop_watch(TIME_IMBALANCE);
  #endif
  acc_timers();
}

/* ---------------------------------------------------------------------- */

// Average the timers over the ranks of _real_space_comm and, if any time was
// recorded, print a timing table on rank 0 to _tscreen. The timers are
// zeroed afterwards. Returns immediately on ranks that are not real-space
// tasks or before offload affinity has been set.
void FixIntel::output_timing_data() {
  if (_im_real_space_task == 0 || _offload_affinity_set == 0) return;

  double timer_total = 0.0;
  int size, rank;
  double timers[NUM_ITIMERS];
  MPI_Comm_size(_real_space_comm, &size);
  MPI_Comm_rank(_real_space_comm, &rank);
  // sum over ranks, then divide by size to get per-timer means
  MPI_Allreduce(&_timers, &timers, NUM_ITIMERS, MPI_DOUBLE, MPI_SUM,
                _real_space_comm);
  for (int i=0; i < NUM_ITIMERS; i++) {
    timers[i] /= size;
    timer_total += timers[i];
  }
  #ifdef TIME_BALANCE
  double timers_min[NUM_ITIMERS], timers_max[NUM_ITIMERS];
  MPI_Allreduce(&_timers, &timers_max, NUM_ITIMERS, MPI_DOUBLE, MPI_MAX,
                _real_space_comm);
  MPI_Allreduce(&_timers, &timers_min, NUM_ITIMERS, MPI_DOUBLE, MPI_MIN,
                _real_space_comm);
  #endif

  // timer_total comes from an Allreduce, so every rank takes the same branch
  // and the MPI_Reduce below is entered by all of them
  if (timer_total > 0.0) {
    double balance_out[2], balance_in[2];
    balance_out[0] = _balance_pair;
    balance_out[1] = _balance_neighbor;
    MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM,
               0, _real_space_comm);
    // balance_in is only used on rank 0, the root of the reduction
    balance_in[0] /= size;
    balance_in[1] /= size;

    if (rank == 0 && _tscreen) {
      fprintf(_tscreen, "\n------------------------------------------------\n");
      fprintf(_tscreen, "               Offload Timing Data\n");
      fprintf(_tscreen, "------------------------------------------------\n");
      fprintf(_tscreen, "  Data Pack/Cast Seconds    %f\n",
              timers[TIME_PACK]);
      if (_offload_balance != 0.0) {
        fprintf(_tscreen, "  Host Neighbor Seconds     %f\n",
                timers[TIME_HOST_NEIGHBOR]);
        fprintf(_tscreen, "  Host Pair Seconds         %f\n",
                timers[TIME_HOST_PAIR]);
        fprintf(_tscreen, "  Offload Neighbor Seconds  %f\n",
                timers[TIME_OFFLOAD_NEIGHBOR]);
        fprintf(_tscreen, "  Offload Pair Seconds      %f\n",
                timers[TIME_OFFLOAD_PAIR]);
        fprintf(_tscreen, "  Offload Wait Seconds      %f\n",
                timers[TIME_OFFLOAD_WAIT]);
        fprintf(_tscreen, "  Offload Latency Seconds   %f\n",
                timers[TIME_OFFLOAD_LATENCY]);
        fprintf(_tscreen, "  Offload Neighbor Balance  %f\n",
                balance_in[1]);
        fprintf(_tscreen, "  Offload Pair Balance      %f\n",
                balance_in[0]);
        fprintf(_tscreen, "  Offload Ghost Atoms       ");
        if (_offload_noghost) fprintf(_tscreen,"No\n");
        else fprintf(_tscreen,"Yes\n");
        #ifdef TIME_BALANCE
        fprintf(_tscreen, "  Offload Imbalance Seconds %f\n",
                timers[TIME_IMBALANCE]);
        fprintf(_tscreen, "  Offload Min/Max Seconds   ");
        for (int i = 0; i < NUM_ITIMERS; i++)
          fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
        fprintf(_tscreen, "\n");
        #endif
        // ht: host-side time, ct: coprocessor-side time; warn when offload
        // latency exceeds 7% of the larger of the two
        // NOTE(review): tt is not checked for zero before the division
        double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
          timers[TIME_OFFLOAD_WAIT];
        double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
          timers[TIME_OFFLOAD_PAIR];
        double tt = MAX(ht,ct);
        if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
          error->warning(FLERR,"Leaving a core free can improve performance for offload");
      }
      fprintf(_tscreen, "------------------------------------------------\n");
    }
    zero_timers();
    _setup_time_cleared = false;
  }
}

/* ---------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Count the MPI tasks in _real_space_comm that report the same processor
   name as the calling task.

   node_rank (output): number of tasks with a lower rank in
                       _real_space_comm that share this task's processor
                       name, i.e. a 0-based index of this task among them.
   return:             number of tasks (including this one) that share
                       this task's processor name.

   Collective over _real_space_comm (uses MPI_Allgather).
------------------------------------------------------------------------- */
int FixIntel::get_ppn(int &node_rank) {
  int nprocs;
  int rank;
  MPI_Comm_size(_real_space_comm, &nprocs);
  MPI_Comm_rank(_real_space_comm, &rank);

  int name_length;
  char node_name[MPI_MAX_PROCESSOR_NAME];
  MPI_Get_processor_name(node_name,&name_length);
  node_name[name_length] = '\0';

  // Every task contributes a fixed-width slot so that slot i belongs to
  // rank i and can be compared with strcmp().
  char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs];
  MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
                MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
  int ppn = 0;
  node_rank = 0;
  for (int i = 0; i < nprocs; i++) {
    if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
      ppn++;
      if (i < rank)
        node_rank++;
    }
  }

  // Release the gather buffer; it was previously leaked on every call.
  delete[] node_names;

  return ppn;
}

/* ---------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Select the communicator used for real-space work, decide whether
   separate host/offload buffers are used, assign this task to a
   coprocessor (_cop), clamp the number of offload threads, and bind the
   offload OpenMP threads to processors on the coprocessor.

   The thread binding is performed at most once (_offload_affinity_set)
   and is skipped entirely when _offload_balance is 0.0.
------------------------------------------------------------------------- */
void FixIntel::set_offload_affinity()
{
  // Separate buffers are only enabled when they are allowed and the offload
  // balance is neither exactly 0.0 nor >= 1.0.
  _separate_buffers = 0;
  if (_allow_separate_buffers)
    if (_offload_balance != 0.0 && _offload_balance < 1.0)
      _separate_buffers = 1;

  // With a "verlet/split" integrator only tasks with universe->iworld == 0
  // are flagged as real-space tasks; all other tasks return without doing
  // any offload setup.
  _im_real_space_task = 1;
  if (strncmp(update->integrate_style,"verlet/split",12) == 0) {
    _real_space_comm = world;
    if (universe->iworld != 0) {
      _im_real_space_task = 0;
      return;
    }
  } else
    _real_space_comm = universe->uworld;

  // A balance of 0.0 marks "no coprocessor" with _cop = -1. Nothing more to
  // do in that case, or when the affinity has already been set by an
  // earlier call.
  if (_offload_balance == 0.0) _cop = -1;
  if (_offload_balance == 0.0 || _offload_affinity_set == 1)
    return;

  _offload_affinity_set = 1;
  int node_rank;
  int ppn = get_ppn(node_rank);

  // Split the tasks on this node evenly over the _ncops coprocessors:
  // afterwards ppn is the number of tasks per coprocessor, _cop is the
  // coprocessor index for this task, and node_rank is this task's index
  // among the tasks sharing that coprocessor.
  if (ppn % _ncops != 0)
    error->all(FLERR,"MPI tasks per node must be multiple of offload_cards");
  ppn = ppn / _ncops;
  _cop = node_rank / ppn;
  node_rank = node_rank % ppn;

  // Clamp the requested offload thread count, first by a per-task share
  // derived from _offload_cores and _offload_tpc, then by
  // _max_offload_threads.
  // NOTE(review): the "/ 4" presumably converts a hardware-thread count into
  // a core count (4 threads per core) - confirm against where _offload_cores
  // is set.
  int max_threads_per_task = _offload_cores / 4 * _offload_tpc / ppn;
  if (_offload_threads > max_threads_per_task)
    _offload_threads = max_threads_per_task;
  if (_offload_threads > _max_offload_threads)
    _offload_threads = _max_offload_threads;

  // Local copies of the members that are named in the in() clause of the
  // offload pragma below.
  int offload_threads = _offload_threads;
  int offload_tpc = _offload_tpc;
  int offload_affinity_balanced = _offload_affinity_balanced;
  int offload_cores = _offload_cores;
  #pragma offload target(mic:_cop) mandatory \
    in(node_rank,offload_threads,offload_tpc,offload_affinity_balanced, \
       offload_cores)
  {
    omp_set_num_threads(offload_threads);
    #pragma omp parallel
    {
      // Each OpenMP thread computes its own processor id and binds itself.
      int tnum = omp_get_thread_num();
      kmp_affinity_mask_t mask;
      kmp_create_affinity_mask(&mask);
      // Index of this thread among all offload threads of the tasks that
      // share this coprocessor.
      int proc = offload_threads * node_rank + tnum;
      #ifdef __AVX512F__
      // NOTE(review): the fixed offset of 68 is hardware specific; its
      // meaning is not visible in this function - confirm.
      proc = (proc / offload_tpc) + (proc % offload_tpc) *
             ((offload_cores) / 4);
      proc += 68;
      #else
      // Balanced: consecutive indices are placed 4 apart starting at 1 and
      // wrap to the next offset after every 60 indices (0->1, 1->5, ...,
      // 59->237, 60->2). Otherwise the index is shifted by an amount that
      // depends on offload_tpc.
      // NOTE(review): the constants 4/60/240 presumably describe a 60-core,
      // 4-thread-per-core coprocessor - confirm.
      if (offload_affinity_balanced)
        proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
      else
        proc += (proc / 4) * (4 - offload_tpc) + 1;
      #endif
      kmp_set_affinity_mask_proc(proc, &mask);
      // Failure to bind is reported but does not abort the run.
      if (kmp_set_affinity(&mask) != 0)
        printf("Could not set affinity on rank %d thread %d to %d\n",
               node_rank, tnum, proc);
    }
  }

  // Pass the final thread count, coprocessor index and buffer mode to the
  // buffer object that matches the active precision mode.
  if (_precision_mode == PREC_MODE_SINGLE)
    _single_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
  else if (_precision_mode == PREC_MODE_MIXED)
    _mixed_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
  else
    _double_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
}

/* ---------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Bind this MPI task, its existing threads, and the threads created by
   the first offload to disjoint sets of host logical cores.

   nomp:   number of OpenMP threads per task; when 0 the value is taken
           from the OMP_NUM_THREADS environment variable (default 1).
   return: 0 on success, when tasks * threads exceeds the number of
           logical cores (a warning is printed and nothing is bound), or
           when compiled with INTEL_OFFLOAD_NOAFFINITY;
           -1 when the core list cannot be obtained or fails the sanity
           checks, when the host has more logical cores than
           INTEL_MAX_HOST_CORE_COUNT, or when setting an affinity fails.

   Side effect: _separate_coi is set to 1, or to 0 when tasks * threads
   equals the number of logical cores.
------------------------------------------------------------------------- */
int FixIntel::set_host_affinity(const int nomp)
{
  #ifndef INTEL_OFFLOAD_NOAFFINITY
  _separate_coi = 1;
  int rank = comm->me;
  int node_rank;
  int ppn = get_ppn(node_rank);

  // Get a sorted list of logical cores: first column of "lscpu -p",
  // ordered numerically by its 3rd and then its 2nd column
  // (presumably socket, then core - lscpu's default column order)
  int proc_list[INTEL_MAX_HOST_CORE_COUNT];
  int ncores;
  FILE *p;
  char cmd[512];
  char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
  snprintf(cmd, sizeof(cmd), "lscpu -p | grep -v '#' |"
           "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
  p = popen(cmd, "r");
  if (p == nullptr) return -1;
  ncores = 0;
  int too_many_cores = 0;
  while (fgets(readbuf, sizeof(readbuf), p)) {
    // Never write past the end of proc_list on hosts with more logical
    // cores than the compile-time limit
    if (ncores >= INTEL_MAX_HOST_CORE_COUNT) {
      too_many_cores = 1;
      break;
    }
    proc_list[ncores] = atoi(readbuf);
    ncores++;
  }
  pclose(p);
  if (too_many_cores) return -1;

  // Sanity checks for core list: at least two cores, every id within
  // [0, ncores), and id 0 listed at most once (atoi() yields 0 for a
  // line it cannot parse)
  if (ncores < 2) return -1;
  int nzero = 0;
  for (int i = 0; i < ncores; i++) {
    if (proc_list[i] == 0) nzero++;
    if (proc_list[i] < 0 || proc_list[i] >= ncores) return -1;
  }
  if (nzero > 1) return -1;

  // Determine the OpenMP/MPI configuration
  char *estring;
  int nthreads = nomp;
  if (nthreads == 0) {
    estring = getenv("OMP_NUM_THREADS");
    if (estring != nullptr) {
      nthreads = atoi(estring);
      if (nthreads < 2) nthreads = 1;
    } else
      nthreads = 1;
  }

  // Determine how many logical cores for COI and MPI tasks
  int coi_cores = 0, mpi_cores;
  int subscription = nthreads * ppn;
  if (subscription > ncores) {
    if (rank == 0)
      error->warning(FLERR,"More MPI tasks/OpenMP threads than available cores");
    return 0;
  }
  if (subscription == ncores)
    _separate_coi = 0;

  // Cores are only set aside for COI when more than half of the logical
  // cores are used by tasks/threads; the leftover is capped at
  // INTEL_MAX_COI_CORES
  if (subscription > ncores / 2) {
    coi_cores = ncores - subscription;
    if (coi_cores > INTEL_MAX_COI_CORES) coi_cores = INTEL_MAX_COI_CORES;
  }
  mpi_cores = (ncores - coi_cores) / ppn;

  // Get ids of all LWPs that COI spawned and affinitize
  int lwp = 0, plwp = 0, nlwp = 0, mlwp = 0, fail = 0;
  cpu_set_t cpuset;
  pid_t pid = getpid();
  if (coi_cores) {
    // List the LWP ids of this process, skipping the first two lines of
    // the ps output
    snprintf(cmd, sizeof(cmd),
             "ps -Lp %d -o lwp | awk ' (NR > 2) {print}'", pid);
    p = popen(cmd, "r");
    if (p == nullptr) return -1;

    // First pass: every LWP that exists before the offload is bound to
    // this task's own range of cores; plwp counts them
    while (fgets(readbuf, sizeof(readbuf), p)) {
      lwp = atoi(readbuf);
      int first = coi_cores + node_rank * mpi_cores;
      CPU_ZERO(&cpuset);
      for (int i = first; i < first + mpi_cores; i++)
        CPU_SET(proc_list[i], &cpuset);
      if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
        fail = 1;
        break;
      }
      plwp++;
    }
    pclose(p);

    // Do async offload to create COI threads
    int sig1, sig2;
    float *buf1;
    int pragma_size = 1024;
    buf1 = (float*) malloc(sizeof(float)*pragma_size);

    #pragma offload target (mic:0) mandatory \
      in(buf1:length(pragma_size) alloc_if(1) free_if(0))       \
      signal(&sig1)
    { buf1[0] = 0.0; }
    #pragma offload_wait target(mic:0) wait(&sig1)

    #pragma offload target (mic:0) mandatory \
      out(buf1:length(pragma_size) alloc_if(0) free_if(1))      \
      signal(&sig2)
    { buf1[0] = 1.0; }
    #pragma offload_wait target(mic:0) wait(&sig2)
    free(buf1);

    p = popen(cmd, "r");
    if (p == nullptr) return -1;

    // Second pass: the first plwp entries were handled above; any further
    // LWPs are treated as newly created by the offload and are bound to
    // the first coi_cores cores of the sorted list
    while (fgets(readbuf, sizeof(readbuf), p)) {
      lwp = atoi(readbuf);
      nlwp++;
      if (nlwp <= plwp) continue;

      CPU_ZERO(&cpuset);
      for (int i=0; i<coi_cores; i++)
        CPU_SET(proc_list[i], &cpuset);

      if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
        fail = 1;
        break;
      }
    }
    pclose(p);
    nlwp -= plwp;

    // Get stats on the number of LWPs per process
    // NOTE(review): this collective is only reached by tasks with
    // coi_cores != 0 that did not return early above - confirm that all
    // tasks in MPI_COMM_WORLD take the same path.
    MPI_Reduce(&nlwp, &mlwp, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
  }

  if (screen && rank == 0) {
    if (coi_cores)
      fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n",
              mlwp, coi_cores);
    fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores);
  }
  if (fail) return -1;

  // Affinitize MPI Ranks: each task gets mpi_cores consecutive entries of
  // the sorted list, located after the cores reserved for COI
  CPU_ZERO(&cpuset);
  int first = coi_cores + node_rank * mpi_cores;
  for (int i = first; i < first+mpi_cores; i++)
    CPU_SET(proc_list[i], &cpuset);
  if (sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset))
    return -1;

  #endif
  return 0;
}

#endif