lammps/src/INTEL/fix_intel.h

// clang-format off
/* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */

#ifdef FIX_CLASS
// clang-format off
FixStyle(INTEL,FixIntel);
// clang-format on
#else

#ifndef LMP_FIX_INTEL_H
#define LMP_FIX_INTEL_H

#include "error.h"
#include "fix.h"
#include "force.h"
#include "intel_buffers.h"
#include "pair.h"
#include "update.h"

namespace LAMMPS_NS {

class IntelData;
template <class flt_t, class acc_t> class IntelBuffers;

class FixIntel : public Fix {
 public:
  FixIntel(class LAMMPS *, int, char **);
  ~FixIntel() override;
  int setmask() override;
  void init() override;
  void setup(int) override;
  inline void min_setup(int in) override { setup(in); }
  void setup_pre_reverse(int eflag = 0, int vflag = 0) override;

  bool pair_hybrid_check();
  void pair_init_check(const bool cdmessage = false);
  void bond_init_check();
  void kspace_init_check();

  void pre_reverse(int eflag = 0, int vflag = 0) override;
  inline void min_pre_reverse(int eflag = 0, int vflag = 0) override { pre_reverse(eflag, vflag); }

  void post_run() override { _print_pkg_info = 1; }

  // Get all forces, calculation results from coprocesser
  void sync_coprocessor();

  double memory_usage() override;

  typedef struct {
    double x, y, z;
  } lmp_ft;

  enum { PREC_MODE_SINGLE, PREC_MODE_MIXED, PREC_MODE_DOUBLE };

  inline int precision() { return _precision_mode; }
  inline IntelBuffers<float, float> *get_single_buffers() { return _single_buffers; }
  inline IntelBuffers<float, double> *get_mixed_buffers() { return _mixed_buffers; }
  inline IntelBuffers<double, double> *get_double_buffers() { return _double_buffers; }

  inline int nbor_pack_width() const { return _nbor_pack_width; }
  inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
  inline int three_body_neighbor() { return _three_body_neighbor; }
  inline void three_body_neighbor(const int i) { _three_body_neighbor = i; }

  inline int need_zero(const int tid)
  {
    if (_need_reduce == 0 && tid > 0)
      return 1;
    else if (_zero_master && tid == 0) {
      _zero_master = 0;
      return 1;
    } else
      return 0;
  }
  inline void set_reduce_flag()
  {
    if (_nthreads > 1) _need_reduce = 1;
  }
  inline int lrt()
  {
    if (force->kspace_match("^pppm/.*intel$", 0) && update->whichflag == 1)
      return _lrt;
    else
      return 0;
  }
  inline int pppm_table()
  {
    if (force->kspace_match("^pppm/.*intel$", 0))
      return INTEL_P3M_TABLE;
    else
      return 0;
  }

 protected:
  IntelBuffers<float, float> *_single_buffers;
  IntelBuffers<float, double> *_mixed_buffers;
  IntelBuffers<double, double> *_double_buffers;

  int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor;
  int _pair_intel_count, _pair_hybrid_flag, _print_pkg_info;
  // These should be removed in subsequent update w/ simpler hybrid arch
  int _pair_hybrid_zero, _hybrid_nonpair, _zero_master;

 public:
  inline int *get_overflow_flag() { return _overflow_flag; }
  inline int *get_off_overflow_flag() { return _off_overflow_flag; }
  inline void add_result_array(IntelBuffers<double, double>::vec3_acc_t *f_in, double *ev_in,
                               const int offload, const int eatom = 0, const int vatom = 0,
                               const int rflag = 0);
  inline void add_result_array(IntelBuffers<float, double>::vec3_acc_t *f_in, double *ev_in,
                               const int offload, const int eatom = 0, const int vatom = 0,
                               const int rflag = 0);
  inline void add_result_array(IntelBuffers<float, float>::vec3_acc_t *f_in, float *ev_in,
                               const int offload, const int eatom = 0, const int vatom = 0,
                               const int rflag = 0);
  inline void get_buffern(const int offload, int &nlocal, int &nall, int &minlocal);

#ifdef _LMP_INTEL_OFFLOAD
  void post_force(int vflag);
  inline int coprocessor_number() { return _cop; }
  inline int full_host_list() { return _full_host_list; }
  void set_offload_affinity();
  inline double offload_balance() { return _offload_balance; }
  inline int offload_end_neighbor();
  inline int offload_end_pair();
  inline int host_start_neighbor()
  {
    if (_offload_noghost)
      return 0;
    else
      return offload_end_neighbor();
  }
  inline int host_start_pair()
  {
    if (_offload_noghost)
      return 0;
    else
      return offload_end_pair();
  }
  inline int offload_nlocal() { return _offload_nlocal; }
  inline int offload_nall() { return _offload_nall; }
  inline int offload_min_ghost() { return _offload_min_ghost; }
  inline int host_min_local() { return _host_min_local; }
  inline int host_min_ghost() { return _host_min_ghost; }
  inline int host_used_local() { return _host_used_local; }
  inline int host_used_ghost() { return _host_used_ghost; }
  inline int host_nall() { return _host_nall; }
  inline int separate_buffers() { return _separate_buffers; }
  inline int offload_noghost() { return _offload_noghost; }
  inline void set_offload_noghost(const int v)
  {
    if (_offload_ghost < 0) _offload_noghost = v;
  }
  inline void set_neighbor_host_sizes();

  inline void zero_timers() { memset(_timers, 0, sizeof(double) * NUM_ITIMERS); }
  inline void start_watch(const int which) { _stopwatch[which] = MPI_Wtime(); }
  inline double stop_watch(const int which);
  inline double *off_watch_pair() { return _stopwatch_offload_pair; }
  inline double *off_watch_neighbor() { return _stopwatch_offload_neighbor; }
  inline void balance_stamp();
  inline void acc_timers();
#else
  inline int offload_end_neighbor() { return 0; }
  inline int offload_end_pair() { return 0; }
  inline int host_start_neighbor() { return 0; }
  inline int host_start_pair() { return 0; }
  inline void zero_timers() {}
  inline void start_watch(const int /*which*/) {}
  inline double stop_watch(const int /*which*/) { return 0.0; }
  double *off_watch_pair() { return nullptr; }
  double *off_watch_neighbor() { return nullptr; }
  inline void balance_stamp() {}
  inline void acc_timers() {}
  inline int separate_buffers() { return 0; }
#endif

 protected:
  int _overflow_flag[5];
  _alignvar(int _off_overflow_flag[5], 64);
  int _allow_separate_buffers, _offload_ghost, _lrt;

  IntelBuffers<float, float>::vec3_acc_t *_force_array_s;
  IntelBuffers<float, double>::vec3_acc_t *_force_array_m;
  IntelBuffers<double, double>::vec3_acc_t *_force_array_d;
  float *_ev_array_s;
  double *_ev_array_d;
  int _results_eatom, _results_vatom;
  int _need_reduce;

#ifdef _LMP_INTEL_OFFLOAD
  double _balance_pair_time, _balance_other_time;
  int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
  int _host_min_local, _host_min_ghost, _host_nall;
  int _host_used_local, _host_used_ghost, _sync_mode;
  int _separate_buffers, _offload_noghost, _separate_coi;
  bool _setup_time_cleared, _timers_allocated;
  void output_timing_data();
  FILE *_tscreen;

  IntelBuffers<float, float>::vec3_acc_t *_off_force_array_s;
  IntelBuffers<float, double>::vec3_acc_t *_off_force_array_m;
  IntelBuffers<double, double>::vec3_acc_t *_off_force_array_d;
  float *_off_ev_array_s;
  double *_off_ev_array_d;
  int _off_results_eatom, _off_results_vatom;
  int _full_host_list, _cop, _ncops;

  int get_ppn(int &);
  int set_host_affinity(const int);
#endif
  void check_neighbor_intel();

  double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed;
  double _timers[NUM_ITIMERS];
  double _stopwatch[NUM_ITIMERS];
  _alignvar(double _stopwatch_offload_neighbor[1], 64);
  _alignvar(double _stopwatch_offload_pair[1], 64);

  void _sync_main_arrays(const int prereverse);

  template <class ft> void reduce_results(ft *_noalias const f_in);

  template <class ft, class acc_t>
  inline void add_results(const ft *_noalias const f_in, const acc_t *_noalias const ev_global,
                          const int eatom, const int vatom, const int offload);

  template <class ft, class acc_t>
  inline void add_oresults(const ft *_noalias const f_in, const acc_t *_noalias const ev_global,
                           const int eatom, const int vatom, const int out_offset, const int nall);

  int _offload_affinity_balanced, _offload_threads, _offload_tpc;
#ifdef _LMP_INTEL_OFFLOAD
  int _max_offload_threads, _offload_cores, _offload_affinity_set;
  int _im_real_space_task;
  MPI_Comm _real_space_comm;
  template <class ft, class acc_t>
  inline void add_off_results(const ft *_noalias const f_in, const acc_t *_noalias const ev_global);
#endif
};

/* ---------------------------------------------------------------------- */

void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, int &minlocal)
{
#ifdef _LMP_INTEL_OFFLOAD
  if (_separate_buffers) {
    if (offload) {
      if (neighbor->ago != 0) {
        nlocal = _offload_nlocal;
        nall = _offload_nall;
      } else {
        nlocal = atom->nlocal;
        nall = nlocal + atom->nghost;
      }
      minlocal = 0;
    } else {
      nlocal = atom->nlocal;
      nall = _host_nall;
      if (force->newton)
        minlocal = _host_min_local;
      else
        minlocal = host_start_pair();
    }
    return;
  }
  if (_offload_noghost && offload)
    nall = atom->nlocal;
  else
#endif
    nall = atom->nlocal + atom->nghost;
  nlocal = atom->nlocal;
  minlocal = 0;
}

/* ---------------------------------------------------------------------- */

void FixIntel::add_result_array(IntelBuffers<double, double>::vec3_acc_t *f_in, double *ev_in,
                                const int offload, const int eatom, const int vatom,
                                const int rflag)
{
#ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _off_results_eatom = eatom;
    _off_results_vatom = vatom;
    _off_force_array_d = f_in;
    _off_ev_array_d = ev_in;
    if (_pair_hybrid_flag && force->pair->fdotr_is_set()) _sync_main_arrays(1);
    return;
  }
#endif

  _force_array_d = f_in;
  _ev_array_d = ev_in;
  _results_eatom = eatom;
  _results_vatom = vatom;
#ifndef _LMP_INTEL_OFFLOAD
  if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
#endif

  if (_overflow_flag[LMP_OVERFLOW])
    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");

  if (_pair_hybrid_flag > 1 || (_pair_hybrid_flag && force->pair->fdotr_is_set()))
    _sync_main_arrays(0);
}

/* ---------------------------------------------------------------------- */

void FixIntel::add_result_array(IntelBuffers<float, double>::vec3_acc_t *f_in, double *ev_in,
                                const int offload, const int eatom, const int vatom,
                                const int rflag)
{
#ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _off_results_eatom = eatom;
    _off_results_vatom = vatom;
    _off_force_array_m = f_in;
    _off_ev_array_d = ev_in;
    if (_pair_hybrid_flag && force->pair->fdotr_is_set()) _sync_main_arrays(1);
    return;
  }
#endif

  _force_array_m = f_in;
  _ev_array_d = ev_in;
  _results_eatom = eatom;
  _results_vatom = vatom;
#ifndef _LMP_INTEL_OFFLOAD
  if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
#endif

  if (_overflow_flag[LMP_OVERFLOW])
    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");

  if (_pair_hybrid_flag > 1 || (_pair_hybrid_flag && force->pair->fdotr_is_set()))
    _sync_main_arrays(0);
}

/* ---------------------------------------------------------------------- */

void FixIntel::add_result_array(IntelBuffers<float, float>::vec3_acc_t *f_in, float *ev_in,
                                const int offload, const int eatom, const int vatom,
                                const int rflag)
{
#ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _off_results_eatom = eatom;
    _off_results_vatom = vatom;
    _off_force_array_s = f_in;
    _off_ev_array_s = ev_in;
    if (_pair_hybrid_flag && force->pair->fdotr_is_set()) _sync_main_arrays(1);
    return;
  }
#endif

  _force_array_s = f_in;
  _ev_array_s = ev_in;
  _results_eatom = eatom;
  _results_vatom = vatom;
#ifndef _LMP_INTEL_OFFLOAD
  if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
#endif

  if (_overflow_flag[LMP_OVERFLOW])
    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");

  if (_pair_hybrid_flag > 1 || (_pair_hybrid_flag && force->pair->fdotr_is_set()))
    _sync_main_arrays(0);
}

/* ---------------------------------------------------------------------- */

#ifdef _LMP_INTEL_OFFLOAD

/* ---------------------------------------------------------------------- */

int FixIntel::offload_end_neighbor()
{
  if (_offload_balance < 0.0) {
    if (atom->nlocal < 2) error->one(FLERR, "Too few atoms for load balancing offload");
    double granularity = 1.0 / atom->nlocal;
    if (_balance_neighbor < granularity)
      _balance_neighbor = granularity + 1e-10;
    else if (_balance_neighbor > 1.0 - granularity)
      _balance_neighbor = 1.0 - granularity + 1e-10;
  }
  return _balance_neighbor * atom->nlocal;
}

int FixIntel::offload_end_pair()
{
  if (neighbor->ago == 0)
    return _balance_neighbor * atom->nlocal;
  else
    return _balance_pair * atom->nlocal;
}

/* ---------------------------------------------------------------------- */

double FixIntel::stop_watch(const int which)
{
  double elapsed = MPI_Wtime() - _stopwatch[which];
  _timers[which] += elapsed;
  return elapsed;
}

/* ---------------------------------------------------------------------- */

void FixIntel::balance_stamp()
{
  if (_offload_balance < 0.0) {
    double ct = MPI_Wtime();
    _balance_other_time = ct;
    _balance_pair_time = ct - _stopwatch[TIME_HOST_PAIR];
  }
}

/* ---------------------------------------------------------------------- */

void FixIntel::acc_timers()
{
  _timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair;
  if (neighbor->ago == 0) {
    _timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor;
    if (_setup_time_cleared == false) {
      zero_timers();
      _setup_time_cleared = true;
    }
  }
}

/* ---------------------------------------------------------------------- */

void FixIntel::set_neighbor_host_sizes()
{
  _host_min_local = _overflow_flag[LMP_LOCAL_MIN];
  _host_min_ghost = _overflow_flag[LMP_GHOST_MIN];
  _host_used_local = atom->nlocal - _host_min_local;
  _host_used_ghost = _overflow_flag[LMP_GHOST_MAX] + 1 - _host_min_ghost;
  if (_host_used_ghost < 0) _host_used_ghost = 0;
  _host_nall = atom->nlocal + _host_used_ghost;
}

/* ---------------------------------------------------------------------- */

#endif

}    // namespace LAMMPS_NS

#endif
#endif