526 lines
17 KiB
C++
526 lines
17 KiB
C++
/* -*- c++ -*- ----------------------------------------------------------
|
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
|
http://lammps.sandia.gov, Sandia National Laboratories
|
|
Steve Plimpton, sjplimp@sandia.gov
|
|
|
|
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the GNU General Public License.
|
|
|
|
See the README file in the top-level LAMMPS directory.
|
|
------------------------------------------------------------------------- */
|
|
|
|
/* ----------------------------------------------------------------------
|
|
Contributing author: W. Michael Brown (Intel)
|
|
------------------------------------------------------------------------- */
|
|
|
|
#ifdef FIX_CLASS
|
|
|
|
FixStyle(INTEL,FixIntel)
|
|
|
|
#else
|
|
|
|
#ifndef LMP_FIX_INTEL_H
|
|
#define LMP_FIX_INTEL_H
|
|
|
|
#include "fix.h"
|
|
#include "intel_buffers.h"
|
|
#include "force.h"
|
|
#include "pair.h"
|
|
#include "error.h"
|
|
#include "update.h"
|
|
|
|
namespace LAMMPS_NS {
|
|
|
|
class IntelData;
|
|
template <class flt_t, class acc_t> class IntelBuffers;
|
|
|
|
class FixIntel : public Fix {
|
|
public:
|
|
FixIntel(class LAMMPS *, int, char **);
|
|
virtual ~FixIntel();
|
|
virtual int setmask();
|
|
virtual void init();
|
|
virtual void setup(int);
|
|
void setup_pre_reverse(int eflag = 0, int vflag = 0);
|
|
|
|
void pair_init_check(const bool cdmessage=false);
|
|
void bond_init_check();
|
|
void kspace_init_check();
|
|
|
|
void pre_reverse(int eflag = 0, int vflag = 0);
|
|
|
|
// Get all forces, calculation results from coprocesser
|
|
void sync_coprocessor();
|
|
|
|
double memory_usage();
|
|
|
|
typedef struct { double x,y,z; } lmp_ft;
|
|
|
|
enum {PREC_MODE_SINGLE, PREC_MODE_MIXED, PREC_MODE_DOUBLE};
|
|
|
|
inline int precision() { return _precision_mode; }
|
|
inline IntelBuffers<float,float> * get_single_buffers()
|
|
{ return _single_buffers; }
|
|
inline IntelBuffers<float,double> * get_mixed_buffers()
|
|
{ return _mixed_buffers; }
|
|
inline IntelBuffers<double,double> * get_double_buffers()
|
|
{ return _double_buffers; }
|
|
|
|
inline int nbor_pack_width() const { return _nbor_pack_width; }
|
|
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
|
|
|
|
inline int need_zero(const int tid) {
|
|
if (_need_reduce == 0 && tid > 0) return 1;
|
|
return 0;
|
|
}
|
|
inline void set_reduce_flag() { _need_reduce = 1; }
|
|
inline int lrt() {
|
|
if (force->kspace_match("pppm/intel", 0)) return _lrt;
|
|
else return 0;
|
|
}
|
|
|
|
protected:
|
|
IntelBuffers<float,float> *_single_buffers;
|
|
IntelBuffers<float,double> *_mixed_buffers;
|
|
IntelBuffers<double,double> *_double_buffers;
|
|
|
|
int _precision_mode, _nthreads, _nbor_pack_width;
|
|
|
|
public:
|
|
inline int* get_overflow_flag() { return _overflow_flag; }
|
|
inline int* get_off_overflow_flag() { return _off_overflow_flag; }
|
|
inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
|
double *ev_in, const int offload,
|
|
const int eatom = 0, const int vatom = 0,
|
|
const int rflag = 0);
|
|
inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
|
double *ev_in, const int offload,
|
|
const int eatom = 0, const int vatom = 0,
|
|
const int rflag = 0);
|
|
inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
|
|
float *ev_in, const int offload,
|
|
const int eatom = 0, const int vatom = 0,
|
|
const int rflag = 0);
|
|
inline void get_buffern(const int offload, int &nlocal, int &nall,
|
|
int &minlocal);
|
|
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
void post_force(int vflag);
|
|
inline int coprocessor_number() { return _cop; }
|
|
inline int full_host_list() { return _full_host_list; }
|
|
void set_offload_affinity();
|
|
inline double offload_balance() { return _offload_balance; }
|
|
inline int offload_end_neighbor();
|
|
inline int offload_end_pair();
|
|
inline int host_start_neighbor()
|
|
{ if (_offload_noghost) return 0; else return offload_end_neighbor(); }
|
|
inline int host_start_pair()
|
|
{ if (_offload_noghost) return 0; else return offload_end_pair(); }
|
|
inline int offload_nlocal() { return _offload_nlocal; }
|
|
inline int offload_nall() { return _offload_nall; }
|
|
inline int offload_min_ghost() { return _offload_min_ghost; }
|
|
inline int host_min_local() { return _host_min_local; }
|
|
inline int host_min_ghost() { return _host_min_ghost; }
|
|
inline int host_used_local() { return _host_used_local; }
|
|
inline int host_used_ghost() { return _host_used_ghost; }
|
|
inline int host_nall() { return _host_nall; }
|
|
inline int separate_buffers() { return _separate_buffers; }
|
|
inline int offload_noghost() { return _offload_noghost; }
|
|
inline void set_offload_noghost(const int v)
|
|
{ if (_offload_ghost < 0) _offload_noghost = v; }
|
|
inline void set_neighbor_host_sizes();
|
|
|
|
inline void zero_timers()
|
|
{ memset(_timers, 0, sizeof(double) * NUM_ITIMERS); }
|
|
inline void start_watch(const int which) { _stopwatch[which] = MPI_Wtime(); }
|
|
inline double stop_watch(const int which);
|
|
inline double * off_watch_pair() { return _stopwatch_offload_pair; }
|
|
inline double * off_watch_neighbor() { return _stopwatch_offload_neighbor; }
|
|
inline void balance_stamp();
|
|
inline void acc_timers();
|
|
#else
|
|
inline int offload_end_neighbor() { return 0; }
|
|
inline int offload_end_pair() { return 0; }
|
|
inline int host_start_neighbor() { return 0; }
|
|
inline int host_start_pair() { return 0; }
|
|
inline void zero_timers() {}
|
|
inline void start_watch(const int which) {}
|
|
inline double stop_watch(const int which) { return 0.0; }
|
|
double * off_watch_pair() { return NULL; }
|
|
double * off_watch_neighbor() { return NULL; }
|
|
inline void balance_stamp() {}
|
|
inline void acc_timers() {}
|
|
inline int separate_buffers() { return 0; }
|
|
#endif
|
|
|
|
protected:
|
|
int _overflow_flag[5];
|
|
_alignvar(int _off_overflow_flag[5],64);
|
|
int _allow_separate_buffers, _offload_ghost, _lrt;
|
|
|
|
IntelBuffers<float,float>::vec3_acc_t *_force_array_s;
|
|
IntelBuffers<float,double>::vec3_acc_t *_force_array_m;
|
|
IntelBuffers<double,double>::vec3_acc_t *_force_array_d;
|
|
float *_ev_array_s;
|
|
double *_ev_array_d;
|
|
int _results_eatom, _results_vatom;
|
|
int _need_reduce;
|
|
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
double _balance_pair_time, _balance_other_time;
|
|
int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
|
|
int _host_min_local, _host_min_ghost, _host_nall;
|
|
int _host_used_local, _host_used_ghost, _sync_mode;
|
|
int _separate_buffers, _offload_noghost, _separate_coi;
|
|
bool _setup_time_cleared, _timers_allocated;
|
|
void output_timing_data();
|
|
FILE *_tscreen;
|
|
|
|
IntelBuffers<float,float>::vec3_acc_t *_off_force_array_s;
|
|
IntelBuffers<float,double>::vec3_acc_t *_off_force_array_m;
|
|
IntelBuffers<double,double>::vec3_acc_t *_off_force_array_d;
|
|
float *_off_ev_array_s;
|
|
double *_off_ev_array_d;
|
|
int _off_results_eatom, _off_results_vatom;
|
|
int _full_host_list, _cop, _ncops;
|
|
|
|
int get_ppn(int &);
|
|
int set_host_affinity(const int);
|
|
#endif
|
|
void check_neighbor_intel();
|
|
|
|
double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed;
|
|
double _timers[NUM_ITIMERS];
|
|
double _stopwatch[NUM_ITIMERS];
|
|
_alignvar(double _stopwatch_offload_neighbor[1],64);
|
|
_alignvar(double _stopwatch_offload_pair[1],64);
|
|
|
|
template <class ft>
|
|
void reduce_results(ft * _noalias const f_in);
|
|
|
|
template <class ft, class acc_t>
|
|
inline void add_results(const ft * _noalias const f_in,
|
|
const acc_t * _noalias const ev_global,
|
|
const int eatom, const int vatom,
|
|
const int offload);
|
|
|
|
template <class ft, class acc_t>
|
|
inline void add_oresults(const ft * _noalias const f_in,
|
|
const acc_t * _noalias const ev_global,
|
|
const int eatom, const int vatom,
|
|
const int out_offset, const int nall);
|
|
|
|
int _offload_affinity_balanced, _offload_threads, _offload_tpc;
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
int _max_offload_threads, _offload_cores, _offload_affinity_set;
|
|
int _im_real_space_task;
|
|
MPI_Comm _real_space_comm;
|
|
template <class ft, class acc_t>
|
|
inline void add_off_results(const ft * _noalias const f_in,
|
|
const acc_t * _noalias const ev_global);
|
|
#endif
|
|
};
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
|
|
int &minlocal) {
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
if (_separate_buffers) {
|
|
if (offload) {
|
|
if (neighbor->ago != 0) {
|
|
nlocal = _offload_nlocal;
|
|
nall = _offload_nall;
|
|
} else {
|
|
nlocal = atom->nlocal;
|
|
nall = nlocal + atom->nghost;
|
|
}
|
|
minlocal = 0;
|
|
} else {
|
|
nlocal = atom->nlocal;
|
|
nall = _host_nall;
|
|
minlocal = _host_min_local;
|
|
}
|
|
return;
|
|
}
|
|
if (_offload_noghost && offload)
|
|
nall = atom->nlocal;
|
|
else
|
|
#endif
|
|
nall = atom->nlocal + atom->nghost;
|
|
nlocal = atom->nlocal;
|
|
minlocal = 0;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
|
double *ev_in, const int offload,
|
|
const int eatom, const int vatom,
|
|
const int rflag) {
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
if (offload) {
|
|
_off_results_eatom = eatom;
|
|
_off_results_vatom = vatom;
|
|
_off_force_array_d = f_in;
|
|
_off_ev_array_d = ev_in;
|
|
return;
|
|
}
|
|
#endif
|
|
|
|
_force_array_d = f_in;
|
|
_ev_array_d = ev_in;
|
|
_results_eatom = eatom;
|
|
_results_vatom = vatom;
|
|
#ifndef _LMP_INTEL_OFFLOAD
|
|
if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
|
|
#endif
|
|
|
|
if (_overflow_flag[LMP_OVERFLOW])
|
|
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
|
double *ev_in, const int offload,
|
|
const int eatom, const int vatom,
|
|
const int rflag) {
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
if (offload) {
|
|
_off_results_eatom = eatom;
|
|
_off_results_vatom = vatom;
|
|
_off_force_array_m = f_in;
|
|
_off_ev_array_d = ev_in;
|
|
return;
|
|
}
|
|
#endif
|
|
|
|
_force_array_m = f_in;
|
|
_ev_array_d = ev_in;
|
|
_results_eatom = eatom;
|
|
_results_vatom = vatom;
|
|
#ifndef _LMP_INTEL_OFFLOAD
|
|
if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
|
|
#endif
|
|
|
|
if (_overflow_flag[LMP_OVERFLOW])
|
|
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
|
|
float *ev_in, const int offload,
|
|
const int eatom, const int vatom,
|
|
const int rflag) {
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
if (offload) {
|
|
_off_results_eatom = eatom;
|
|
_off_results_vatom = vatom;
|
|
_off_force_array_s = f_in;
|
|
_off_ev_array_s = ev_in;
|
|
return;
|
|
}
|
|
#endif
|
|
|
|
_force_array_s = f_in;
|
|
_ev_array_s = ev_in;
|
|
_results_eatom = eatom;
|
|
_results_vatom = vatom;
|
|
#ifndef _LMP_INTEL_OFFLOAD
|
|
if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
|
|
#endif
|
|
|
|
if (_overflow_flag[LMP_OVERFLOW])
|
|
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
int FixIntel::offload_end_neighbor() {
|
|
if (_offload_balance < 0.0) {
|
|
if (atom->nlocal < 2)
|
|
error->one(FLERR,"Too few atoms for load balancing offload");
|
|
double granularity = 1.0 / atom->nlocal;
|
|
if (_balance_neighbor < granularity)
|
|
_balance_neighbor = granularity + 1e-10;
|
|
else if (_balance_neighbor > 1.0 - granularity)
|
|
_balance_neighbor = 1.0 - granularity + 1e-10;
|
|
}
|
|
return _balance_neighbor * atom->nlocal;
|
|
}
|
|
|
|
int FixIntel::offload_end_pair() {
|
|
if (neighbor->ago == 0) return _balance_neighbor * atom->nlocal;
|
|
else return _balance_pair * atom->nlocal;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
double FixIntel::stop_watch(const int which) {
|
|
double elapsed = MPI_Wtime() - _stopwatch[which];
|
|
_timers[which] += elapsed;
|
|
return elapsed;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
void FixIntel::balance_stamp() {
|
|
if (_offload_balance < 0.0) {
|
|
double ct = MPI_Wtime();
|
|
_balance_other_time = ct;
|
|
_balance_pair_time = ct - _stopwatch[TIME_HOST_PAIR];
|
|
}
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
void FixIntel::acc_timers() {
|
|
_timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair;
|
|
if (neighbor->ago == 0) {
|
|
_timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor;
|
|
if (_setup_time_cleared == false) {
|
|
zero_timers();
|
|
_setup_time_cleared = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
void FixIntel::set_neighbor_host_sizes() {
|
|
_host_min_local = _overflow_flag[LMP_LOCAL_MIN];
|
|
_host_min_ghost = _overflow_flag[LMP_GHOST_MIN];
|
|
_host_used_local = atom->nlocal - _host_min_local;
|
|
_host_used_ghost = _overflow_flag[LMP_GHOST_MAX] + 1 - _host_min_ghost;
|
|
if (_host_used_ghost < 0) _host_used_ghost = 0;
|
|
_host_nall = atom->nlocal + _host_used_ghost;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
#endif
|
|
#endif
|
|
|
|
/* ERROR/WARNING messages:
|
|
|
|
E: The 'package intel' command is required for /intel styles
|
|
|
|
Self-explanatory.
|
|
|
|
W: Could not set host affinity for offload tasks
|
|
|
|
When using offload to a coprocessor, the application will try to set affinity
|
|
for host MPI tasks and OpenMP threads and will generate a warning if unable
|
|
to do so successfully. In the unsuccessful case, you might wish to set
|
|
affinity outside of the application and performance might suffer if
|
|
hyperthreading is disable on the CPU.
|
|
|
|
E: Neighbor list overflow, boost neigh_modify one
|
|
|
|
Increase the value for neigh_modify one to allow for larger allocations for
|
|
neighbor list builds. The value required can be different for the Intel
|
|
package in order to support offload to a coprocessor.
|
|
|
|
E: Bad matrix inversion in mldivide3
|
|
|
|
This error should not occur unless the matrix is badly formed.
|
|
|
|
E: Illegal package intel command
|
|
|
|
The format for the package intel command is incorrect. Please see the
|
|
documentation.
|
|
|
|
E: fix intel has to operate on group 'all'
|
|
|
|
Self explanatory.
|
|
|
|
E: Illegal package intel mode requested
|
|
|
|
The format for the package intel command is incorrect. Please see the
|
|
documentation.
|
|
|
|
E: Currently, neighbor style BIN must be used with Intel package.
|
|
|
|
This is the only neighbor style that has been implemented for the Intel
|
|
package.
|
|
|
|
E: Currently, cannot use neigh_modify exclude with Intel package.
|
|
|
|
This is a current restriction of the Intel package.
|
|
|
|
W: Unknown Intel Compiler Version
|
|
|
|
The compiler version used to build LAMMPS has not been tested with
|
|
offload to a coprocessor.
|
|
|
|
W: Unsupported Intel Compiler
|
|
|
|
The compiler version used to build LAMMPS is not supported when using
|
|
offload to a coprocessor. There could be performance or correctness
|
|
issues. Please use 14.0.1.106 or 15.1.133 or later.
|
|
|
|
E: Currently, cannot use more than one intel style with hybrid.
|
|
|
|
Currently, hybrid pair styles can only use the intel suffix for one of the
|
|
pair styles.
|
|
|
|
E: Cannot yet use hybrid styles with Intel package.
|
|
|
|
The hybrid pair style configuration is not yet supported by the Intel
|
|
package. Support is limited to hybrid/overlay or a hybrid style that does
|
|
not require a skip list.
|
|
|
|
W: Leaving a core/node free can improve performance for offload
|
|
|
|
When each CPU is fully subscribed with MPI tasks and OpenMP threads,
|
|
context switching with threads used for offload can sometimes decrease
|
|
performance. If you see this warning, try using fewer MPI tasks/OpenMP threads
|
|
per node to leave a physical CPU core free on each node.
|
|
|
|
E: MPI tasks per node must be multiple of offload_cards
|
|
|
|
For offload to multiple coprocessors on a single node, the Intel package
|
|
requires that each coprocessor is used by the same number of MPI tasks.
|
|
|
|
W: More MPI tasks/OpenMP threads than available cores
|
|
|
|
Using more MPI tasks/OpenMP threads than available cores will typically
|
|
decrease performance.
|
|
|
|
E: USER-INTEL package requires same setting for newton bond and non-bond.
|
|
|
|
The newton setting must be the same for both pairwise and bonded forces.
|
|
|
|
E: Intel styles for bond/angle/dihedral/improper require intel pair style."
|
|
|
|
You cannot use the USER-INTEL package for bond calculations without a
|
|
USER-INTEL supported pair style.
|
|
|
|
E: Intel styles for kspace require intel pair style.
|
|
|
|
You cannot use the USER-INTEL package for kspace calculations without a
|
|
USER-INTEL supported pair style.
|
|
|
|
E: Cannot currently get per-atom virials with intel package.
|
|
|
|
The Intel package does not yet support per-atom virial calculation.
|
|
|
|
E: Too few atoms for load balancing offload.
|
|
|
|
When using offload to a coprocessor, each MPI task must have at least 2
|
|
atoms throughout the simulation.
|
|
|
|
*/
|