diff --git a/src/USER-INTEL/angle_charmm_intel.cpp b/src/USER-INTEL/angle_charmm_intel.cpp new file mode 100644 index 0000000000..fd9f6911e1 --- /dev/null +++ b/src/USER-INTEL/angle_charmm_intel.cpp @@ -0,0 +1,331 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include +#include +#include "angle_charmm_intel.h" +#include "atom.h" +#include "neighbor.h" +#include "domain.h" +#include "comm.h" +#include "force.h" +#include "math_const.h" +#include "memory.h" +#include "suffix.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace MathConst; + +#define SMALL (flt_t)0.001 +typedef struct { int a,b,c,t; } int4_t; + +/* ---------------------------------------------------------------------- */ + +AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +AngleCharmmIntel::~AngleCharmmIntel() +{ +} + +/* ---------------------------------------------------------------------- */ + +void AngleCharmmIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + AngleCharmm::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void AngleCharmmIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = 0; + + if (evflag) { + if (eflag) { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,0,1>(vflag, buffers, fc); + else + eval<1,0,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void AngleCharmmIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) + +{ + const int inum = neighbor->nanglelist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + if (EFLAG) + oeangle = (acc_t)0.0; + if (vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, nto, tid; + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int4_t * _noalias const anglelist = + (int4_t *) neighbor->anglelist[0]; + + for (int n = nfrom; n < nto; n++) { + const int i1 = anglelist[n].a; + const int i2 = anglelist[n].b; + const int i3 = anglelist[n].c; + const int type = anglelist[n].t; + + // 1st bond + + const flt_t delx1 = x[i1].x - x[i2].x; + const flt_t dely1 = x[i1].y - x[i2].y; + const flt_t delz1 = x[i1].z - x[i2].z; + + const flt_t rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1; + const flt_t r1 = sqrt(rsq1); + + // 2nd bond + + const flt_t delx2 = x[i3].x - x[i2].x; + const flt_t dely2 = x[i3].y - x[i2].y; + const flt_t delz2 = x[i3].z - x[i2].z; + + const flt_t rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2; + const flt_t r2 = sqrt(rsq2); + + // Urey-Bradley bond + + const flt_t delxUB = x[i3].x - x[i1].x; + const flt_t delyUB = x[i3].y - x[i1].y; + const flt_t delzUB = x[i3].z - x[i1].z; + + const flt_t rsqUB = delxUB*delxUB + delyUB*delyUB + delzUB*delzUB; + const flt_t rUB = sqrt(rsqUB); + + // Urey-Bradley force & energy + + const flt_t dr = rUB - fc.fc[type].r_ub; + const flt_t rk = fc.fc[type].k_ub * dr; + + flt_t forceUB; + if (rUB > (flt_t)0.0) forceUB = (flt_t)-2.0*rk/rUB; + else forceUB = 0.0; + + flt_t eangle; + if (EFLAG) eangle = rk*dr; + + // angle (cos and sin) + + flt_t c = delx1*delx2 + dely1*dely2 + delz1*delz2; + c /= r1*r2; + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + flt_t s = sqrt((flt_t)1.0 - c*c); + if (s < SMALL) s = SMALL; + s = (flt_t)1.0/s; + + // harmonic force & energy + + const flt_t dtheta = acos(c) - fc.fc[type].theta0; + const flt_t tk = fc.fc[type].k * dtheta; + + if (EFLAG) eangle += tk*dtheta; + + const flt_t a = (flt_t)-2.0 * tk * s; + const flt_t a11 = a*c / rsq1; + const flt_t a12 = -a / (r1*r2); + const flt_t a22 = a*c / rsq2; + + const flt_t f1x = a11*delx1 + a12*delx2 - delxUB*forceUB; + const flt_t f1y = a11*dely1 + a12*dely2 - delyUB*forceUB; + const flt_t f1z = a11*delz1 + a12*delz2 - delzUB*forceUB; + + const flt_t f3x = a22*delx2 + a12*delx1 + delxUB*forceUB; + const flt_t f3y = a22*dely2 + a12*dely1 + delyUB*forceUB; + const flt_t f3z = a22*delz2 + a12*delz1 + delzUB*forceUB; + + // apply force to each of 3 atoms + + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= f1x + f3x; + f[i2].y -= f1y + f3y; + f[i2].z -= f1z + f3z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (EVFLAG) { + IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x, + f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1, + delx2, dely2, delz2, oeangle, f, NEWTON_BOND, + nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + } + } // for n + } // omp parallel + + if (EVFLAG) { + if (EFLAG) + energy += oeangle; + if (vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void AngleCharmmIntel::init_style() +{ + AngleCharmm::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void AngleCharmmIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->ndihedraltypes + 1; + fc.set_ntypes(bp1,memory); + + for (int i = 0; i < bp1; i++) { + fc.fc[i].k = k[i]; + fc.fc[i].theta0 = theta0[i]; + fc.fc[i].k_ub = k_ub[i]; + fc.fc[i].r_ub = r_ub[i]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void AngleCharmmIntel::ForceConst::set_ntypes(const int nangletypes, + Memory *memory) { + if (nangletypes != _nangletypes) { + if (_nangletypes > 0) + _memory->destroy(fc); + + if (nangletypes > 0) + _memory->create(fc,nangletypes,"anglecharmmintel.fc"); + } + _nangletypes = nangletypes; + _memory = memory; +} diff --git a/src/USER-INTEL/angle_charmm_intel.h b/src/USER-INTEL/angle_charmm_intel.h new file mode 100644 index 0000000000..a98007b3ef --- /dev/null +++ b/src/USER-INTEL/angle_charmm_intel.h @@ -0,0 +1,88 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef ANGLE_CLASS + +AngleStyle(charmm/intel,AngleCharmmIntel) + +#else + +#ifndef LMP_ANGLE_CHARMM_INTEL_H +#define LMP_ANGLE_CHARMM_INTEL_H + +#include +#include "angle_charmm.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class AngleCharmmIntel : public AngleCharmm { + public: + AngleCharmmIntel(class LAMMPS *); + virtual ~AngleCharmmIntel(); + virtual void compute(int, int); + virtual void init_style(); + + protected: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t k, theta0, k_ub, r_ub; } fc_packed1; + + fc_packed1 *fc; + ForceConst() : _nangletypes(0) {} + ~ForceConst() { set_ntypes(0, NULL); } + + void set_ntypes(const int nangletypes, Memory *memory); + + private: + int _nangletypes; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Incorrect args for angle coefficients + +Self-explanatory. Check the input script or data file. + +*/ diff --git a/src/USER-INTEL/bond_harmonic_intel.cpp b/src/USER-INTEL/bond_harmonic_intel.cpp new file mode 100644 index 0000000000..51a33b1cc3 --- /dev/null +++ b/src/USER-INTEL/bond_harmonic_intel.cpp @@ -0,0 +1,261 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include +#include +#include "bond_harmonic_intel.h" +#include "atom.h" +#include "neighbor.h" +#include "domain.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "suffix.h" +#include "error.h" + +using namespace LAMMPS_NS; + +typedef struct { int a,b,t; } int3_t; + +/* ---------------------------------------------------------------------- */ + +BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +BondHarmonicIntel::~BondHarmonicIntel() +{ +} + +/* ---------------------------------------------------------------------- */ + +void BondHarmonicIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + BondHarmonic::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void BondHarmonicIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = 0; + + if (evflag) { + if (eflag) { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,0,1>(vflag, buffers, fc); + else + eval<1,0,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +template +void BondHarmonicIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + const int inum = neighbor->nbondlist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + if (EFLAG) + oebond = (acc_t)0.0; + if (vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, nto, tid; + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int3_t * _noalias const bondlist = + (int3_t *) neighbor->bondlist[0]; + + for (int n = nfrom; n < nto; n++) { + const int i1 = bondlist[n].a; + const int i2 = bondlist[n].b; + const int type = bondlist[n].t; + + const flt_t delx = x[i1].x - x[i2].x; + const flt_t dely = x[i1].y - x[i2].y; + const flt_t delz = x[i1].z - x[i2].z; + + const flt_t rsq = delx*delx + dely*dely + delz*delz; + const flt_t r = sqrt(rsq); + const flt_t dr = r - fc.fc[type].r0; + const flt_t rk = fc.fc[type].k * dr; + + // force & energy + + flt_t fbond; + if (r > (flt_t)0.0) fbond = (flt_t)-2.0*rk/r; + else fbond = (flt_t)0.0; + + flt_t ebond; + if (EFLAG) ebond = rk*dr; + + // apply force to each of 2 atoms + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += delx*fbond; + f[i1].y += dely*fbond; + f[i1].z += delz*fbond; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= delx*fbond; + f[i2].y -= dely*fbond; + f[i2].z -= delz*fbond; + } + + if (EVFLAG) { + IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond, + delx, dely, delz, oebond, f, NEWTON_BOND, + nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + } + } // for n + } // omp parallel + + if (EVFLAG) { + if (EFLAG) + energy += oebond; + if (vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void BondHarmonicIntel::init_style() +{ + BondHarmonic::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void BondHarmonicIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->nbondtypes + 1; + fc.set_ntypes(bp1,memory); + + for (int i = 0; i < bp1; i++) { + fc.fc[i].k = k[i]; + fc.fc[i].r0 = r0[i]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void BondHarmonicIntel::ForceConst::set_ntypes(const int nbondtypes, + Memory *memory) { + if (nbondtypes != _nbondtypes) { + if (_nbondtypes > 0) + _memory->destroy(fc); + + if (nbondtypes > 0) + _memory->create(fc,nbondtypes,"bondharmonicintel.fc"); + } + _nbondtypes = nbondtypes; + _memory = memory; +} diff --git a/src/USER-INTEL/bond_harmonic_intel.h b/src/USER-INTEL/bond_harmonic_intel.h new file mode 100644 index 0000000000..0de844cddf --- /dev/null +++ b/src/USER-INTEL/bond_harmonic_intel.h @@ -0,0 +1,88 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef BOND_CLASS + +BondStyle(harmonic/intel,BondHarmonicIntel) + +#else + +#ifndef LMP_BOND_HARMONIC_INTEL_H +#define LMP_BOND_HARMONIC_INTEL_H + +#include +#include "bond_harmonic.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class BondHarmonicIntel : public BondHarmonic { + public: + BondHarmonicIntel(class LAMMPS *); + virtual ~BondHarmonicIntel(); + virtual void compute(int, int); + virtual void init_style(); + + protected: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t k, r0; } fc_packed1; + fc_packed1 *fc; + + ForceConst() : _nbondtypes(0) {} + ~ForceConst() { set_ntypes(0, NULL); } + + void set_ntypes(const int nbondtypes, Memory *memory); + + private: + int _nbondtypes; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Incorrect args for bond coefficients + +Self-explanatory. Check the input script or data file. + +*/ diff --git a/src/USER-INTEL/dihedral_charmm_intel.cpp b/src/USER-INTEL/dihedral_charmm_intel.cpp new file mode 100644 index 0000000000..d2b69f9c2f --- /dev/null +++ b/src/USER-INTEL/dihedral_charmm_intel.cpp @@ -0,0 +1,538 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "math.h" +#include "dihedral_charmm_intel.h" +#include "atom.h" +#include "comm.h" +#include "memory.h" +#include "neighbor.h" +#include "domain.h" +#include "force.h" +#include "pair.h" +#include "update.h" +#include "error.h" + +#include "suffix.h" +using namespace LAMMPS_NS; + +#define PTOLERANCE (flt_t)1.05 +#define MTOLERANCE (flt_t)-1.05 +#define SMALL (flt_t)0.001 +typedef struct { int a,b,c,d,t; } int5_t; + +/* ---------------------------------------------------------------------- */ + +DihedralCharmmIntel::DihedralCharmmIntel(class LAMMPS *lmp) + : DihedralCharmm(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +void DihedralCharmmIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + DihedralCharmm::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralCharmmIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag,vflag); + } else evflag = 0; + + // insure pair->ev_tally() will use 1-4 virial contribution + + if (weightflag && vflag_global == 2) + force->pair->vflag_either = force->pair->vflag_global = 1; + + if (evflag) { + if (eflag) { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,0,1>(vflag, buffers, fc); + else + eval<1,0,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +template +void DihedralCharmmIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) + +{ + const int inum = neighbor->ndihedrallist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + flt_t * _noalias const q = buffers->get_q(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; + acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5; + if (EVFLAG) { + if (EFLAG) + oevdwl = oecoul = oedihedral = (acc_t)0.0; + if (vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0; + } + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \ + opv0,opv1,opv2,opv3,opv4,opv5) + #endif + { + int nfrom, nto, tid; + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int5_t * _noalias const dihedrallist = + (int5_t *) neighbor->dihedrallist[0]; + const flt_t qqrd2e = force->qqrd2e; + + acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5; + if (EVFLAG) { + if (EFLAG) + sevdwl = secoul = sedihedral = (acc_t)0.0; + if (vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0; + } + } + + for (int n = nfrom; n < nto; n++) { + const int i1 = dihedrallist[n].a; + const int i2 = dihedrallist[n].b; + const int i3 = dihedrallist[n].c; + const int i4 = dihedrallist[n].d; + const int type = dihedrallist[n].t; + + // 1st bond + + const flt_t vb1x = x[i1].x - x[i2].x; + const flt_t vb1y = x[i1].y - x[i2].y; + const flt_t vb1z = x[i1].z - x[i2].z; + const int itype = x[i1].w; + + // 2nd bond + + const flt_t vb2xm = x[i2].x - x[i3].x; + const flt_t vb2ym = x[i2].y - x[i3].y; + const flt_t vb2zm = x[i2].z - x[i3].z; + + // 3rd bond + + const flt_t vb3x = x[i4].x - x[i3].x; + const flt_t vb3y = x[i4].y - x[i3].y; + const flt_t vb3z = x[i4].z - x[i3].z; + const int jtype = x[i4].w; + + // 1-4 + + const flt_t delx = x[i1].x - x[i4].x; + const flt_t dely = x[i1].y - x[i4].y; + const flt_t delz = x[i1].z - x[i4].z; + + + // c,s calculation + + const flt_t ax = vb1y*vb2zm - vb1z*vb2ym; + const flt_t ay = vb1z*vb2xm - vb1x*vb2zm; + const flt_t az = vb1x*vb2ym - vb1y*vb2xm; + const flt_t bx = vb3y*vb2zm - vb3z*vb2ym; + const flt_t by = vb3z*vb2xm - vb3x*vb2zm; + const flt_t bz = vb3x*vb2ym - vb3y*vb2xm; + + const flt_t rasq = ax*ax + ay*ay + az*az; + const flt_t rbsq = bx*bx + by*by + bz*bz; + const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm; + const flt_t rg = sqrt(rgsq); + + flt_t rginv, ra2inv, rb2inv; + rginv = ra2inv = rb2inv = (flt_t)0.0; + if (rg > 0) rginv = (flt_t)1.0/rg; + if (rasq > 0) ra2inv = (flt_t)1.0/rasq; + if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq; + const flt_t rabinv = sqrt(ra2inv*rb2inv); + + flt_t c = (ax*bx + ay*by + az*bz)*rabinv; + const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z); + + // error check + if (c > PTOLERANCE || c < MTOLERANCE) { + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } + } + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + const flt_t tcos_shift = fc.bp[type].cos_shift; + const flt_t tsin_shift = fc.bp[type].sin_shift; + const flt_t tk = fc.bp[type].k; + const int m = fc.bp[type].multiplicity; + + flt_t p = (flt_t)1.0; + flt_t ddf1, df1; + ddf1 = df1 = (flt_t)0.0; + + for (int i = 0; i < m; i++) { + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; + } + + p = p*tcos_shift + df1*tsin_shift; + df1 = df1*tcos_shift - ddf1*tsin_shift; + df1 *= -m; + p += (flt_t)1.0; + + if (m == 0) { + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; + } + + const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; + const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm; + const flt_t fga = fg*ra2inv*rginv; + const flt_t hgb = hg*rb2inv*rginv; + const flt_t gaa = -ra2inv*rg; + const flt_t gbb = rb2inv*rg; + + const flt_t dtfx = gaa*ax; + const flt_t dtfy = gaa*ay; + const flt_t dtfz = gaa*az; + const flt_t dtgx = fga*ax - hgb*bx; + const flt_t dtgy = fga*ay - hgb*by; + const flt_t dtgz = fga*az - hgb*bz; + const flt_t dthx = gbb*bx; + const flt_t dthy = gbb*by; + const flt_t dthz = gbb*bz; + + const flt_t df = -tk * df1; + + const flt_t sx2 = df*dtgx; + const flt_t sy2 = df*dtgy; + const flt_t sz2 = df*dtgz; + + flt_t f1x = df*dtfx; + flt_t f1y = df*dtfy; + flt_t f1z = df*dtfz; + + const flt_t f2x = sx2 - f1x; + const flt_t f2y = sy2 - f1y; + const flt_t f2z = sz2 - f1z; + + flt_t f4x = df*dthx; + flt_t f4y = df*dthy; + flt_t f4z = df*dthz; + + const flt_t f3x = -sx2 - f4x; + const flt_t f3y = -sy2 - f4y; + const flt_t f3z = -sz2 - f4z; + + if (EVFLAG) { + flt_t deng; + if (EFLAG) deng = tk * p; + IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, + f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, + vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, + vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + } + + + { + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + } + + // 1-4 LJ and Coulomb interactions + // tally energy/virial in pair, using newton_bond as newton flag + + const flt_t tweight = fc.weight[type]; + const flt_t rsq = delx*delx + dely*dely + delz*delz; + const flt_t r2inv = (flt_t)1.0/rsq; + const flt_t r6inv = r2inv*r2inv*r2inv; + + flt_t forcecoul; + if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv; + else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv); + const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv - + fc.ljp[itype][jtype].lj2); + const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv; + + if (NEWTON_BOND || i1 < nlocal) { + f1x += delx*fpair; + f1y += dely*fpair; + f1z += delz*fpair; + } + if (NEWTON_BOND || i4 < nlocal) { + f4x -= delx*fpair; + f4y -= dely*fpair; + f4z -= delz*fpair; + } + + if (EVFLAG) { + flt_t ev_pre = (flt_t)0; + if (NEWTON_BOND || i1 < nlocal) + ev_pre += (flt_t)0.5; + if (NEWTON_BOND || i4 < nlocal) + ev_pre += (flt_t)0.5; + + if (EFLAG) { + flt_t ecoul, evdwl; + ecoul = tweight * forcecoul; + evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv - + fc.ljp[itype][jtype].lj4); + secoul += ev_pre * ecoul; + sevdwl += ev_pre * evdwl; + if (eatom) { + evdwl *= (flt_t)0.5; + evdwl += (flt_t)0.5 * ecoul; + if (NEWTON_BOND || i1 < nlocal) + f[i1].w += evdwl; + if (NEWTON_BOND || i4 < nlocal) + f[i4].w += evdwl; + } + } + if (vflag) { + spv0 += ev_pre * delx * delx * fpair; + spv1 += ev_pre * dely * dely * fpair; + spv2 += ev_pre * delz * delz * fpair; + spv3 += ev_pre * delx * dely * fpair; + spv4 += ev_pre * delx * delz * fpair; + spv5 += ev_pre * dely * delz * fpair; + } + } + + // apply force to each of 4 atoms + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } + } + } // for n + if (EVFLAG) { + if (EFLAG) { + oedihedral += sedihedral; + oecoul += secoul; + oevdwl += sevdwl; + } + if (vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; + opv0 += spv0; opv1 += spv1; opv2 += spv2; + opv3 += spv3; opv4 += spv4; opv5 += spv5; + } + } + } // omp parallel + + if (EVFLAG) { + if (EFLAG) { + energy += oedihedral; + force->pair->eng_vdwl += oevdwl; + force->pair->eng_coul += oecoul; + } + if (vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + force->pair->virial[0] += opv0; + force->pair->virial[1] += opv1; + force->pair->virial[2] += opv2; + force->pair->virial[3] += opv3; + force->pair->virial[4] += opv4; + force->pair->virial[5] += opv5; + } + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void DihedralCharmmIntel::init_style() +{ + DihedralCharmm::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralCharmmIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + + const int tp1 = atom->ntypes + 1; + const int bp1 = atom->ndihedraltypes + 1; + fc.set_ntypes(tp1,bp1,memory); + buffers->set_ntypes(tp1); + + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.ljp[i][j].lj1 = lj14_1[i][j]; + fc.ljp[i][j].lj2 = lj14_2[i][j]; + fc.ljp[i][j].lj3 = lj14_3[i][j]; + fc.ljp[i][j].lj4 = lj14_4[i][j]; + } + } + + for (int i = 0; i < bp1; i++) { + fc.bp[i].multiplicity = multiplicity[i]; + fc.bp[i].cos_shift = cos_shift[i]; + fc.bp[i].sin_shift = sin_shift[i]; + fc.bp[i].k = k[i]; + fc.weight[i] = weight[i]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralCharmmIntel::ForceConst::set_ntypes(const int npairtypes, + const int nbondtypes, + Memory *memory) { + if (npairtypes != _npairtypes) { + if (_npairtypes > 0) + _memory->destroy(ljp); + if (npairtypes > 0) + memory->create(ljp,npairtypes,npairtypes,"fc.ljp"); + } + + if (nbondtypes != _nbondtypes) { + if (_nbondtypes > 0) { + _memory->destroy(bp); + _memory->destroy(weight); + } + + if (nbondtypes > 0) { + _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); + _memory->create(weight,nbondtypes,"dihedralcharmmintel.weight"); + } + } + _npairtypes = npairtypes; + _nbondtypes = nbondtypes; + _memory = memory; +} diff --git a/src/USER-INTEL/dihedral_charmm_intel.h b/src/USER-INTEL/dihedral_charmm_intel.h new file mode 100644 index 0000000000..292faea9f9 --- /dev/null +++ b/src/USER-INTEL/dihedral_charmm_intel.h @@ -0,0 +1,84 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef DIHEDRAL_CLASS + +DihedralStyle(charmm/intel,DihedralCharmmIntel) + +#else + +#ifndef LMP_DIHEDRAL_CHARMM_INTEL_H +#define LMP_DIHEDRAL_CHARMM_INTEL_H + +#include "dihedral_charmm.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class DihedralCharmmIntel : public DihedralCharmm { + + public: + DihedralCharmmIntel(class LAMMPS *lmp); + virtual void compute(int, int); + void init_style(); + + private: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1; + typedef struct { flt_t cos_shift, sin_shift, k; + int multiplicity; } fc_packed3; + + fc_packed1 **ljp; + fc_packed3 *bp; + flt_t *weight; + + ForceConst() : _npairtypes(0), _nbondtypes(0) {} + ~ForceConst() { set_ntypes(0, 0, NULL); } + + void set_ntypes(const int npairtypes, const int nbondtypes, Memory *memory); + + private: + int _npairtypes, _nbondtypes; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index fac76939dd..295e670783 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -71,6 +71,12 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _offload_threads = 0; _offload_tpc = 4; + _force_array_s = 0; + _force_array_m = 0; + _force_array_d = 0; + _ev_array_s = 0; + _ev_array_d = 0; + #ifdef _LMP_INTEL_OFFLOAD if (ncops < 0) error->all(FLERR,"Illegal package intel command"); _offload_affinity_set = 0; @@ -293,6 +299,8 @@ void FixIntel::init() _mixed_buffers->zero_ev(); else _double_buffers->zero_ev(); + + _need_reduce = 0; } /* ---------------------------------------------------------------------- */ @@ -317,6 +325,10 @@ void FixIntel::pair_init_check(const bool cdmessage) _nbor_pack_width = 1; + if (strstr(update->integrate_style,"intel") == 0) + error->all(FLERR, + "Specified run_style does not support the Intel package."); + #ifdef _LMP_INTEL_OFFLOAD if (_offload_balance != 0.0) atom->sortfreq = 1; @@ -348,9 +360,6 @@ void FixIntel::pair_init_check(const bool cdmessage) _sync_at_pair = 2; } else { _sync_at_pair = 0; - if (strstr(update->integrate_style,"intel") == 0) - error->all(FLERR, - "Specified run_style does not support the Intel package."); } #endif _nthreads = comm->nthreads; @@ -415,6 +424,35 @@ void FixIntel::pair_init_check(const bool cdmessage) /* ---------------------------------------------------------------------- */ +void FixIntel::bond_init_check() +{ + if (_offload_balance != 0.0 && atom->molecular && + force->newton_pair != force->newton_bond) + error->all(FLERR, + "USER-INTEL package requires same setting for newton bond and non-bond."); + + int intel_pair = 0; + if (force->pair_match("/intel", 0) != NULL) + intel_pair = 1; + else if (force->pair_match("hybrid", 1) != NULL) { + PairHybrid *hybrid = (PairHybrid *) force->pair; + for (int i = 0; i < hybrid->nstyles; i++) + if (strstr(hybrid->keywords[i], "/intel") != NULL) + intel_pair = 1; + } else if (force->pair_match("hybrid/overlay", 1) != NULL) { + PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair; + for (int i = 0; i < hybrid->nstyles; i++) + if (strstr(hybrid->keywords[i], "/intel") != NULL) + intel_pair = 1; + } + + if (intel_pair == 0) + error->all(FLERR, "Intel styles for bond/angle/dihedral/improper " + "require intel pair style."); +} + +/* ---------------------------------------------------------------------- */ + void FixIntel::check_neighbor_intel() { #ifdef _LMP_INTEL_OFFLOAD @@ -436,6 +474,82 @@ void FixIntel::check_neighbor_intel() /* ---------------------------------------------------------------------- */ +void FixIntel::sync() +{ + if (_force_array_m != 0) { + if (_need_reduce) { + reduce_results(_force_array_m); + _need_reduce = 0; + } + add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom, 0); + _force_array_m = 0; + } else if (_force_array_d != 0) { + if (_need_reduce) { + reduce_results(_force_array_d); + _need_reduce = 0; + } + add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom, 0); + _force_array_d = 0; + } else if (_force_array_s != 0) { + if (_need_reduce) { + reduce_results(_force_array_s); + _need_reduce = 0; + } + add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom, 0); + _force_array_s = 0; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void FixIntel::reduce_results(ft * _noalias const f_start) +{ + int o_range, f_stride; + if (force->newton_pair) + o_range = atom->nlocal + atom->nghost; + else + o_range = atom->nlocal; + IP_PRE_get_stride(f_stride, o_range, sizeof(ft), lmp->atom->torque); + + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(o_range, f_stride) + #endif + { + int iifrom, iito, tid; + IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads, + sizeof(ft)); + + int t_off = f_stride; + if (_results_eatom) { + for (int t = 1; t < _nthreads; t++) { + _use_simd_pragma("vector nontemporal") + _use_simd_pragma("novector") + for (int n = iifrom; n < iito; n++) { + f_start[n].x += f_start[n + t_off].x; + f_start[n].y += f_start[n + t_off].y; + f_start[n].z += f_start[n + t_off].z; + f_start[n].w += f_start[n + t_off].w; + } + t_off += f_stride; + } + } else { + for (int t = 1; t < _nthreads; t++) { + _use_simd_pragma("vector nontemporal") + _use_simd_pragma("novector") + for (int n = iifrom; n < iito; n++) { + f_start[n].x += f_start[n + t_off].x; + f_start[n].y += f_start[n + t_off].y; + f_start[n].z += f_start[n + t_off].z; + } + t_off += f_stride; + } + } + } +} + +/* ---------------------------------------------------------------------- */ + void FixIntel::sync_coprocessor() { #ifdef _LMP_INTEL_OFFLOAD @@ -456,6 +570,153 @@ void FixIntel::sync_coprocessor() /* ---------------------------------------------------------------------- */ +template +void FixIntel::add_results(const ft * _noalias const f_in, + const acc_t * _noalias const ev_global, + const int eatom, const int vatom, + const int offload) { + start_watch(TIME_PACK); + int f_length; + #ifdef _LMP_INTEL_OFFLOAD + if (_separate_buffers) { + if (offload) { + add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); + if (force->newton_pair) { + const acc_t * _noalias const enull = 0; + int offset = _offload_nlocal; + if (atom->torque) offset *= 2; + add_oresults(f_in + offset, enull, eatom, vatom, + _offload_min_ghost, _offload_nghost); + } + } else { + add_oresults(f_in, ev_global, eatom, vatom, + _host_min_local, _host_used_local); + if (force->newton_pair) { + const acc_t * _noalias const enull = 0; + int offset = _host_used_local; + if (atom->torque) offset *= 2; + add_oresults(f_in + offset, enull, eatom, + vatom, _host_min_ghost, _host_used_ghost); + } + } + stop_watch(TIME_PACK); + return; + } + if (force->newton_pair && (_offload_noghost == 0 || offload == 0)) + f_length = atom->nlocal + atom->nghost; + else + f_length = atom->nlocal; + #else + if (force->newton_pair) + f_length = atom->nlocal + atom->nghost; + else + f_length = atom->nlocal; + #endif + + add_oresults(f_in, ev_global, eatom, vatom, 0, f_length); + stop_watch(TIME_PACK); +} + +/* ---------------------------------------------------------------------- */ + +template +void FixIntel::add_oresults(const ft * _noalias const f_in, + const acc_t * _noalias const ev_global, + const int eatom, const int vatom, + const int out_offset, const int nall) { + lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset; + if (atom->torque) { + if (f_in[1].w) + if (f_in[1].w == 1) + error->all(FLERR,"Bad matrix inversion in mldivide3"); + else + error->all(FLERR, + "Sphere particles not yet supported for gayberne/intel"); + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) + #endif + { + #if defined(_OPENMP) + const int tid = omp_get_thread_num(); + #else + const int tid = 0; + #endif + int ifrom, ito; + IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t)); + if (atom->torque) { + int ii = ifrom * 2; + lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] + + out_offset; + if (eatom) { + double * _noalias const lmp_eatom = force->pair->eatom + out_offset; + #if defined(LMP_SIMD_COMPILER) + #pragma novector + #endif + for (int i = ifrom; i < ito; i++) { + f[i].x += f_in[ii].x; + f[i].y += f_in[ii].y; + f[i].z += f_in[ii].z; + lmp_eatom[i] += f_in[ii].w; + tor[i].x += f_in[ii+1].x; + tor[i].y += f_in[ii+1].y; + tor[i].z += f_in[ii+1].z; + ii += 2; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma novector + #endif + for (int i = ifrom; i < ito; i++) { + f[i].x += f_in[ii].x; + f[i].y += f_in[ii].y; + f[i].z += f_in[ii].z; + tor[i].x += f_in[ii+1].x; + tor[i].y += f_in[ii+1].y; + tor[i].z += f_in[ii+1].z; + ii += 2; + } + } + } else { + if (eatom) { + double * _noalias const lmp_eatom = force->pair->eatom + out_offset; + #if defined(LMP_SIMD_COMPILER) + #pragma novector + #endif + for (int i = ifrom; i < ito; i++) { + f[i].x += f_in[i].x; + f[i].y += f_in[i].y; + f[i].z += f_in[i].z; + lmp_eatom[i] += f_in[i].w; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma novector + #endif + for (int i = ifrom; i < ito; i++) { + f[i].x += f_in[i].x; + f[i].y += f_in[i].y; + f[i].z += f_in[i].z; + } + } + } + } + + if (ev_global != NULL) { + force->pair->eng_vdwl += ev_global[0]; + force->pair->eng_coul += ev_global[1]; + force->pair->virial[0] += ev_global[2]; + force->pair->virial[1] += ev_global[3]; + force->pair->virial[2] += ev_global[4]; + force->pair->virial[3] += ev_global[5]; + force->pair->virial[4] += ev_global[6]; + force->pair->virial[5] += ev_global[7]; + } +} + +/* ---------------------------------------------------------------------- */ + double FixIntel::memory_usage() { double bytes; @@ -473,6 +734,70 @@ double FixIntel::memory_usage() #ifdef _LMP_INTEL_OFFLOAD +/* ---------------------------------------------------------------------- */ + +template +void FixIntel::add_off_results(const ft * _noalias const f_in, + const acc_t * _noalias const ev_global) { + if (_offload_balance < 0.0) + _balance_other_time = MPI_Wtime() - _balance_other_time; + + start_watch(TIME_OFFLOAD_WAIT); + #ifdef _LMP_INTEL_OFFLOAD + #pragma offload_wait target(mic:_cop) wait(f_in) + #endif + double wait_time = stop_watch(TIME_OFFLOAD_WAIT); + + if (neighbor->ago == 0) { + if (_off_overflow_flag[LMP_OVERFLOW]) + error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + _offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1; + _offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN]; + _offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 - + _offload_min_ghost; + if (_offload_nghost < 0) _offload_nghost = 0; + _offload_nall = _offload_nlocal + _offload_nghost; + _offload_nlocal; + } + + int nlocal = atom->nlocal; + // Load balance? + if (_offload_balance < 0.0) { + if (neighbor->ago == 0) + _balance_pair = _balance_neighbor; + double mic_time; + mic_time = *_stopwatch_offload_pair; + if (_balance_pair_time + _balance_other_time < mic_time) { + double ft = _balance_pair_time + _balance_other_time + wait_time - + mic_time; + _balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed + + INTEL_LB_MEAN_WEIGHT * ft; + } + + double ctps = _balance_pair_time / (1.0-_balance_pair); + double otps = mic_time / _balance_pair; + double new_balance = (ctps + _balance_other_time - _balance_fixed) / + (otps + ctps); + if (new_balance < 0.01) new_balance = 0.01; + else if (new_balance > 0.99) new_balance = 0.99; + _balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor + + INTEL_LB_MEAN_WEIGHT * new_balance; + } + + #ifdef TIME_BALANCE + start_watch(TIME_IMBALANCE); + MPI_Barrier(_real_space_comm); + stop_watch(TIME_IMBALANCE); + #endif + acc_timers(); + if (atom->torque) + if (f_in[1].w < 0.0) + error->all(FLERR, "Bad matrix inversion in mldivide3"); + add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1); +} + +/* ---------------------------------------------------------------------- */ + void FixIntel::output_timing_data() { if (_im_real_space_task == 0 || _offload_affinity_set == 0) return; diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index c1e5dd1061..4035434b9d 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -44,7 +44,9 @@ class FixIntel : public Fix { virtual void init(); virtual void setup(int); void pair_init_check(const bool cdmessage=false); + void bond_init_check(); + void sync(); // Get all forces, calculation results from coprocesser void sync_coprocessor(); @@ -65,6 +67,12 @@ class FixIntel : public Fix { inline int nbor_pack_width() const { return _nbor_pack_width; } inline void nbor_pack_width(const int w) { _nbor_pack_width = w; } + inline int need_zero(const int tid) { + if (_need_reduce == 0 && tid > 0) return 1; + return 0; + } + inline void set_reduce_flag() { _need_reduce = 1; } + protected: IntelBuffers *_single_buffers; IntelBuffers *_mixed_buffers; @@ -77,13 +85,16 @@ class FixIntel : public Fix { inline int* get_off_overflow_flag() { return _off_overflow_flag; } inline void add_result_array(IntelBuffers::vec3_acc_t *f_in, double *ev_in, const int offload, - const int eatom = 0, const int vatom = 0); + const int eatom = 0, const int vatom = 0, + const int rflag = 0); inline void add_result_array(IntelBuffers::vec3_acc_t *f_in, double *ev_in, const int offload, - const int eatom = 0, const int vatom = 0); + const int eatom = 0, const int vatom = 0, + const int rflag = 0); inline void add_result_array(IntelBuffers::vec3_acc_t *f_in, float *ev_in, const int offload, - const int eatom = 0, const int vatom = 0); + const int eatom = 0, const int vatom = 0, + const int rflag = 0); inline void get_buffern(const int offload, int &nlocal, int &nall, int &minlocal); @@ -139,6 +150,15 @@ class FixIntel : public Fix { int _overflow_flag[5]; _alignvar(int _off_overflow_flag[5],64); int _allow_separate_buffers, _offload_ghost; + + IntelBuffers::vec3_acc_t *_force_array_s; + IntelBuffers::vec3_acc_t *_force_array_m; + IntelBuffers::vec3_acc_t *_force_array_d; + float *_ev_array_s; + double *_ev_array_d; + int _results_eatom, _results_vatom; + int _need_reduce; + #ifdef _LMP_INTEL_OFFLOAD double _balance_pair_time, _balance_other_time; int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost; @@ -168,6 +188,9 @@ class FixIntel : public Fix { _alignvar(double _stopwatch_offload_neighbor[1],64); _alignvar(double _stopwatch_offload_pair[1],64); + template + void reduce_results(ft * _noalias const f_in); + template inline void add_results(const ft * _noalias const f_in, const acc_t * _noalias const ev_global, @@ -226,7 +249,8 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, double *ev_in, const int offload, - const int eatom, const int vatom) { + const int eatom, const int vatom, + const int rflag) { #ifdef _LMP_INTEL_OFFLOAD if (offload) { _off_results_eatom = eatom; @@ -237,11 +261,23 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, return; } #endif - add_results(f_in, ev_in, eatom, vatom, 0); + + _force_array_d = f_in; + _ev_array_d = ev_in; + _results_eatom = eatom; + _results_vatom = vatom; + #ifndef _LMP_INTEL_OFFLOAD + if (rflag != 2 && _nthreads > 1) _need_reduce = 1; + #endif + if (_overflow_flag[LMP_OVERFLOW]) error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + #ifdef _LMP_INTEL_OFFLOAD - if (_sync_at_pair) sync_coprocessor(); + if (_sync_at_pair) { + sync(); + sync_coprocessor(); + } #endif } @@ -249,7 +285,8 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, double *ev_in, const int offload, - const int eatom, const int vatom) { + const int eatom, const int vatom, + const int rflag) { #ifdef _LMP_INTEL_OFFLOAD if (offload) { _off_results_eatom = eatom; @@ -260,11 +297,23 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, return; } #endif - add_results(f_in, ev_in, eatom, vatom, 0); + + _force_array_m = f_in; + _ev_array_d = ev_in; + _results_eatom = eatom; + _results_vatom = vatom; + #ifndef _LMP_INTEL_OFFLOAD + if (rflag != 2 && _nthreads > 1) _need_reduce = 1; + #endif + if (_overflow_flag[LMP_OVERFLOW]) error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + #ifdef _LMP_INTEL_OFFLOAD - if (_sync_at_pair) sync_coprocessor(); + if (_sync_at_pair) { + sync(); + sync_coprocessor(); + } #endif } @@ -272,7 +321,8 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, float *ev_in, const int offload, - const int eatom, const int vatom) { + const int eatom, const int vatom, + const int rflag) { #ifdef _LMP_INTEL_OFFLOAD if (offload) { _off_results_eatom = eatom; @@ -283,161 +333,28 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, return; } #endif - add_results(f_in, ev_in, eatom, vatom, 0); + + _force_array_s = f_in; + _ev_array_s = ev_in; + _results_eatom = eatom; + _results_vatom = vatom; + #ifndef _LMP_INTEL_OFFLOAD + if (rflag != 2 && _nthreads > 1) _need_reduce = 1; + #endif + if (_overflow_flag[LMP_OVERFLOW]) error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + #ifdef _LMP_INTEL_OFFLOAD - if (_sync_at_pair) sync_coprocessor(); + if (_sync_at_pair) { + sync(); + sync_coprocessor(); + } #endif } /* ---------------------------------------------------------------------- */ -template -void FixIntel::add_results(const ft * _noalias const f_in, - const acc_t * _noalias const ev_global, - const int eatom, const int vatom, - const int offload) { - start_watch(TIME_PACK); - int f_length; - #ifdef _LMP_INTEL_OFFLOAD - if (_separate_buffers) { - if (offload) { - add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); - if (force->newton_pair) { - const acc_t * _noalias const enull = 0; - int offset = _offload_nlocal; - if (atom->torque) offset *= 2; - add_oresults(f_in + offset, enull, eatom, vatom, - _offload_min_ghost, _offload_nghost); - } - } else { - add_oresults(f_in, ev_global, eatom, vatom, - _host_min_local, _host_used_local); - if (force->newton_pair) { - const acc_t * _noalias const enull = 0; - int offset = _host_used_local; - if (atom->torque) offset *= 2; - add_oresults(f_in + offset, enull, eatom, - vatom, _host_min_ghost, _host_used_ghost); - } - } - stop_watch(TIME_PACK); - return; - } - if (force->newton_pair && (_offload_noghost == 0 || offload == 0)) - f_length = atom->nlocal + atom->nghost; - else - f_length = atom->nlocal; - #else - if (force->newton_pair) - f_length = atom->nlocal + atom->nghost; - else - f_length = atom->nlocal; - #endif - - add_oresults(f_in, ev_global, eatom, vatom, 0, f_length); - stop_watch(TIME_PACK); -} - -/* ---------------------------------------------------------------------- */ - -template -void FixIntel::add_oresults(const ft * _noalias const f_in, - const acc_t * _noalias const ev_global, - const int eatom, const int vatom, - const int out_offset, const int nall) { - lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset; - if (atom->torque) { - if (f_in[1].w) - if (f_in[1].w == 1) - error->all(FLERR,"Bad matrix inversion in mldivide3"); - else - error->all(FLERR, - "Sphere particles not yet supported for gayberne/intel"); - } - - #if defined(_OPENMP) - #pragma omp parallel default(none) - #endif - { - #if defined(_OPENMP) - const int tid = omp_get_thread_num(); - #else - const int tid = 0; - #endif - int ifrom, ito; - IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t)); - if (atom->torque) { - int ii = ifrom * 2; - lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] + - out_offset; - if (eatom) { - double * _noalias const lmp_eatom = force->pair->eatom + out_offset; - #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif - for (int i = ifrom; i < ito; i++) { - f[i].x += f_in[ii].x; - f[i].y += f_in[ii].y; - f[i].z += f_in[ii].z; - lmp_eatom[i] += f_in[ii].w; - tor[i].x += f_in[ii+1].x; - tor[i].y += f_in[ii+1].y; - tor[i].z += f_in[ii+1].z; - ii += 2; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif - for (int i = ifrom; i < ito; i++) { - f[i].x += f_in[ii].x; - f[i].y += f_in[ii].y; - f[i].z += f_in[ii].z; - tor[i].x += f_in[ii+1].x; - tor[i].y += f_in[ii+1].y; - tor[i].z += f_in[ii+1].z; - ii += 2; - } - } - } else { - if (eatom) { - double * _noalias const lmp_eatom = force->pair->eatom + out_offset; - #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif - for (int i = ifrom; i < ito; i++) { - f[i].x += f_in[i].x; - f[i].y += f_in[i].y; - f[i].z += f_in[i].z; - lmp_eatom[i] += f_in[i].w; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif - for (int i = ifrom; i < ito; i++) { - f[i].x += f_in[i].x; - f[i].y += f_in[i].y; - f[i].z += f_in[i].z; - } - } - } - } - - if (ev_global != NULL) { - force->pair->eng_vdwl += ev_global[0]; - force->pair->eng_coul += ev_global[1]; - force->pair->virial[0] += ev_global[2]; - force->pair->virial[1] += ev_global[3]; - force->pair->virial[2] += ev_global[4]; - force->pair->virial[3] += ev_global[5]; - force->pair->virial[4] += ev_global[6]; - force->pair->virial[5] += ev_global[7]; - } -} - #ifdef _LMP_INTEL_OFFLOAD /* ---------------------------------------------------------------------- */ @@ -491,70 +408,6 @@ void FixIntel::set_neighbor_host_sizes() { /* ---------------------------------------------------------------------- */ -template -void FixIntel::add_off_results(const ft * _noalias const f_in, - const acc_t * _noalias const ev_global) { - if (_offload_balance < 0.0) - _balance_other_time = MPI_Wtime() - _balance_other_time; - - start_watch(TIME_OFFLOAD_WAIT); - #ifdef _LMP_INTEL_OFFLOAD - if (neighbor->ago == 0) { - #pragma offload_wait target(mic:_cop) wait(atom->tag, f_in) - } else { - #pragma offload_wait target(mic:_cop) wait(f_in) - } - #endif - double wait_time = stop_watch(TIME_OFFLOAD_WAIT); - - if (neighbor->ago == 0) { - if (_off_overflow_flag[LMP_OVERFLOW]) - error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); - _offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1; - _offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN]; - _offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 - - _offload_min_ghost; - if (_offload_nghost < 0) _offload_nghost = 0; - _offload_nall = _offload_nlocal + _offload_nghost; - _offload_nlocal; - } - - int nlocal = atom->nlocal; - // Load balance? - if (_offload_balance < 0.0) { - if (neighbor->ago == 0) - _balance_pair = _balance_neighbor; - double mic_time; - mic_time = *_stopwatch_offload_pair; - if (_balance_pair_time + _balance_other_time < mic_time) { - double ft = _balance_pair_time + _balance_other_time + wait_time - - mic_time; - _balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed + - INTEL_LB_MEAN_WEIGHT * ft; - } - - double ctps = _balance_pair_time / (1.0-_balance_pair); - double otps = mic_time / _balance_pair; - double new_balance = (ctps + _balance_other_time - _balance_fixed) / - (otps + ctps); - if (new_balance < 0.01) new_balance = 0.01; - else if (new_balance > 0.99) new_balance = 0.99; - _balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor + - INTEL_LB_MEAN_WEIGHT * new_balance; - } - - #ifdef TIME_BALANCE - start_watch(TIME_IMBALANCE); - MPI_Barrier(_real_space_comm); - stop_watch(TIME_IMBALANCE); - #endif - acc_timers(); - if (atom->torque) - if (f_in[1].w < 0.0) - error->all(FLERR, "Bad matrix inversion in mldivide3"); - add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1); -} - #endif } @@ -653,4 +506,13 @@ W: More MPI tasks/OpenMP threads than available cores Using more MPI tasks/OpenMP threads than available cores will typically decrease performance. +E: USER-INTEL package requires same setting for newton bond and non-bond. + +The newton setting must be the same for both pairwise and bonded forces. + +E: Intel styles for bond/angle/dihedral/improper require intel pair style." + +You cannot use the USER-INTEL package for bond calculations without a +USER-INTEL supported pair style. + */ diff --git a/src/USER-INTEL/improper_harmonic_intel.cpp b/src/USER-INTEL/improper_harmonic_intel.cpp new file mode 100644 index 0000000000..40ded62f5c --- /dev/null +++ b/src/USER-INTEL/improper_harmonic_intel.cpp @@ -0,0 +1,378 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "improper_harmonic_intel.h" +#include "atom.h" +#include "comm.h" +#include "neighbor.h" +#include "domain.h" +#include "force.h" +#include "update.h" +#include "math_const.h" +#include "memory.h" +#include "modify.h" +#include "suffix.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace MathConst; + +#define PTOLERANCE (flt_t)1.05 +#define MTOLERANCE (flt_t)-1.05 +#define SMALL (flt_t)0.001 +typedef struct { int a,b,c,d,t; } int5_t; + +/* ---------------------------------------------------------------------- */ + +ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) : + ImproperHarmonic(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +ImproperHarmonicIntel::~ImproperHarmonicIntel() +{ +} + +/* ---------------------------------------------------------------------- */ + +void ImproperHarmonicIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + ImproperHarmonic::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void ImproperHarmonicIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = 0; + + if (evflag) { + if (eflag) { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,0,1>(vflag, buffers, fc); + else + eval<1,0,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void ImproperHarmonicIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + const int inum = neighbor->nimproperlist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + if (EFLAG) + oeimproper = (acc_t)0.0; + if (vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, nto, tid; + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int5_t * _noalias const improperlist = + (int5_t *) neighbor->improperlist[0]; + + for (int n = nfrom; n < nto; n++) { + const int i1 = improperlist[n].a; + const int i2 = improperlist[n].b; + const int i3 = improperlist[n].c; + const int i4 = improperlist[n].d; + const int type = improperlist[n].t; + + // geometry of 4-body + + const flt_t vb1x = x[i1].x - x[i2].x; + const flt_t vb1y = x[i1].y - x[i2].y; + const flt_t vb1z = x[i1].z - x[i2].z; + + const flt_t vb2x = x[i3].x - x[i2].x; + const flt_t vb2y = x[i3].y - x[i2].y; + const flt_t vb2z = x[i3].z - x[i2].z; + + const flt_t vb3x = x[i4].x - x[i3].x; + const flt_t vb3y = x[i4].y - x[i3].y; + const flt_t vb3z = x[i4].z - x[i3].z; + + const flt_t ss1 = (flt_t)1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z); + const flt_t ss2 = (flt_t)1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z); + const flt_t ss3 = (flt_t)1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z); + + const flt_t r1 = sqrt(ss1); + const flt_t r2 = sqrt(ss2); + const flt_t r3 = sqrt(ss3); + + // sin and cos of angle + + const flt_t c0 = (vb1x * vb3x + vb1y * vb3y + vb1z * vb3z) * r1 * r3; + const flt_t c1 = (vb1x * vb2x + vb1y * vb2y + vb1z * vb2z) * r1 * r2; + const flt_t c2 = -(vb3x * vb2x + vb3y * vb2y + vb3z * vb2z) * r3 * r2; + + flt_t s1 = 1.0 - c1*c1; + if (s1 < SMALL) s1 = SMALL; + s1 = (flt_t)1.0 / s1; + + flt_t s2 = (flt_t)1.0 - c2*c2; + if (s2 < SMALL) s2 = SMALL; + s2 = (flt_t)1.0 / s2; + + flt_t s12 = sqrt(s1*s2); + flt_t c = (c1*c2 + c0) * s12; + + // error check + + if (c > PTOLERANCE || c < MTOLERANCE) { + int me; + MPI_Comm_rank(world,&me); + if (screen) { + char str[128]; + sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } + } + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + flt_t s = sqrt((flt_t)1.0 - c*c); + if (s < SMALL) s = SMALL; + + // force & energy + + const flt_t domega = acos(c) - fc.fc[type].chi; + flt_t a; + a = fc.fc[type].k * domega; + + flt_t eimproper; + if (EFLAG) eimproper = a*domega; + + a = -a * (flt_t)2.0/s; + c = c * a; + s12 = s12 * a; + const flt_t a11 = c*ss1*s1; + const flt_t a22 = -ss2 * ((flt_t)2.0*c0*s12 - c*(s1+s2)); + const flt_t a33 = c*ss3*s2; + const flt_t a12 = -r1*r2*(c1*c*s1 + c2*s12); + const flt_t a13 = -r1*r3*s12; + const flt_t a23 = r2*r3*(c2*c*s2 + c1*s12); + + const flt_t sx2 = a22*vb2x + a23*vb3x + a12*vb1x; + const flt_t sy2 = a22*vb2y + a23*vb3y + a12*vb1y; + const flt_t sz2 = a22*vb2z + a23*vb3z + a12*vb1z; + + const flt_t f1x = a12*vb2x + a13*vb3x + a11*vb1x; + const flt_t f1y = a12*vb2y + a13*vb3y + a11*vb1y; + const flt_t f1z = a12*vb2z + a13*vb3z + a11*vb1z; + + const flt_t f2x = -sx2 - f1x; + const flt_t f2y = -sy2 - f1y; + const flt_t f2z = -sz2 - f1z; + + const flt_t f4x = a23*vb2x + a33*vb3x + a13*vb1x; + const flt_t f4y = a23*vb2y + a33*vb3y + a13*vb1y; + const flt_t f4z = a23*vb2z + a33*vb3z + a13*vb1z; + + const flt_t f3x = sx2 - f4x; + const flt_t f3y = sy2 - f4y; + const flt_t f3z = sz2 - f4z; + + // apply force to each of 4 atoms + + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } + + if (EVFLAG) { + IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y, + vb3z, oeimproper, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + } + } // for n + } // omp parallel + if (EVFLAG) { + if (EFLAG) + energy += oeimproper; + if (vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void ImproperHarmonicIntel::init() +{ + ImproperHarmonic::init(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void ImproperHarmonicIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->nimpropertypes + 1; + fc.set_ntypes(bp1,memory); + + for (int i = 0; i < bp1; i++) { + fc.fc[i].k = k[i]; + fc.fc[i].chi = chi[i]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void ImproperHarmonicIntel::ForceConst::set_ntypes(const int nimproper, + Memory *memory) { + if (nimproper != _nimpropertypes) { + if (_nimpropertypes > 0) + _memory->destroy(fc); + + if (nimproper > 0) + _memory->create(fc,nimproper,"improperharmonicintel.fc"); + } + _nimpropertypes = nimproper; + _memory = memory; +} diff --git a/src/USER-INTEL/improper_harmonic_intel.h b/src/USER-INTEL/improper_harmonic_intel.h new file mode 100644 index 0000000000..d5b0e4ee0f --- /dev/null +++ b/src/USER-INTEL/improper_harmonic_intel.h @@ -0,0 +1,94 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef IMPROPER_CLASS + +ImproperStyle(harmonic/intel,ImproperHarmonicIntel) + +#else + +#ifndef LMP_IMPROPER_HARMONIC_INTEL_H +#define LMP_IMPROPER_HARMONIC_INTEL_H + +#include +#include "improper_harmonic.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class ImproperHarmonicIntel : public ImproperHarmonic { + public: + ImproperHarmonicIntel(class LAMMPS *); + virtual ~ImproperHarmonicIntel(); + virtual void compute(int, int); + virtual void init(); + + protected: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t k, chi; } fc_packed1; + + fc_packed1 *fc; + + ForceConst() : _nimpropertypes(0) {} + ~ForceConst() { set_ntypes(0, NULL); } + + void set_ntypes(const int nimpropertypes, Memory *memory); + + private: + int _nimpropertypes; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +W: Improper problem: %d %ld %d %d %d %d + +Conformation of the 4 listed improper atoms is extreme; you may want +to check your simulation geometry. + +E: Incorrect args for improper coefficients + +Self-explanatory. Check the input script or data file. + +*/ diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index 1cae48cfb3..f915baaa5d 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -319,13 +319,10 @@ template void IntelBuffers::free_nbor_list() { if (_list_alloc_atoms > 0) { - lmp->memory->destroy(_list_alloc); - _list_alloc_atoms = 0; - #ifdef _LMP_INTEL_OFFLOAD if (_off_list_alloc) { int * list_alloc = _list_alloc; - int * special_flag = lmp->neighbor->special_flag_alloc(); + int * special_flag = _off_map_special_flag; int * stencil = _off_map_stencil; if (list_alloc != 0 && special_flag != 0 && stencil != 0) { #pragma offload_transfer target(mic:_cop) \ @@ -335,6 +332,8 @@ void IntelBuffers::free_nbor_list() _off_list_alloc = false; } #endif + lmp->memory->destroy(_list_alloc); + _list_alloc_atoms = 0; } } @@ -350,7 +349,7 @@ void IntelBuffers::_grow_nbor_list(NeighList *list, free_nbor_list(); _list_alloc_atoms = 1.10 * nlocal; int nt = MAX(nthreads, _off_threads); - int list_alloc_size = (_list_alloc_atoms + nt + pack_width - 1) * + int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) * get_max_nbors(); lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc"); #ifdef _LMP_INTEL_OFFLOAD @@ -365,6 +364,7 @@ void IntelBuffers::_grow_nbor_list(NeighList *list, in(stencil:length(list->maxstencil) alloc_if(1) free_if(0)) \ nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0)) _off_map_stencil = stencil; + _off_map_special_flag = special_flag; _off_list_alloc = true; } } @@ -406,6 +406,11 @@ void IntelBuffers::free_ccache() #pragma offload_transfer target(mic:_cop) \ nocopy(ccachex,ccachey,ccachez,ccachew:alloc_if(0) free_if(1)) \ nocopy(ccachei,ccachej:alloc_if(0) free_if(1)) + + #ifdef LMP_USE_AVXCD + #pragma offload_transfer target(mic:_cop) \ + nocopy(ccachef:alloc_if(0) free_if(1)) + #endif } _off_ccache = 0; #endif @@ -451,7 +456,7 @@ void IntelBuffers::grow_ccache(const int off_flag, lmp->memory->create(_ccachei, vsize, "_ccachei"); lmp->memory->create(_ccachej, vsize, "_ccachej"); #ifdef LMP_USE_AVXCD - IP_PRE_get_stride(_ccache_stride3, nsize * 3, esize, 0); + IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0); lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef"); #endif memset(_ccachej, 0, vsize * sizeof(int)); @@ -473,6 +478,12 @@ void IntelBuffers::grow_ccache(const int off_flag, nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \ in(ccachej:length(vsize) alloc_if(1) free_if(0)) } + #ifdef LMP_USE_AVXCD + if (ccachef != NULL) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(ccachef:length(_ccache_stride3 * nt) alloc_if(1) free_if(0)) + } + #endif _off_ccache = 1; } #endif diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index cbae3f56d5..81056d248f 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -298,7 +298,7 @@ class IntelBuffers { quat_t *_host_quat; vec3_acc_t *_off_f; int _off_map_nmax, _off_map_maxhead, _cop, _off_ccache; - int *_off_map_ilist; + int *_off_map_ilist, *_off_map_special_flag; int *_off_map_stencil, *_off_map_special, *_off_map_nspecial, *_off_map_tag; int *_off_map_binhead, *_off_map_bins, *_off_map_numneigh; bool _off_list_alloc; diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index aabe6e32ad..96e740a667 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -383,11 +383,78 @@ inline double MIC_Wtime() { } \ } +#define IP_PRE_ev_tally_bond(eflag, eatom, vflag, ebond, i1, i2, fbond, \ + delx, dely, delz, obond, force, newton, \ + nlocal, ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + flt_t ev_pre; \ + if (newton) ev_pre = (flt_t)1.0; \ + else { \ + ev_pre = (flt_t)0.0; \ + if (i1 < nlocal) ev_pre += (flt_t)0.5; \ + if (i2 < nlocal) ev_pre += (flt_t)0.5; \ + } \ + \ + if (eflag) { \ + oebond += ev_pre * ebond; \ + if (eatom) { \ + flt_t halfeng = ebond * (flt_t)0.5; \ + if (newton || i1 < nlocal) f[i1].w += halfeng; \ + if (newton || i2 < nlocal) f[i2].w += halfeng; \ + } \ + } \ + \ + if (vflag) { \ + ov0 += ev_pre * (delx * delx * fbond); \ + ov1 += ev_pre * (dely * dely * fbond); \ + ov2 += ev_pre * (delz * delz * fbond); \ + ov3 += ev_pre * (delx * dely * fbond); \ + ov4 += ev_pre * (delx * delz * fbond); \ + ov5 += ev_pre * (dely * delz * fbond); \ + } \ +} + +#define IP_PRE_ev_tally_angle(eflag, eatom, vflag, eangle, i1, i2, i3, \ + f1x, f1y, f1z, f3x, f3y, f3z, delx1, \ + dely1, delz1, delx2, dely2, delz2, \ + oeangle, force, newton, nlocal, ov0, ov1, \ + ov2, ov3, ov4, ov5) \ +{ \ + flt_t ev_pre; \ + if (newton) ev_pre = (flt_t)1.0; \ + else { \ + ev_pre = (flt_t)0.0; \ + if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ + if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ + if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ + } \ + \ + if (eflag) { \ + oeangle += ev_pre * eangle; \ + if (eatom) { \ + flt_t thirdeng = eangle * (flt_t)0.3333333333333333; \ + if (newton || i1 < nlocal) f[i1].w += thirdeng; \ + if (newton || i2 < nlocal) f[i2].w += thirdeng; \ + if (newton || i3 < nlocal) f[i3].w += thirdeng; \ + } \ + } \ + \ + if (vflag) { \ + ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \ + ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \ + ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \ + ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \ + ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \ + ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \ + } \ +} + #define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\ f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, \ f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, \ vb3x, vb3y, vb3z,oedihedral, force, \ - newton, nlocal) \ + newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ + ov5) \ { \ flt_t ev_pre; \ if (newton) ev_pre = (flt_t)1.0; \ @@ -411,12 +478,12 @@ inline double MIC_Wtime() { } \ \ if (vflag) { \ - sv0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \ - sv1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \ - sv2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \ - sv3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \ - sv4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \ - sv5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \ + ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \ + ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \ + ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \ + ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \ + ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \ + ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \ } \ } @@ -459,7 +526,7 @@ inline double MIC_Wtime() { #define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom, \ nall, nlocal, minlocal, nthreads, \ - f_start, f_stride, x) \ + f_start, f_stride, x, offload) \ { \ int o_range; \ if (newton) \ diff --git a/src/USER-INTEL/intel_simd.h b/src/USER-INTEL/intel_simd.h index 989b00794d..e6f57bfa93 100644 --- a/src/USER-INTEL/intel_simd.h +++ b/src/USER-INTEL/intel_simd.h @@ -110,6 +110,16 @@ namespace ip_simd { return _mm512_maskz_mov_pd(m, one); } + inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m, + const SIMD_float &one) { + return _mm512_mask_mov_ps(src,m,one); + } + + inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m, + const SIMD_double &one) { + return _mm512_mask_mov_pd(src,m,one); + } + // -------- Load Operations inline SIMD_int SIMD_load(const int *p) { @@ -352,6 +362,14 @@ namespace ip_simd { return _mm512_rcp28_pd(one); } + inline SIMD_float SIMD_rcpz(const SIMD_mask &m, const SIMD_float &one) { + return _mm512_maskz_rcp28_ps(m, one); + } + + inline SIMD_double SIMD_rcpz(const SIMD_mask &m, const SIMD_double &one) { + return _mm512_maskz_rcp28_pd(m, one); + } + inline SIMD_float SIMD_sqrt(const SIMD_float &one) { return _mm512_sqrt_ps(one); } @@ -456,6 +474,40 @@ namespace ip_simd { return _mm512_cmple_pd_mask(SIMD_set(one), two); } + inline SIMD_mask operator>(const SIMD_int &one, const SIMD_int &two) { + return _mm512_cmpgt_epi32_mask(one,two); + } + + inline SIMD_mask operator>(const SIMD_float &one, const SIMD_float &two) { + return _mm512_cmplt_ps_mask(two,one); + } + + inline SIMD_mask operator>(const SIMD_double &one, const SIMD_double &two) { + return _mm512_cmplt_pd_mask(two,one); + } + + inline SIMD_mask operator==(const SIMD_int &one, const SIMD_int &two) { + return _mm512_cmpeq_epi32_mask(one,two); + } + + inline SIMD_mask operator==(const SIMD_float &one, const SIMD_float &two) { + return _mm512_cmpeq_ps_mask(one,two); + } + + inline SIMD_mask operator==(const SIMD_double &one, const SIMD_double &two) { + return _mm512_cmpeq_pd_mask(one,two); + } + + // ------- Typecast operations + + inline void SIMD_cast(const SIMD_int &one, SIMD_float &two) { + two = _mm512_cvtepi32_ps(one); + } + + inline void SIMD_cast(const SIMD_int &one, SIMD_double &two) { + two = _mm512_cvtepi32lo_pd(one); + } + // ------- Reduction operations inline int SIMD_max(const SIMD_int &i) { @@ -482,6 +534,55 @@ namespace ip_simd { return _mm512_reduce_add_pd(i); } + // i indices should be positive + inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i, + SIMD_float &v1) { + SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); + SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); + SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); + if (todo_mask) { + SIMD_int lz = _mm512_lzcnt_epi32(cd); + SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), + _mm512_lzcnt_epi32(cd)); + + while(todo_mask) { + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_float am_perm; + am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), + now_mask, lid, v1); + v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm); + todo_mask = _mm512_kxor(todo_mask, now_mask); + } + } + } + + // i indices should be positive + inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i, + SIMD_double &v1) { + SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); + SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); + SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); + if (todo_mask) { + SIMD_int lz = _mm512_lzcnt_epi32(cd); + SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), + _mm512_lzcnt_epi32(cd)); + lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid)); + + while(todo_mask) { + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_double am_perm; + am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), + now_mask, lid, v1); + v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm); + todo_mask = _mm512_kxor(todo_mask, now_mask); + } + } + } + // i indices should be positive inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i, SIMD_float &v1, SIMD_float &v2, @@ -655,6 +756,166 @@ namespace ip_simd { _MM_SCALE_2); } + inline SIMD_float SIMD_ev_add(const SIMD_float &one, + const SIMD_float &two) { + return _mm512_add_ps(one,two); + } + + inline SIMD_double SIMD_ev_add(const SIMD_double &one, + const SIMD_double &two) { + return _mm512_add_pd(one,two); + } + + inline SIMD_double SIMD_ev_add(const SIMD_double &one, + const SIMD_float &two) { + SIMD_double twod = _mm512_cvtps_pd(_mm512_castps512_ps256(two)); + SIMD_double ans = _mm512_add_pd(one,twod); + twod = _mm512_cvtps_pd(_mm512_castps512_ps256( + _mm512_shuffle_f32x4(two,two,238))); + return _mm512_add_pd(ans,twod); + } + + inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force, + const SIMD_int &joffset, SIMD_float &eng) { + SIMD_float jeng; + SIMD_conflict_pi_reduce1(rmask, joffset, eng); + jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset, + force, _MM_SCALE_1); + jeng = jeng + eng; + _mm512_mask_i32scatter_ps(force, rmask, joffset, jeng, _MM_SCALE_1); + } + + inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, + const SIMD_int &joffset, SIMD_double &eng) { + SIMD_double jeng; + SIMD_conflict_pi_reduce1(rmask, joffset, eng); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); + jeng = jeng + eng; + _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + } + + inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, + const SIMD_int &joffset, SIMD_float &eng) { + SIMD_double engd, jeng; + engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng)); + SIMD_conflict_pi_reduce1(rmask, joffset, engd); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); + jeng = jeng + engd; + _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + + SIMD_mask rmask2 = rmask >> 8; + engd = _mm512_cvtps_pd(_mm512_castps512_ps256( + _mm512_shuffle_f32x4(eng,eng,238))); + SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); + SIMD_conflict_pi_reduce1(rmask2, joffset2, engd); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force, _MM_SCALE_2); + jeng = jeng + engd; + _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2); + } + + inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, + const SIMD_int &joffset1, SIMD_float &eng) { + } + + inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force, + const SIMD_int &joffset1, SIMD_double &eng) { + SIMD_mask rmask = mask >> 8; + SIMD_int joffset = _mm512_shuffle_i32x4(joffset1, joffset1, 238); + + SIMD_double jeng; + SIMD_conflict_pi_reduce1(rmask, joffset, eng); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); + jeng = jeng + eng; + _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + } + + inline void SIMD_safe_jforce(const SIMD_mask &m, float *force, + const SIMD_int &i, SIMD_float &fx, + SIMD_float &fy, SIMD_float &fz) { + SIMD_conflict_pi_reduce3(m, i, fx, fy, fz); + SIMD_float jfrc; + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, + _MM_SCALE_1); + jfrc = jfrc + fx; + _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, + _MM_SCALE_1); + jfrc = jfrc + fy; + _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2, + _MM_SCALE_1); + jfrc = jfrc + fz; + _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1); + } + + inline void SIMD_safe_jforce(const SIMD_mask &m, double *force, + const SIMD_int &i, SIMD_double &fx, + SIMD_double &fy, SIMD_double &fz) { + SIMD_conflict_pi_reduce3(m, i, fx, fy, fz); + SIMD_double jfrc; + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, + _MM_SCALE_2); + jfrc = jfrc + fx; + _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, + _MM_SCALE_2); + jfrc = jfrc + fy; + _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, + _MM_SCALE_2); + jfrc = jfrc + fz; + _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + } + + inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, + const SIMD_int &joffset, SIMD_float &amx, + SIMD_float &amy, SIMD_float &amz) { + SIMD_double amxd, amyd, amzd; + amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx)); + amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy)); + amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz)); + SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd); + SIMD_double jfrc; + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); + jfrc = jfrc + amxd; + _mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force + 1, _MM_SCALE_2); + jfrc = jfrc + amyd; + _mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force + 2, _MM_SCALE_2); + jfrc = jfrc + amzd; + _mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2); + + SIMD_mask rmask2 = rmask >> 8; + amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( + _mm512_shuffle_f32x4(amx,amx,238))); + amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( + _mm512_shuffle_f32x4(amy,amy,238))); + amzd = _mm512_cvtps_pd(_mm512_castps512_ps256( + _mm512_shuffle_f32x4(amz,amz,238))); + SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); + SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force, _MM_SCALE_2); + jfrc = jfrc + amxd; + _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force + 1, _MM_SCALE_2); + jfrc = jfrc + amyd; + _mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force + 2, _MM_SCALE_2); + jfrc = jfrc + amzd; + _mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2); + } + inline void SIMD_jforce_update(const SIMD_mask &m, float *force, const SIMD_int &i, const SIMD_float &fx, const SIMD_float &fy, const SIMD_float &fz) { @@ -1086,37 +1347,48 @@ namespace ip_simd { } } - inline void SIMD_acc_three(const SIMD_mask &hmask, - const SIMD_float &facrad, const int eatom, - SIMD_float &sevdwl, SIMD_float &fwtmp, - SIMD_float &fjtmp, SIMD_float &fwtmp2, - SIMD_float &fjtmp2) { + inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, + const int eatom, SIMD_float &sevdwl, + SIMD_float &fwtmp, SIMD_float &fjtmp, + SIMD_float &fwtmp2, SIMD_float &fjtmp2, + const SIMD_int &k, float *force) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad); if (eatom) { SIMD_float hevdwl = facrad * SIMD_set((float)0.33333333); fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); + SIMD_conflict_pi_reduce1(hmask, k, hevdwl); + SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask, + k, force + 3, _MM_SCALE_1); + keng = keng + hevdwl; + _mm512_mask_i32scatter_ps(force + 3, hmask, k, keng, _MM_SCALE_1); } } - inline void SIMD_acc_three(const SIMD_mask &hmask, - const SIMD_double &facrad, const int eatom, - SIMD_double &sevdwl, SIMD_double &fwtmp, - SIMD_double &fjtmp, SIMD_double &fwtmp2, - SIMD_double &fjtmp2) { + inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_double &facrad, + const int eatom, SIMD_double &sevdwl, + SIMD_double &fwtmp, SIMD_double &fjtmp, + SIMD_double &fwtmp2, SIMD_double &fjtmp2, + const SIMD_int &k, double *force) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad); if (eatom) { SIMD_double hevdwl = facrad * SIMD_set((double)0.33333333); fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); + SIMD_conflict_pi_reduce1(hmask, k, hevdwl); + SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), + hmask, k, force + 3, + _MM_SCALE_2); + keng = keng + hevdwl; + _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); } } - inline void SIMD_acc_three(const SIMD_mask &hmask, - const SIMD_float &facrad, const int eatom, - SIMD_double &sevdwl, SIMD_double &fwtmp, - SIMD_double &fjtmp, SIMD_double &fwtmp2, - SIMD_double &fjtmp2) { + inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, + const int eatom, SIMD_double &sevdwl, + SIMD_double &fwtmp, SIMD_double &fjtmp, + SIMD_double &fwtmp2, SIMD_double &fjtmp2, + const SIMD_int &k, double *force) { SIMD_double facradd; facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(facrad)); sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facradd); @@ -1124,6 +1396,12 @@ namespace ip_simd { SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333); fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); + SIMD_conflict_pi_reduce1(hmask, k, hevdwl); + SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), + hmask, k, force + 3, + _MM_SCALE_2); + keng = keng + hevdwl; + _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); } SIMD_mask hmask2 = hmask >> 8; facradd = _mm512_cvtps_pd(_mm512_castps512_ps256( @@ -1133,6 +1411,13 @@ namespace ip_simd { SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333); fwtmp2 = SIMD_add(fwtmp2, hmask2, fwtmp2, hevdwl); fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl); + SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238); + SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl); + SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), + hmask2, k2, force + 3, + _MM_SCALE_2); + keng = keng + hevdwl; + _mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2); } } @@ -1372,7 +1657,9 @@ namespace ip_simd { inline void SIMD_iforce_update(const SIMD_mask &m, float *force, const SIMD_int &i, const SIMD_float &fx, - const SIMD_float &fy, const SIMD_float &fz) { + const SIMD_float &fy, const SIMD_float &fz, + const int EVFLAG, const int eatom, + const SIMD_float &fwtmp) { SIMD_float jfrc; jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, _MM_SCALE_1); @@ -1386,11 +1673,21 @@ namespace ip_simd { _MM_SCALE_1); jfrc = jfrc + fz; _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1); + if (EVFLAG) { + if (eatom) { + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3, + _MM_SCALE_1); + jfrc = jfrc + fwtmp; + _mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1); + } + } } inline void SIMD_iforce_update(const SIMD_mask &m, double *force, const SIMD_int &i, const SIMD_double &fx, - const SIMD_double &fy, const SIMD_double &fz) { + const SIMD_double &fy, const SIMD_double &fz, + const int EVFLAG, const int eatom, + const SIMD_double &fwtmp) { SIMD_double jfrc; jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, _MM_SCALE_2); @@ -1404,6 +1701,14 @@ namespace ip_simd { _MM_SCALE_2); jfrc = jfrc + fz; _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + if (EVFLAG) { + if (eatom) { + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, + force + 3, _MM_SCALE_2); + jfrc = jfrc + fwtmp; + _mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2); + } + } } #ifdef SW_GATHER_TEST diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp index 6700712cb4..6fa1d48e48 100644 --- a/src/USER-INTEL/neigh_half_bin_intel.cpp +++ b/src/USER-INTEL/neigh_half_bin_intel.cpp @@ -369,6 +369,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, const int list_size = (ito + tid + 1) * maxnbors; int ct = (ifrom + tid) * maxnbors; int *neighptr = firstneigh + ct; + for (int i = ifrom; i < ito; i++) { int j, k, n, n2, itype, jtype, ibin; double xtmp, ytmp, ztmp, delx, dely, delz, rsq; @@ -861,22 +862,23 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, int e_ito = ito; if (ito == num) { int imod = ito % swidth; - if (imod) e_ito += swidth - e_ito; + if (imod) e_ito += swidth - imod; } - const int list_size = (e_ito + tid + 1) * maxnbors; + const int list_size = (e_ito + tid * 2 + 2) * maxnbors; #else const int swidth = 1; IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); ifrom += astart; ito += astart; - const int list_size = (ito + tid + 1) * maxnbors; + const int list_size = (ito + tid * 2 + 2) * maxnbors; #endif int which; int pack_offset = maxnbors * swidth; - int ct = (ifrom + tid) * maxnbors; + int ct = (ifrom + tid * 2) * maxnbors; int *neighptr = firstneigh + ct; + const int obound = pack_offset + maxnbors * 2; int max_chunk = 0; int lane = 0; @@ -934,6 +936,8 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, } } + if (raw_count > obound) *overflow = 1; + #if defined(LMP_SIMD_COMPILER) #ifdef _LMP_INTEL_OFFLOAD int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; @@ -994,7 +998,6 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, } } int ns = (n - lane) / swidth; - if (ns > maxnbors || n2 > list_size) *overflow = 1; for (int u = pack_offset; u < n2; u++) { neighptr[n] = neighptr[u]; n += swidth; @@ -1011,7 +1014,6 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, #ifdef OUTER_CHUNK if (ns > max_chunk) max_chunk = ns; lane++; - pack_offset -= maxnbors; if (lane == swidth) { ct += max_chunk * swidth; const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); @@ -1021,10 +1023,10 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, max_chunk = 0; pack_offset = maxnbors * swidth; lane = 0; - if (ct + pack_offset + maxnbors > list_size) { + if (ct + obound > list_size) { if (i < ito - 1) { *overflow = 1; - ct = (ifrom + tid) * maxnbors; + ct = (ifrom + tid * 2) * maxnbors; } } } @@ -1034,10 +1036,10 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, const int edge = (ct % alignb); if (edge) ct += alignb - edge; neighptr = firstneigh + ct; - if (ct + pack_offset + maxnbors > list_size) { + if (ct + obound > list_size) { if (i < ito - 1) { *overflow = 1; - ct = (ifrom + tid) * maxnbors; + ct = (ifrom + tid * 2) * maxnbors; } } #endif @@ -1965,15 +1967,14 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in, int e_ito = ito; if (ito == num) { int imod = ito % pack_width; - if (imod) e_ito += pack_width - e_ito; + if (imod) e_ito += pack_width - imod; } - const int list_size = (e_ito + tid + 1) * maxnbors; - + const int list_size = (e_ito + tid * 2 + 2) * maxnbors; int which; - int pack_offset = maxnbors * pack_width; - int ct = (ifrom + tid) * maxnbors; + int ct = (ifrom + tid * 2) * maxnbors; int *neighptr = firstneigh + ct; + const int obound = pack_offset + maxnbors * 2; int max_chunk = 0; int lane = 0; @@ -2011,6 +2012,8 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in, } } + if (raw_count > obound) *overflow = 1; + #if defined(LMP_SIMD_COMPILER) #ifdef _LMP_INTEL_OFFLOAD int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; @@ -2084,7 +2087,6 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in, } } int ns = (n - lane) / pack_width; - if (ns > maxnbors || n2 > list_size) *overflow = 1; atombin[i] = ns; for (int u = pack_offset; u < n2; u++) { neighptr[n] = neighptr[u]; @@ -2098,7 +2100,6 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in, if (ns > max_chunk) max_chunk = ns; lane++; - pack_offset -= maxnbors; if (lane == pack_width) { ct += max_chunk * pack_width; const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); @@ -2108,10 +2109,10 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in, max_chunk = 0; pack_offset = maxnbors * pack_width; lane = 0; - if (ct + pack_offset + maxnbors > list_size) { + if (ct + obound > list_size) { if (i < ito - 1) { *overflow = 1; - ct = (ifrom + tid) * maxnbors; + ct = (ifrom + tid * 2) * maxnbors; } } } diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp index ee51659a3b..b9d60d8ead 100644 --- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp @@ -358,12 +358,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); } // for ii - #if defined(_OPENMP) - #pragma omp barrier + + #ifndef _LMP_INTEL_OFFLOAD + if (vflag == 2) #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x); + { + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x, offload); + } } // end of omp parallel region if (EVFLAG) { if (EFLAG) { @@ -390,7 +396,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload, eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp index 18f96b7397..7f2cc654e5 100644 --- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp @@ -410,12 +410,18 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, f[i].z += fztmp; IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); } // for ii - #if defined(_OPENMP) - #pragma omp barrier + + #ifndef _LMP_INTEL_OFFLOAD + if (vflag == 2) #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x); + { + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x, offload); + } } // end of omp parallel region if (EVFLAG) { if (EFLAG) { @@ -442,7 +448,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload, eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp index 0c419b18ed..5411661c54 100644 --- a/src/USER-INTEL/pair_buck_intel.cpp +++ b/src/USER-INTEL/pair_buck_intel.cpp @@ -321,12 +321,18 @@ void PairBuckIntel::eval(const int offload, const int vflag, f[i].z += fztmp; IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp); } // for ii - #if defined(_OPENMP) - #pragma omp barrier + + #ifndef _LMP_INTEL_OFFLOAD + if (vflag == 2) #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x); + { + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x, offload); + } } // end of omp parallel region if (EVFLAG) { if (EFLAG) { @@ -353,7 +359,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload, eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index 9de0b9291a..c9af1d76ad 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -793,10 +793,6 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, sizeof(FORCE_T)); const int two_iito = iito * 2; - #if defined(_OPENMP) - #pragma omp barrier - #endif - acc_t *facc = &(f_start[0].x); const int sto = two_iito * 4; const int fst4 = f_stride * 4; @@ -882,9 +878,9 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload,eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2); else - fix->add_result_array(f_start, 0, offload); + fix->add_result_array(f_start, 0, offload, 0, 0, 2); } /* ---------------------------------------------------------------------- */ diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp index 41ab6885ec..1f985a8d36 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -438,12 +438,17 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); } // for ii - #if defined(_OPENMP) - #pragma omp barrier + #ifndef _LMP_INTEL_OFFLOAD + if (vflag == 2) #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x); + { + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x, offload); + } } // end of omp parallel region if (EVFLAG) { if (EFLAG) { @@ -470,7 +475,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload, eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp index 40451d6f36..a158ead327 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -405,12 +405,18 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, f[i].z += fztmp; IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); } // for ii - #if defined(_OPENMP) - #pragma omp barrier + + #ifndef _LMP_INTEL_OFFLOAD + if (vflag == 2) #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x); + { + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x, offload); + } } // end of omp parallel region if (EVFLAG) { if (EFLAG) { @@ -437,7 +443,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload, eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp index e1300b1a52..95e36bdb2f 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_intel.cpp @@ -349,12 +349,17 @@ void PairLJCutIntel::eval(const int offload, const int vflag, IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp); } // for ii - #if defined(_OPENMP) - #pragma omp barrier + #ifndef _LMP_INTEL_OFFLOAD + if (vflag == 2) #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x); + { + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x, offload); + } } // end omp if (EVFLAG) { if (EFLAG) { @@ -381,7 +386,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload, eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp index a1491e2859..c777700731 100755 --- a/src/USER-INTEL/pair_sw_intel.cpp +++ b/src/USER-INTEL/pair_sw_intel.cpp @@ -559,12 +559,18 @@ void PairSWIntel::eval(const int offload, const int vflag, f[i].z += fztmp; IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp); } // for ii - #if defined(_OPENMP) - #pragma omp barrier + + #ifndef _LMP_INTEL_OFFLOAD + if (vflag == 2) #endif - IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x); + { + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x, offload); + } } // end omp if (EVFLAG) { if (EFLAG) { @@ -590,7 +596,7 @@ void PairSWIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload, eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } @@ -677,7 +683,7 @@ void PairSWIntel::eval(const int offload, const int vflag, in(numneighhalf:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ - in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ + in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \ in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \ in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \ in(ccache_stride3) \ @@ -852,7 +858,7 @@ void PairSWIntel::eval(const int offload, const int vflag, const int ejnum_max = SIMD_max(ejnum); const int ejnumhalf_max = SIMD_max(ejnumhalf); - memset(tf, 0, ejnum_max * sizeof(double) * swidth * 3); + memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3); for (int jj = 0; jj < ejnum_max; jj++) { SIMD_int ijtype; const int coffset = jj * swidth; @@ -1006,9 +1012,15 @@ void PairSWIntel::eval(const int offload, const int vflag, tf + kcoffset * 3, swidth); if (EVFLAG) { - if (EFLAG) + if (EFLAG) { + SIMD_int k; + if (eatom) { + k = SIMD_load(tj + kcoffset); + k = k << 4; + } SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp, - fwtmp2, fjtmp2); + fwtmp2, fjtmp2, k, &(f[0].x)); + } SIMD_ev_tally_nbor3v(kmask, vflag, fjx, fjy, fjz, fkx, fky, fkz, delx, dely, delz, delr2x, delr2y, delr2z, sv0, sv1, sv2, sv3, sv4, sv5); @@ -1021,8 +1033,15 @@ void PairSWIntel::eval(const int offload, const int vflag, SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp, fjxtmp2, fjytmp2, fjztmp2); - //if (EFLAG) - // if (eatom) f[j].w += fjtmp; + if (EFLAG) { + if (eatom) { + SIMD_int j = SIMD_load(tj + coffset); + j = j << 4; + SIMD_jeng_update(jmask, &(f[0].x) + 3, j, fjtmp); + if (is_same::value == 0) + SIMD_jeng_update_hi(jmask, &(f[0].x) + 3, j, fjtmp2); + } + } } // for jj first loop for (int jj = 0; jj < ejnum_max; jj++) { @@ -1066,15 +1085,15 @@ void PairSWIntel::eval(const int offload, const int vflag, } } // for jj second loop - SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp); + SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp, + EVFLAG, eatom, fwtmp); if (is_same::value == 0) { imask = imask >> 8; SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, - fztmp2); + fztmp2, EVFLAG, eatom, fwtmp2); } if (EVFLAG) { if (EFLAG) oevdwl += SIMD_sum(sevdwl); - // f[i].w if (vflag == 1) { ov0 += SIMD_sum(sv0); ov1 += SIMD_sum(sv1); @@ -1086,11 +1105,18 @@ void PairSWIntel::eval(const int offload, const int vflag, } ilist = ilist + swidth; } // for ii - #if defined(_OPENMP) - #pragma omp barrier + + #ifndef _LMP_INTEL_OFFLOAD + if (vflag == 2) #endif - IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, nlocal, - minlocal, nthreads, f_start, f_stride, x); + { + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, nlocal, + minlocal, nthreads, f_start, f_stride, x, + offload); + } } // end omp if (EVFLAG) { @@ -1117,7 +1143,7 @@ void PairSWIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload, eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp index b33056de44..666b234cb9 100644 --- a/src/USER-INTEL/pair_tersoff_intel.cpp +++ b/src/USER-INTEL/pair_tersoff_intel.cpp @@ -384,12 +384,18 @@ void PairTersoffIntel::eval(const int offload, const int vflag, } } } - #if defined(_OPENMP) - #pragma omp barrier + + #ifndef _LMP_INTEL_OFFLOAD + if (vflag == 2) #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x); + { + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x, offload); + } } // end of omp parallel region if (EVFLAG) { if (EFLAG) { @@ -419,7 +425,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag, fix->stop_watch(TIME_HOST_PAIR); if (EVFLAG) - fix->add_result_array(f_start, ev_global, offload, eatom); + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); diff --git a/src/USER-INTEL/verlet_intel.cpp b/src/USER-INTEL/verlet_intel.cpp index 587b2fbd44..ad685f4aa7 100644 --- a/src/USER-INTEL/verlet_intel.cpp +++ b/src/USER-INTEL/verlet_intel.cpp @@ -73,11 +73,10 @@ void VerletIntel::init() if (nvlist_atom) error->all(FLERR, "Cannot currently get per-atom virials with Intel package."); - #ifdef _LMP_INTEL_OFFLOAD + ifix = modify->find_fix("package_intel"); if (ifix >= 0) fix_intel = static_cast(modify->fix[ifix]); else fix_intel = 0; - #endif // set flags for what arrays to clear in force_clear() // need to clear additionals arrays if they exist @@ -149,6 +148,8 @@ void VerletIntel::setup() else force->kspace->compute_dummy(eflag,vflag); } + if (fix_intel) fix_intel->sync(); + #ifdef _LMP_INTEL_OFFLOAD sync_mode = 0; if (fix_intel) { @@ -227,6 +228,8 @@ void VerletIntel::setup_minimal(int flag) else force->kspace->compute_dummy(eflag,vflag); } + if (fix_intel) fix_intel->sync(); + #ifdef _LMP_INTEL_OFFLOAD sync_mode = 0; if (fix_intel) { @@ -349,6 +352,8 @@ void VerletIntel::run(int n) timer->stamp(Timer::KSPACE); } + if (fix_intel) fix_intel->sync(); + #ifdef _LMP_INTEL_OFFLOAD if (sync_mode == 1) { fix_intel->sync_coprocessor(); diff --git a/src/USER-INTEL/verlet_intel.h b/src/USER-INTEL/verlet_intel.h index 765da5410d..8dcca76df4 100644 --- a/src/USER-INTEL/verlet_intel.h +++ b/src/USER-INTEL/verlet_intel.h @@ -21,9 +21,7 @@ IntegrateStyle(verlet/intel,VerletIntel) #define LMP_VERLET_INTEL_H #include "integrate.h" -#ifdef LMP_INTEL_OFFLOAD #include "fix_intel.h" -#endif namespace LAMMPS_NS { @@ -42,8 +40,9 @@ class VerletIntel : public Integrate { int torqueflag,extraflag; virtual void force_clear(); - #ifdef _LMP_INTEL_OFFLOAD + FixIntel *fix_intel; + #ifdef _LMP_INTEL_OFFLOAD int sync_mode; #endif }; diff --git a/src/USER-INTEL/verlet_split_intel.cpp b/src/USER-INTEL/verlet_split_intel.cpp index 644ebce9fc..76813b7885 100644 --- a/src/USER-INTEL/verlet_split_intel.cpp +++ b/src/USER-INTEL/verlet_split_intel.cpp @@ -378,6 +378,8 @@ void VerletSplitIntel::run(int n) timer->stamp(Timer::BOND); } + if (fix_intel) fix_intel->sync(); + #ifdef _LMP_INTEL_OFFLOAD if (sync_mode == 1) { fix_intel->sync_coprocessor(); diff --git a/src/USER-INTEL/verlet_split_intel.h b/src/USER-INTEL/verlet_split_intel.h index ebacdf29f7..4691fc38de 100644 --- a/src/USER-INTEL/verlet_split_intel.h +++ b/src/USER-INTEL/verlet_split_intel.h @@ -21,9 +21,6 @@ IntegrateStyle(verlet/split/intel,VerletSplitIntel) #define LMP_VERLET_SPLIT_INTEL_H #include "verlet_intel.h" -#ifdef LMP_INTEL_OFFLOAD -#include "fix_intel.h" -#endif namespace LAMMPS_NS {