git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14499 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
331
src/USER-INTEL/angle_charmm_intel.cpp
Normal file
331
src/USER-INTEL/angle_charmm_intel.cpp
Normal file
@ -0,0 +1,331 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include "angle_charmm_intel.h"
|
||||
#include "atom.h"
|
||||
#include "neighbor.h"
|
||||
#include "domain.h"
|
||||
#include "comm.h"
|
||||
#include "force.h"
|
||||
#include "math_const.h"
|
||||
#include "memory.h"
|
||||
#include "suffix.h"
|
||||
#include "error.h"
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
using namespace MathConst;
|
||||
|
||||
#define SMALL (flt_t)0.001
|
||||
typedef struct { int a,b,c,t; } int4_t;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp)
|
||||
{
|
||||
suffix_flag |= Suffix::INTEL;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
AngleCharmmIntel::~AngleCharmmIntel()
|
||||
{
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AngleCharmmIntel::compute(int eflag, int vflag)
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_use_base) {
|
||||
AngleCharmm::compute(eflag, vflag);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||
force_const_single);
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||
force_const_double);
|
||||
else
|
||||
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
|
||||
force_const_single);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void AngleCharmmIntel::compute(int eflag, int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||
else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<0,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void AngleCharmmIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
|
||||
{
|
||||
const int inum = neighbor->nanglelist;
|
||||
if (inum == 0) return;
|
||||
|
||||
ATOM_T * _noalias const x = buffers->get_x(0);
|
||||
const int nlocal = atom->nlocal;
|
||||
const int nall = nlocal + atom->nghost;
|
||||
|
||||
int f_stride;
|
||||
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
|
||||
else f_stride = buffers->get_stride(nlocal);
|
||||
|
||||
int tc;
|
||||
FORCE_T * _noalias f_start;
|
||||
acc_t * _noalias ev_global;
|
||||
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oeangle = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,fc) \
|
||||
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
const int4_t * _noalias const anglelist =
|
||||
(int4_t *) neighbor->anglelist[0];
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
const int i1 = anglelist[n].a;
|
||||
const int i2 = anglelist[n].b;
|
||||
const int i3 = anglelist[n].c;
|
||||
const int type = anglelist[n].t;
|
||||
|
||||
// 1st bond
|
||||
|
||||
const flt_t delx1 = x[i1].x - x[i2].x;
|
||||
const flt_t dely1 = x[i1].y - x[i2].y;
|
||||
const flt_t delz1 = x[i1].z - x[i2].z;
|
||||
|
||||
const flt_t rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
|
||||
const flt_t r1 = sqrt(rsq1);
|
||||
|
||||
// 2nd bond
|
||||
|
||||
const flt_t delx2 = x[i3].x - x[i2].x;
|
||||
const flt_t dely2 = x[i3].y - x[i2].y;
|
||||
const flt_t delz2 = x[i3].z - x[i2].z;
|
||||
|
||||
const flt_t rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
|
||||
const flt_t r2 = sqrt(rsq2);
|
||||
|
||||
// Urey-Bradley bond
|
||||
|
||||
const flt_t delxUB = x[i3].x - x[i1].x;
|
||||
const flt_t delyUB = x[i3].y - x[i1].y;
|
||||
const flt_t delzUB = x[i3].z - x[i1].z;
|
||||
|
||||
const flt_t rsqUB = delxUB*delxUB + delyUB*delyUB + delzUB*delzUB;
|
||||
const flt_t rUB = sqrt(rsqUB);
|
||||
|
||||
// Urey-Bradley force & energy
|
||||
|
||||
const flt_t dr = rUB - fc.fc[type].r_ub;
|
||||
const flt_t rk = fc.fc[type].k_ub * dr;
|
||||
|
||||
flt_t forceUB;
|
||||
if (rUB > (flt_t)0.0) forceUB = (flt_t)-2.0*rk/rUB;
|
||||
else forceUB = 0.0;
|
||||
|
||||
flt_t eangle;
|
||||
if (EFLAG) eangle = rk*dr;
|
||||
|
||||
// angle (cos and sin)
|
||||
|
||||
flt_t c = delx1*delx2 + dely1*dely2 + delz1*delz2;
|
||||
c /= r1*r2;
|
||||
|
||||
if (c > (flt_t)1.0) c = (flt_t)1.0;
|
||||
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
|
||||
|
||||
flt_t s = sqrt((flt_t)1.0 - c*c);
|
||||
if (s < SMALL) s = SMALL;
|
||||
s = (flt_t)1.0/s;
|
||||
|
||||
// harmonic force & energy
|
||||
|
||||
const flt_t dtheta = acos(c) - fc.fc[type].theta0;
|
||||
const flt_t tk = fc.fc[type].k * dtheta;
|
||||
|
||||
if (EFLAG) eangle += tk*dtheta;
|
||||
|
||||
const flt_t a = (flt_t)-2.0 * tk * s;
|
||||
const flt_t a11 = a*c / rsq1;
|
||||
const flt_t a12 = -a / (r1*r2);
|
||||
const flt_t a22 = a*c / rsq2;
|
||||
|
||||
const flt_t f1x = a11*delx1 + a12*delx2 - delxUB*forceUB;
|
||||
const flt_t f1y = a11*dely1 + a12*dely2 - delyUB*forceUB;
|
||||
const flt_t f1z = a11*delz1 + a12*delz2 - delzUB*forceUB;
|
||||
|
||||
const flt_t f3x = a22*delx2 + a12*delx1 + delxUB*forceUB;
|
||||
const flt_t f3y = a22*dely2 + a12*dely1 + delyUB*forceUB;
|
||||
const flt_t f3z = a22*delz2 + a12*delz1 + delzUB*forceUB;
|
||||
|
||||
// apply force to each of 3 atoms
|
||||
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= f1x + f3x;
|
||||
f[i2].y -= f1y + f3y;
|
||||
f[i2].z -= f1z + f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x,
|
||||
f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1,
|
||||
delx2, dely2, delz2, oeangle, f, NEWTON_BOND,
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
}
|
||||
} // for n
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
energy += oeangle;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void AngleCharmmIntel::init_style()
|
||||
{
|
||||
AngleCharmm::init_style();
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
if (ifix < 0)
|
||||
error->all(FLERR,
|
||||
"The 'package intel' command is required for /intel styles");
|
||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
_use_base = 0;
|
||||
if (fix->offload_balance() != 0.0) {
|
||||
_use_base = 1;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
fix->bond_init_check();
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||
else
|
||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void AngleCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
const int bp1 = atom->ndihedraltypes + 1;
|
||||
fc.set_ntypes(bp1,memory);
|
||||
|
||||
for (int i = 0; i < bp1; i++) {
|
||||
fc.fc[i].k = k[i];
|
||||
fc.fc[i].theta0 = theta0[i];
|
||||
fc.fc[i].k_ub = k_ub[i];
|
||||
fc.fc[i].r_ub = r_ub[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t>
|
||||
void AngleCharmmIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
|
||||
Memory *memory) {
|
||||
if (nangletypes != _nangletypes) {
|
||||
if (_nangletypes > 0)
|
||||
_memory->destroy(fc);
|
||||
|
||||
if (nangletypes > 0)
|
||||
_memory->create(fc,nangletypes,"anglecharmmintel.fc");
|
||||
}
|
||||
_nangletypes = nangletypes;
|
||||
_memory = memory;
|
||||
}
|
||||
88
src/USER-INTEL/angle_charmm_intel.h
Normal file
88
src/USER-INTEL/angle_charmm_intel.h
Normal file
@ -0,0 +1,88 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef ANGLE_CLASS
|
||||
|
||||
AngleStyle(charmm/intel,AngleCharmmIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_ANGLE_CHARMM_INTEL_H
|
||||
#define LMP_ANGLE_CHARMM_INTEL_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include "angle_charmm.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class AngleCharmmIntel : public AngleCharmm {
|
||||
public:
|
||||
AngleCharmmIntel(class LAMMPS *);
|
||||
virtual ~AngleCharmmIntel();
|
||||
virtual void compute(int, int);
|
||||
virtual void init_style();
|
||||
|
||||
protected:
|
||||
FixIntel *fix;
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <class flt_t, class acc_t>
|
||||
void pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t, acc_t> *buffers);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _use_base;
|
||||
#endif
|
||||
|
||||
template <class flt_t>
|
||||
class ForceConst {
|
||||
public:
|
||||
typedef struct { flt_t k, theta0, k_ub, r_ub; } fc_packed1;
|
||||
|
||||
fc_packed1 *fc;
|
||||
ForceConst() : _nangletypes(0) {}
|
||||
~ForceConst() { set_ntypes(0, NULL); }
|
||||
|
||||
void set_ntypes(const int nangletypes, Memory *memory);
|
||||
|
||||
private:
|
||||
int _nangletypes;
|
||||
Memory *_memory;
|
||||
};
|
||||
ForceConst<float> force_const_single;
|
||||
ForceConst<double> force_const_double;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* ERROR/WARNING messages:
|
||||
|
||||
E: Incorrect args for angle coefficients
|
||||
|
||||
Self-explanatory. Check the input script or data file.
|
||||
|
||||
*/
|
||||
261
src/USER-INTEL/bond_harmonic_intel.cpp
Normal file
261
src/USER-INTEL/bond_harmonic_intel.cpp
Normal file
@ -0,0 +1,261 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include "bond_harmonic_intel.h"
|
||||
#include "atom.h"
|
||||
#include "neighbor.h"
|
||||
#include "domain.h"
|
||||
#include "comm.h"
|
||||
#include "force.h"
|
||||
#include "memory.h"
|
||||
#include "suffix.h"
|
||||
#include "error.h"
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
typedef struct { int a,b,t; } int3_t;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp)
|
||||
{
|
||||
suffix_flag |= Suffix::INTEL;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
BondHarmonicIntel::~BondHarmonicIntel()
|
||||
{
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void BondHarmonicIntel::compute(int eflag, int vflag)
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_use_base) {
|
||||
BondHarmonic::compute(eflag, vflag);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||
force_const_single);
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||
force_const_double);
|
||||
else
|
||||
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
|
||||
force_const_single);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void BondHarmonicIntel::compute(int eflag, int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||
else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<0,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
}
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void BondHarmonicIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
const int inum = neighbor->nbondlist;
|
||||
if (inum == 0) return;
|
||||
|
||||
ATOM_T * _noalias const x = buffers->get_x(0);
|
||||
const int nlocal = atom->nlocal;
|
||||
const int nall = nlocal + atom->nghost;
|
||||
|
||||
int f_stride;
|
||||
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
|
||||
else f_stride = buffers->get_stride(nlocal);
|
||||
|
||||
int tc;
|
||||
FORCE_T * _noalias f_start;
|
||||
acc_t * _noalias ev_global;
|
||||
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oebond = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,fc) \
|
||||
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
const int3_t * _noalias const bondlist =
|
||||
(int3_t *) neighbor->bondlist[0];
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
const int i1 = bondlist[n].a;
|
||||
const int i2 = bondlist[n].b;
|
||||
const int type = bondlist[n].t;
|
||||
|
||||
const flt_t delx = x[i1].x - x[i2].x;
|
||||
const flt_t dely = x[i1].y - x[i2].y;
|
||||
const flt_t delz = x[i1].z - x[i2].z;
|
||||
|
||||
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
||||
const flt_t r = sqrt(rsq);
|
||||
const flt_t dr = r - fc.fc[type].r0;
|
||||
const flt_t rk = fc.fc[type].k * dr;
|
||||
|
||||
// force & energy
|
||||
|
||||
flt_t fbond;
|
||||
if (r > (flt_t)0.0) fbond = (flt_t)-2.0*rk/r;
|
||||
else fbond = (flt_t)0.0;
|
||||
|
||||
flt_t ebond;
|
||||
if (EFLAG) ebond = rk*dr;
|
||||
|
||||
// apply force to each of 2 atoms
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += delx*fbond;
|
||||
f[i1].y += dely*fbond;
|
||||
f[i1].z += delz*fbond;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= delx*fbond;
|
||||
f[i2].y -= dely*fbond;
|
||||
f[i2].z -= delz*fbond;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
||||
delx, dely, delz, oebond, f, NEWTON_BOND,
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
}
|
||||
} // for n
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
energy += oebond;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void BondHarmonicIntel::init_style()
|
||||
{
|
||||
BondHarmonic::init_style();
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
if (ifix < 0)
|
||||
error->all(FLERR,
|
||||
"The 'package intel' command is required for /intel styles");
|
||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
_use_base = 0;
|
||||
if (fix->offload_balance() != 0.0) {
|
||||
_use_base = 1;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
fix->bond_init_check();
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||
else
|
||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void BondHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
const int bp1 = atom->nbondtypes + 1;
|
||||
fc.set_ntypes(bp1,memory);
|
||||
|
||||
for (int i = 0; i < bp1; i++) {
|
||||
fc.fc[i].k = k[i];
|
||||
fc.fc[i].r0 = r0[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t>
|
||||
void BondHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
||||
Memory *memory) {
|
||||
if (nbondtypes != _nbondtypes) {
|
||||
if (_nbondtypes > 0)
|
||||
_memory->destroy(fc);
|
||||
|
||||
if (nbondtypes > 0)
|
||||
_memory->create(fc,nbondtypes,"bondharmonicintel.fc");
|
||||
}
|
||||
_nbondtypes = nbondtypes;
|
||||
_memory = memory;
|
||||
}
|
||||
88
src/USER-INTEL/bond_harmonic_intel.h
Normal file
88
src/USER-INTEL/bond_harmonic_intel.h
Normal file
@ -0,0 +1,88 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef BOND_CLASS
|
||||
|
||||
BondStyle(harmonic/intel,BondHarmonicIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_BOND_HARMONIC_INTEL_H
|
||||
#define LMP_BOND_HARMONIC_INTEL_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include "bond_harmonic.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class BondHarmonicIntel : public BondHarmonic {
|
||||
public:
|
||||
BondHarmonicIntel(class LAMMPS *);
|
||||
virtual ~BondHarmonicIntel();
|
||||
virtual void compute(int, int);
|
||||
virtual void init_style();
|
||||
|
||||
protected:
|
||||
FixIntel *fix;
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <class flt_t, class acc_t>
|
||||
void pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t, acc_t> *buffers);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _use_base;
|
||||
#endif
|
||||
|
||||
template <class flt_t>
|
||||
class ForceConst {
|
||||
public:
|
||||
typedef struct { flt_t k, r0; } fc_packed1;
|
||||
fc_packed1 *fc;
|
||||
|
||||
ForceConst() : _nbondtypes(0) {}
|
||||
~ForceConst() { set_ntypes(0, NULL); }
|
||||
|
||||
void set_ntypes(const int nbondtypes, Memory *memory);
|
||||
|
||||
private:
|
||||
int _nbondtypes;
|
||||
Memory *_memory;
|
||||
};
|
||||
ForceConst<float> force_const_single;
|
||||
ForceConst<double> force_const_double;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* ERROR/WARNING messages:
|
||||
|
||||
E: Incorrect args for bond coefficients
|
||||
|
||||
Self-explanatory. Check the input script or data file.
|
||||
|
||||
*/
|
||||
538
src/USER-INTEL/dihedral_charmm_intel.cpp
Normal file
538
src/USER-INTEL/dihedral_charmm_intel.cpp
Normal file
@ -0,0 +1,538 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "mpi.h"
|
||||
#include "math.h"
|
||||
#include "dihedral_charmm_intel.h"
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "memory.h"
|
||||
#include "neighbor.h"
|
||||
#include "domain.h"
|
||||
#include "force.h"
|
||||
#include "pair.h"
|
||||
#include "update.h"
|
||||
#include "error.h"
|
||||
|
||||
#include "suffix.h"
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
#define PTOLERANCE (flt_t)1.05
|
||||
#define MTOLERANCE (flt_t)-1.05
|
||||
#define SMALL (flt_t)0.001
|
||||
typedef struct { int a,b,c,d,t; } int5_t;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
DihedralCharmmIntel::DihedralCharmmIntel(class LAMMPS *lmp)
|
||||
: DihedralCharmm(lmp)
|
||||
{
|
||||
suffix_flag |= Suffix::INTEL;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void DihedralCharmmIntel::compute(int eflag, int vflag)
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_use_base) {
|
||||
DihedralCharmm::compute(eflag, vflag);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||
force_const_single);
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||
force_const_double);
|
||||
else
|
||||
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
|
||||
force_const_single);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void DihedralCharmmIntel::compute(int eflag, int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
if (eflag || vflag) {
|
||||
ev_setup(eflag,vflag);
|
||||
} else evflag = 0;
|
||||
|
||||
// insure pair->ev_tally() will use 1-4 virial contribution
|
||||
|
||||
if (weightflag && vflag_global == 2)
|
||||
force->pair->vflag_either = force->pair->vflag_global = 1;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<0,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
}
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void DihedralCharmmIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
|
||||
{
|
||||
const int inum = neighbor->ndihedrallist;
|
||||
if (inum == 0) return;
|
||||
|
||||
ATOM_T * _noalias const x = buffers->get_x(0);
|
||||
flt_t * _noalias const q = buffers->get_q(0);
|
||||
const int nlocal = atom->nlocal;
|
||||
const int nall = nlocal + atom->nghost;
|
||||
|
||||
int f_stride;
|
||||
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
|
||||
else f_stride = buffers->get_stride(nlocal);
|
||||
|
||||
int tc;
|
||||
FORCE_T * _noalias f_start;
|
||||
acc_t * _noalias ev_global;
|
||||
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oevdwl = oecoul = oedihedral = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,fc) \
|
||||
reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
|
||||
opv0,opv1,opv2,opv3,opv4,opv5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
const int5_t * _noalias const dihedrallist =
|
||||
(int5_t *) neighbor->dihedrallist[0];
|
||||
const flt_t qqrd2e = force->qqrd2e;
|
||||
|
||||
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
sevdwl = secoul = sedihedral = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
const int i1 = dihedrallist[n].a;
|
||||
const int i2 = dihedrallist[n].b;
|
||||
const int i3 = dihedrallist[n].c;
|
||||
const int i4 = dihedrallist[n].d;
|
||||
const int type = dihedrallist[n].t;
|
||||
|
||||
// 1st bond
|
||||
|
||||
const flt_t vb1x = x[i1].x - x[i2].x;
|
||||
const flt_t vb1y = x[i1].y - x[i2].y;
|
||||
const flt_t vb1z = x[i1].z - x[i2].z;
|
||||
const int itype = x[i1].w;
|
||||
|
||||
// 2nd bond
|
||||
|
||||
const flt_t vb2xm = x[i2].x - x[i3].x;
|
||||
const flt_t vb2ym = x[i2].y - x[i3].y;
|
||||
const flt_t vb2zm = x[i2].z - x[i3].z;
|
||||
|
||||
// 3rd bond
|
||||
|
||||
const flt_t vb3x = x[i4].x - x[i3].x;
|
||||
const flt_t vb3y = x[i4].y - x[i3].y;
|
||||
const flt_t vb3z = x[i4].z - x[i3].z;
|
||||
const int jtype = x[i4].w;
|
||||
|
||||
// 1-4
|
||||
|
||||
const flt_t delx = x[i1].x - x[i4].x;
|
||||
const flt_t dely = x[i1].y - x[i4].y;
|
||||
const flt_t delz = x[i1].z - x[i4].z;
|
||||
|
||||
|
||||
// c,s calculation
|
||||
|
||||
const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
|
||||
const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
|
||||
const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
|
||||
const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
|
||||
const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
|
||||
const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
|
||||
|
||||
const flt_t rasq = ax*ax + ay*ay + az*az;
|
||||
const flt_t rbsq = bx*bx + by*by + bz*bz;
|
||||
const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
|
||||
const flt_t rg = sqrt(rgsq);
|
||||
|
||||
flt_t rginv, ra2inv, rb2inv;
|
||||
rginv = ra2inv = rb2inv = (flt_t)0.0;
|
||||
if (rg > 0) rginv = (flt_t)1.0/rg;
|
||||
if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
|
||||
if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
|
||||
const flt_t rabinv = sqrt(ra2inv*rb2inv);
|
||||
|
||||
flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
|
||||
const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
|
||||
|
||||
// error check
|
||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||
int me = comm->me;
|
||||
|
||||
if (screen) {
|
||||
char str[128];
|
||||
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
|
||||
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
||||
TAGINT_FORMAT " " TAGINT_FORMAT,
|
||||
me,tid,update->ntimestep,
|
||||
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
||||
error->warning(FLERR,str,0);
|
||||
fprintf(screen," 1st atom: %d %g %g %g\n",
|
||||
me,x[i1].x,x[i1].y,x[i1].z);
|
||||
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
||||
me,x[i2].x,x[i2].y,x[i2].z);
|
||||
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
||||
me,x[i3].x,x[i3].y,x[i3].z);
|
||||
fprintf(screen," 4th atom: %d %g %g %g\n",
|
||||
me,x[i4].x,x[i4].y,x[i4].z);
|
||||
}
|
||||
}
|
||||
|
||||
if (c > (flt_t)1.0) c = (flt_t)1.0;
|
||||
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
|
||||
|
||||
const flt_t tcos_shift = fc.bp[type].cos_shift;
|
||||
const flt_t tsin_shift = fc.bp[type].sin_shift;
|
||||
const flt_t tk = fc.bp[type].k;
|
||||
const int m = fc.bp[type].multiplicity;
|
||||
|
||||
flt_t p = (flt_t)1.0;
|
||||
flt_t ddf1, df1;
|
||||
ddf1 = df1 = (flt_t)0.0;
|
||||
|
||||
for (int i = 0; i < m; i++) {
|
||||
ddf1 = p*c - df1*s;
|
||||
df1 = p*s + df1*c;
|
||||
p = ddf1;
|
||||
}
|
||||
|
||||
p = p*tcos_shift + df1*tsin_shift;
|
||||
df1 = df1*tcos_shift - ddf1*tsin_shift;
|
||||
df1 *= -m;
|
||||
p += (flt_t)1.0;
|
||||
|
||||
if (m == 0) {
|
||||
p = (flt_t)1.0 + tcos_shift;
|
||||
df1 = (flt_t)0.0;
|
||||
}
|
||||
|
||||
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
|
||||
const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
|
||||
const flt_t fga = fg*ra2inv*rginv;
|
||||
const flt_t hgb = hg*rb2inv*rginv;
|
||||
const flt_t gaa = -ra2inv*rg;
|
||||
const flt_t gbb = rb2inv*rg;
|
||||
|
||||
const flt_t dtfx = gaa*ax;
|
||||
const flt_t dtfy = gaa*ay;
|
||||
const flt_t dtfz = gaa*az;
|
||||
const flt_t dtgx = fga*ax - hgb*bx;
|
||||
const flt_t dtgy = fga*ay - hgb*by;
|
||||
const flt_t dtgz = fga*az - hgb*bz;
|
||||
const flt_t dthx = gbb*bx;
|
||||
const flt_t dthy = gbb*by;
|
||||
const flt_t dthz = gbb*bz;
|
||||
|
||||
const flt_t df = -tk * df1;
|
||||
|
||||
const flt_t sx2 = df*dtgx;
|
||||
const flt_t sy2 = df*dtgy;
|
||||
const flt_t sz2 = df*dtgz;
|
||||
|
||||
flt_t f1x = df*dtfx;
|
||||
flt_t f1y = df*dtfy;
|
||||
flt_t f1z = df*dtfz;
|
||||
|
||||
const flt_t f2x = sx2 - f1x;
|
||||
const flt_t f2y = sy2 - f1y;
|
||||
const flt_t f2z = sz2 - f1z;
|
||||
|
||||
flt_t f4x = df*dthx;
|
||||
flt_t f4y = df*dthy;
|
||||
flt_t f4z = df*dthz;
|
||||
|
||||
const flt_t f3x = -sx2 - f4x;
|
||||
const flt_t f3y = -sy2 - f4y;
|
||||
const flt_t f3z = -sz2 - f4z;
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t deng;
|
||||
if (EFLAG) deng = tk * p;
|
||||
IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x,
|
||||
f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x,
|
||||
vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y,
|
||||
vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x += f2x;
|
||||
f[i2].y += f2y;
|
||||
f[i2].z += f2z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
}
|
||||
|
||||
// 1-4 LJ and Coulomb interactions
|
||||
// tally energy/virial in pair, using newton_bond as newton flag
|
||||
|
||||
const flt_t tweight = fc.weight[type];
|
||||
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
||||
const flt_t r2inv = (flt_t)1.0/rsq;
|
||||
const flt_t r6inv = r2inv*r2inv*r2inv;
|
||||
|
||||
flt_t forcecoul;
|
||||
if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
|
||||
else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
|
||||
const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv -
|
||||
fc.ljp[itype][jtype].lj2);
|
||||
const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv;
|
||||
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f1x += delx*fpair;
|
||||
f1y += dely*fpair;
|
||||
f1z += delz*fpair;
|
||||
}
|
||||
if (NEWTON_BOND || i4 < nlocal) {
|
||||
f4x -= delx*fpair;
|
||||
f4y -= dely*fpair;
|
||||
f4z -= delz*fpair;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
if (NEWTON_BOND || i1 < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_BOND || i4 < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
|
||||
if (EFLAG) {
|
||||
flt_t ecoul, evdwl;
|
||||
ecoul = tweight * forcecoul;
|
||||
evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv -
|
||||
fc.ljp[itype][jtype].lj4);
|
||||
secoul += ev_pre * ecoul;
|
||||
sevdwl += ev_pre * evdwl;
|
||||
if (eatom) {
|
||||
evdwl *= (flt_t)0.5;
|
||||
evdwl += (flt_t)0.5 * ecoul;
|
||||
if (NEWTON_BOND || i1 < nlocal)
|
||||
f[i1].w += evdwl;
|
||||
if (NEWTON_BOND || i4 < nlocal)
|
||||
f[i4].w += evdwl;
|
||||
}
|
||||
}
|
||||
if (vflag) {
|
||||
spv0 += ev_pre * delx * delx * fpair;
|
||||
spv1 += ev_pre * dely * dely * fpair;
|
||||
spv2 += ev_pre * delz * delz * fpair;
|
||||
spv3 += ev_pre * delx * dely * fpair;
|
||||
spv4 += ev_pre * delx * delz * fpair;
|
||||
spv5 += ev_pre * dely * delz * fpair;
|
||||
}
|
||||
}
|
||||
|
||||
// apply force to each of 4 atoms
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i4 < nlocal) {
|
||||
f[i4].x += f4x;
|
||||
f[i4].y += f4y;
|
||||
f[i4].z += f4z;
|
||||
}
|
||||
}
|
||||
} // for n
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
oedihedral += sedihedral;
|
||||
oecoul += secoul;
|
||||
oevdwl += sevdwl;
|
||||
}
|
||||
if (vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
opv0 += spv0; opv1 += spv1; opv2 += spv2;
|
||||
opv3 += spv3; opv4 += spv4; opv5 += spv5;
|
||||
}
|
||||
}
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
energy += oedihedral;
|
||||
force->pair->eng_vdwl += oevdwl;
|
||||
force->pair->eng_coul += oecoul;
|
||||
}
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
force->pair->virial[0] += opv0;
|
||||
force->pair->virial[1] += opv1;
|
||||
force->pair->virial[2] += opv2;
|
||||
force->pair->virial[3] += opv3;
|
||||
force->pair->virial[4] += opv4;
|
||||
force->pair->virial[5] += opv5;
|
||||
}
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void DihedralCharmmIntel::init_style()
|
||||
{
|
||||
DihedralCharmm::init_style();
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
if (ifix < 0)
|
||||
error->all(FLERR,
|
||||
"The 'package intel' command is required for /intel styles");
|
||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
_use_base = 0;
|
||||
if (fix->offload_balance() != 0.0) {
|
||||
_use_base = 1;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
fix->bond_init_check();
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||
else
|
||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
|
||||
const int tp1 = atom->ntypes + 1;
|
||||
const int bp1 = atom->ndihedraltypes + 1;
|
||||
fc.set_ntypes(tp1,bp1,memory);
|
||||
buffers->set_ntypes(tp1);
|
||||
|
||||
for (int i = 0; i < tp1; i++) {
|
||||
for (int j = 0; j < tp1; j++) {
|
||||
fc.ljp[i][j].lj1 = lj14_1[i][j];
|
||||
fc.ljp[i][j].lj2 = lj14_2[i][j];
|
||||
fc.ljp[i][j].lj3 = lj14_3[i][j];
|
||||
fc.ljp[i][j].lj4 = lj14_4[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < bp1; i++) {
|
||||
fc.bp[i].multiplicity = multiplicity[i];
|
||||
fc.bp[i].cos_shift = cos_shift[i];
|
||||
fc.bp[i].sin_shift = sin_shift[i];
|
||||
fc.bp[i].k = k[i];
|
||||
fc.weight[i] = weight[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t>
|
||||
void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
|
||||
const int nbondtypes,
|
||||
Memory *memory) {
|
||||
if (npairtypes != _npairtypes) {
|
||||
if (_npairtypes > 0)
|
||||
_memory->destroy(ljp);
|
||||
if (npairtypes > 0)
|
||||
memory->create(ljp,npairtypes,npairtypes,"fc.ljp");
|
||||
}
|
||||
|
||||
if (nbondtypes != _nbondtypes) {
|
||||
if (_nbondtypes > 0) {
|
||||
_memory->destroy(bp);
|
||||
_memory->destroy(weight);
|
||||
}
|
||||
|
||||
if (nbondtypes > 0) {
|
||||
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
|
||||
_memory->create(weight,nbondtypes,"dihedralcharmmintel.weight");
|
||||
}
|
||||
}
|
||||
_npairtypes = npairtypes;
|
||||
_nbondtypes = nbondtypes;
|
||||
_memory = memory;
|
||||
}
|
||||
84
src/USER-INTEL/dihedral_charmm_intel.h
Normal file
84
src/USER-INTEL/dihedral_charmm_intel.h
Normal file
@ -0,0 +1,84 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef DIHEDRAL_CLASS
|
||||
|
||||
DihedralStyle(charmm/intel,DihedralCharmmIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_DIHEDRAL_CHARMM_INTEL_H
|
||||
#define LMP_DIHEDRAL_CHARMM_INTEL_H
|
||||
|
||||
#include "dihedral_charmm.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class DihedralCharmmIntel : public DihedralCharmm {
|
||||
|
||||
public:
|
||||
DihedralCharmmIntel(class LAMMPS *lmp);
|
||||
virtual void compute(int, int);
|
||||
void init_style();
|
||||
|
||||
private:
|
||||
FixIntel *fix;
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <class flt_t, class acc_t>
|
||||
void pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t, acc_t> *buffers);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _use_base;
|
||||
#endif
|
||||
|
||||
template <class flt_t>
|
||||
class ForceConst {
|
||||
public:
|
||||
typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1;
|
||||
typedef struct { flt_t cos_shift, sin_shift, k;
|
||||
int multiplicity; } fc_packed3;
|
||||
|
||||
fc_packed1 **ljp;
|
||||
fc_packed3 *bp;
|
||||
flt_t *weight;
|
||||
|
||||
ForceConst() : _npairtypes(0), _nbondtypes(0) {}
|
||||
~ForceConst() { set_ntypes(0, 0, NULL); }
|
||||
|
||||
void set_ntypes(const int npairtypes, const int nbondtypes, Memory *memory);
|
||||
|
||||
private:
|
||||
int _npairtypes, _nbondtypes;
|
||||
Memory *_memory;
|
||||
};
|
||||
ForceConst<float> force_const_single;
|
||||
ForceConst<double> force_const_double;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@ -71,6 +71,12 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
||||
_offload_threads = 0;
|
||||
_offload_tpc = 4;
|
||||
|
||||
_force_array_s = 0;
|
||||
_force_array_m = 0;
|
||||
_force_array_d = 0;
|
||||
_ev_array_s = 0;
|
||||
_ev_array_d = 0;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (ncops < 0) error->all(FLERR,"Illegal package intel command");
|
||||
_offload_affinity_set = 0;
|
||||
@ -293,6 +299,8 @@ void FixIntel::init()
|
||||
_mixed_buffers->zero_ev();
|
||||
else
|
||||
_double_buffers->zero_ev();
|
||||
|
||||
_need_reduce = 0;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
@ -317,6 +325,10 @@ void FixIntel::pair_init_check(const bool cdmessage)
|
||||
|
||||
_nbor_pack_width = 1;
|
||||
|
||||
if (strstr(update->integrate_style,"intel") == 0)
|
||||
error->all(FLERR,
|
||||
"Specified run_style does not support the Intel package.");
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_offload_balance != 0.0) atom->sortfreq = 1;
|
||||
|
||||
@ -348,9 +360,6 @@ void FixIntel::pair_init_check(const bool cdmessage)
|
||||
_sync_at_pair = 2;
|
||||
} else {
|
||||
_sync_at_pair = 0;
|
||||
if (strstr(update->integrate_style,"intel") == 0)
|
||||
error->all(FLERR,
|
||||
"Specified run_style does not support the Intel package.");
|
||||
}
|
||||
#endif
|
||||
_nthreads = comm->nthreads;
|
||||
@ -415,6 +424,35 @@ void FixIntel::pair_init_check(const bool cdmessage)
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void FixIntel::bond_init_check()
|
||||
{
|
||||
if (_offload_balance != 0.0 && atom->molecular &&
|
||||
force->newton_pair != force->newton_bond)
|
||||
error->all(FLERR,
|
||||
"USER-INTEL package requires same setting for newton bond and non-bond.");
|
||||
|
||||
int intel_pair = 0;
|
||||
if (force->pair_match("/intel", 0) != NULL)
|
||||
intel_pair = 1;
|
||||
else if (force->pair_match("hybrid", 1) != NULL) {
|
||||
PairHybrid *hybrid = (PairHybrid *) force->pair;
|
||||
for (int i = 0; i < hybrid->nstyles; i++)
|
||||
if (strstr(hybrid->keywords[i], "/intel") != NULL)
|
||||
intel_pair = 1;
|
||||
} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
|
||||
PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
|
||||
for (int i = 0; i < hybrid->nstyles; i++)
|
||||
if (strstr(hybrid->keywords[i], "/intel") != NULL)
|
||||
intel_pair = 1;
|
||||
}
|
||||
|
||||
if (intel_pair == 0)
|
||||
error->all(FLERR, "Intel styles for bond/angle/dihedral/improper "
|
||||
"require intel pair style.");
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void FixIntel::check_neighbor_intel()
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
@ -436,6 +474,82 @@ void FixIntel::check_neighbor_intel()
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void FixIntel::sync()
|
||||
{
|
||||
if (_force_array_m != 0) {
|
||||
if (_need_reduce) {
|
||||
reduce_results(_force_array_m);
|
||||
_need_reduce = 0;
|
||||
}
|
||||
add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom, 0);
|
||||
_force_array_m = 0;
|
||||
} else if (_force_array_d != 0) {
|
||||
if (_need_reduce) {
|
||||
reduce_results(_force_array_d);
|
||||
_need_reduce = 0;
|
||||
}
|
||||
add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom, 0);
|
||||
_force_array_d = 0;
|
||||
} else if (_force_array_s != 0) {
|
||||
if (_need_reduce) {
|
||||
reduce_results(_force_array_s);
|
||||
_need_reduce = 0;
|
||||
}
|
||||
add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom, 0);
|
||||
_force_array_s = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class ft>
|
||||
void FixIntel::reduce_results(ft * _noalias const f_start)
|
||||
{
|
||||
int o_range, f_stride;
|
||||
if (force->newton_pair)
|
||||
o_range = atom->nlocal + atom->nghost;
|
||||
else
|
||||
o_range = atom->nlocal;
|
||||
IP_PRE_get_stride(f_stride, o_range, sizeof(ft), lmp->atom->torque);
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(o_range, f_stride)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
|
||||
sizeof(ft));
|
||||
|
||||
int t_off = f_stride;
|
||||
if (_results_eatom) {
|
||||
for (int t = 1; t < _nthreads; t++) {
|
||||
_use_simd_pragma("vector nontemporal")
|
||||
_use_simd_pragma("novector")
|
||||
for (int n = iifrom; n < iito; n++) {
|
||||
f_start[n].x += f_start[n + t_off].x;
|
||||
f_start[n].y += f_start[n + t_off].y;
|
||||
f_start[n].z += f_start[n + t_off].z;
|
||||
f_start[n].w += f_start[n + t_off].w;
|
||||
}
|
||||
t_off += f_stride;
|
||||
}
|
||||
} else {
|
||||
for (int t = 1; t < _nthreads; t++) {
|
||||
_use_simd_pragma("vector nontemporal")
|
||||
_use_simd_pragma("novector")
|
||||
for (int n = iifrom; n < iito; n++) {
|
||||
f_start[n].x += f_start[n + t_off].x;
|
||||
f_start[n].y += f_start[n + t_off].y;
|
||||
f_start[n].z += f_start[n + t_off].z;
|
||||
}
|
||||
t_off += f_stride;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void FixIntel::sync_coprocessor()
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
@ -456,6 +570,153 @@ void FixIntel::sync_coprocessor()
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class ft, class acc_t>
|
||||
void FixIntel::add_results(const ft * _noalias const f_in,
|
||||
const acc_t * _noalias const ev_global,
|
||||
const int eatom, const int vatom,
|
||||
const int offload) {
|
||||
start_watch(TIME_PACK);
|
||||
int f_length;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_separate_buffers) {
|
||||
if (offload) {
|
||||
add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
|
||||
if (force->newton_pair) {
|
||||
const acc_t * _noalias const enull = 0;
|
||||
int offset = _offload_nlocal;
|
||||
if (atom->torque) offset *= 2;
|
||||
add_oresults(f_in + offset, enull, eatom, vatom,
|
||||
_offload_min_ghost, _offload_nghost);
|
||||
}
|
||||
} else {
|
||||
add_oresults(f_in, ev_global, eatom, vatom,
|
||||
_host_min_local, _host_used_local);
|
||||
if (force->newton_pair) {
|
||||
const acc_t * _noalias const enull = 0;
|
||||
int offset = _host_used_local;
|
||||
if (atom->torque) offset *= 2;
|
||||
add_oresults(f_in + offset, enull, eatom,
|
||||
vatom, _host_min_ghost, _host_used_ghost);
|
||||
}
|
||||
}
|
||||
stop_watch(TIME_PACK);
|
||||
return;
|
||||
}
|
||||
if (force->newton_pair && (_offload_noghost == 0 || offload == 0))
|
||||
f_length = atom->nlocal + atom->nghost;
|
||||
else
|
||||
f_length = atom->nlocal;
|
||||
#else
|
||||
if (force->newton_pair)
|
||||
f_length = atom->nlocal + atom->nghost;
|
||||
else
|
||||
f_length = atom->nlocal;
|
||||
#endif
|
||||
|
||||
add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
|
||||
stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class ft, class acc_t>
|
||||
void FixIntel::add_oresults(const ft * _noalias const f_in,
|
||||
const acc_t * _noalias const ev_global,
|
||||
const int eatom, const int vatom,
|
||||
const int out_offset, const int nall) {
|
||||
lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
|
||||
if (atom->torque) {
|
||||
if (f_in[1].w)
|
||||
if (f_in[1].w == 1)
|
||||
error->all(FLERR,"Bad matrix inversion in mldivide3");
|
||||
else
|
||||
error->all(FLERR,
|
||||
"Sphere particles not yet supported for gayberne/intel");
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
const int tid = omp_get_thread_num();
|
||||
#else
|
||||
const int tid = 0;
|
||||
#endif
|
||||
int ifrom, ito;
|
||||
IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
|
||||
if (atom->torque) {
|
||||
int ii = ifrom * 2;
|
||||
lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
|
||||
out_offset;
|
||||
if (eatom) {
|
||||
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[ii].x;
|
||||
f[i].y += f_in[ii].y;
|
||||
f[i].z += f_in[ii].z;
|
||||
lmp_eatom[i] += f_in[ii].w;
|
||||
tor[i].x += f_in[ii+1].x;
|
||||
tor[i].y += f_in[ii+1].y;
|
||||
tor[i].z += f_in[ii+1].z;
|
||||
ii += 2;
|
||||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[ii].x;
|
||||
f[i].y += f_in[ii].y;
|
||||
f[i].z += f_in[ii].z;
|
||||
tor[i].x += f_in[ii+1].x;
|
||||
tor[i].y += f_in[ii+1].y;
|
||||
tor[i].z += f_in[ii+1].z;
|
||||
ii += 2;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (eatom) {
|
||||
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[i].x;
|
||||
f[i].y += f_in[i].y;
|
||||
f[i].z += f_in[i].z;
|
||||
lmp_eatom[i] += f_in[i].w;
|
||||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[i].x;
|
||||
f[i].y += f_in[i].y;
|
||||
f[i].z += f_in[i].z;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ev_global != NULL) {
|
||||
force->pair->eng_vdwl += ev_global[0];
|
||||
force->pair->eng_coul += ev_global[1];
|
||||
force->pair->virial[0] += ev_global[2];
|
||||
force->pair->virial[1] += ev_global[3];
|
||||
force->pair->virial[2] += ev_global[4];
|
||||
force->pair->virial[3] += ev_global[5];
|
||||
force->pair->virial[4] += ev_global[6];
|
||||
force->pair->virial[5] += ev_global[7];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
double FixIntel::memory_usage()
|
||||
{
|
||||
double bytes;
|
||||
@ -473,6 +734,70 @@ double FixIntel::memory_usage()
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class ft, class acc_t>
|
||||
void FixIntel::add_off_results(const ft * _noalias const f_in,
|
||||
const acc_t * _noalias const ev_global) {
|
||||
if (_offload_balance < 0.0)
|
||||
_balance_other_time = MPI_Wtime() - _balance_other_time;
|
||||
|
||||
start_watch(TIME_OFFLOAD_WAIT);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
#pragma offload_wait target(mic:_cop) wait(f_in)
|
||||
#endif
|
||||
double wait_time = stop_watch(TIME_OFFLOAD_WAIT);
|
||||
|
||||
if (neighbor->ago == 0) {
|
||||
if (_off_overflow_flag[LMP_OVERFLOW])
|
||||
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
|
||||
_offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1;
|
||||
_offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN];
|
||||
_offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 -
|
||||
_offload_min_ghost;
|
||||
if (_offload_nghost < 0) _offload_nghost = 0;
|
||||
_offload_nall = _offload_nlocal + _offload_nghost;
|
||||
_offload_nlocal;
|
||||
}
|
||||
|
||||
int nlocal = atom->nlocal;
|
||||
// Load balance?
|
||||
if (_offload_balance < 0.0) {
|
||||
if (neighbor->ago == 0)
|
||||
_balance_pair = _balance_neighbor;
|
||||
double mic_time;
|
||||
mic_time = *_stopwatch_offload_pair;
|
||||
if (_balance_pair_time + _balance_other_time < mic_time) {
|
||||
double ft = _balance_pair_time + _balance_other_time + wait_time -
|
||||
mic_time;
|
||||
_balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed +
|
||||
INTEL_LB_MEAN_WEIGHT * ft;
|
||||
}
|
||||
|
||||
double ctps = _balance_pair_time / (1.0-_balance_pair);
|
||||
double otps = mic_time / _balance_pair;
|
||||
double new_balance = (ctps + _balance_other_time - _balance_fixed) /
|
||||
(otps + ctps);
|
||||
if (new_balance < 0.01) new_balance = 0.01;
|
||||
else if (new_balance > 0.99) new_balance = 0.99;
|
||||
_balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor +
|
||||
INTEL_LB_MEAN_WEIGHT * new_balance;
|
||||
}
|
||||
|
||||
#ifdef TIME_BALANCE
|
||||
start_watch(TIME_IMBALANCE);
|
||||
MPI_Barrier(_real_space_comm);
|
||||
stop_watch(TIME_IMBALANCE);
|
||||
#endif
|
||||
acc_timers();
|
||||
if (atom->torque)
|
||||
if (f_in[1].w < 0.0)
|
||||
error->all(FLERR, "Bad matrix inversion in mldivide3");
|
||||
add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void FixIntel::output_timing_data() {
|
||||
if (_im_real_space_task == 0 || _offload_affinity_set == 0) return;
|
||||
|
||||
|
||||
@ -44,7 +44,9 @@ class FixIntel : public Fix {
|
||||
virtual void init();
|
||||
virtual void setup(int);
|
||||
void pair_init_check(const bool cdmessage=false);
|
||||
void bond_init_check();
|
||||
|
||||
void sync();
|
||||
// Get all forces, calculation results from coprocesser
|
||||
void sync_coprocessor();
|
||||
|
||||
@ -65,6 +67,12 @@ class FixIntel : public Fix {
|
||||
inline int nbor_pack_width() const { return _nbor_pack_width; }
|
||||
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
|
||||
|
||||
inline int need_zero(const int tid) {
|
||||
if (_need_reduce == 0 && tid > 0) return 1;
|
||||
return 0;
|
||||
}
|
||||
inline void set_reduce_flag() { _need_reduce = 1; }
|
||||
|
||||
protected:
|
||||
IntelBuffers<float,float> *_single_buffers;
|
||||
IntelBuffers<float,double> *_mixed_buffers;
|
||||
@ -77,13 +85,16 @@ class FixIntel : public Fix {
|
||||
inline int* get_off_overflow_flag() { return _off_overflow_flag; }
|
||||
inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
||||
double *ev_in, const int offload,
|
||||
const int eatom = 0, const int vatom = 0);
|
||||
const int eatom = 0, const int vatom = 0,
|
||||
const int rflag = 0);
|
||||
inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
||||
double *ev_in, const int offload,
|
||||
const int eatom = 0, const int vatom = 0);
|
||||
const int eatom = 0, const int vatom = 0,
|
||||
const int rflag = 0);
|
||||
inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
|
||||
float *ev_in, const int offload,
|
||||
const int eatom = 0, const int vatom = 0);
|
||||
const int eatom = 0, const int vatom = 0,
|
||||
const int rflag = 0);
|
||||
inline void get_buffern(const int offload, int &nlocal, int &nall,
|
||||
int &minlocal);
|
||||
|
||||
@ -139,6 +150,15 @@ class FixIntel : public Fix {
|
||||
int _overflow_flag[5];
|
||||
_alignvar(int _off_overflow_flag[5],64);
|
||||
int _allow_separate_buffers, _offload_ghost;
|
||||
|
||||
IntelBuffers<float,float>::vec3_acc_t *_force_array_s;
|
||||
IntelBuffers<float,double>::vec3_acc_t *_force_array_m;
|
||||
IntelBuffers<double,double>::vec3_acc_t *_force_array_d;
|
||||
float *_ev_array_s;
|
||||
double *_ev_array_d;
|
||||
int _results_eatom, _results_vatom;
|
||||
int _need_reduce;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
double _balance_pair_time, _balance_other_time;
|
||||
int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
|
||||
@ -168,6 +188,9 @@ class FixIntel : public Fix {
|
||||
_alignvar(double _stopwatch_offload_neighbor[1],64);
|
||||
_alignvar(double _stopwatch_offload_pair[1],64);
|
||||
|
||||
template <class ft>
|
||||
void reduce_results(ft * _noalias const f_in);
|
||||
|
||||
template <class ft, class acc_t>
|
||||
inline void add_results(const ft * _noalias const f_in,
|
||||
const acc_t * _noalias const ev_global,
|
||||
@ -226,7 +249,8 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
|
||||
|
||||
void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
||||
double *ev_in, const int offload,
|
||||
const int eatom, const int vatom) {
|
||||
const int eatom, const int vatom,
|
||||
const int rflag) {
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
_off_results_eatom = eatom;
|
||||
@ -237,11 +261,23 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
add_results(f_in, ev_in, eatom, vatom, 0);
|
||||
|
||||
_force_array_d = f_in;
|
||||
_ev_array_d = ev_in;
|
||||
_results_eatom = eatom;
|
||||
_results_vatom = vatom;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
|
||||
#endif
|
||||
|
||||
if (_overflow_flag[LMP_OVERFLOW])
|
||||
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_sync_at_pair) sync_coprocessor();
|
||||
if (_sync_at_pair) {
|
||||
sync();
|
||||
sync_coprocessor();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -249,7 +285,8 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
||||
|
||||
void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
||||
double *ev_in, const int offload,
|
||||
const int eatom, const int vatom) {
|
||||
const int eatom, const int vatom,
|
||||
const int rflag) {
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
_off_results_eatom = eatom;
|
||||
@ -260,11 +297,23 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
add_results(f_in, ev_in, eatom, vatom, 0);
|
||||
|
||||
_force_array_m = f_in;
|
||||
_ev_array_d = ev_in;
|
||||
_results_eatom = eatom;
|
||||
_results_vatom = vatom;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
|
||||
#endif
|
||||
|
||||
if (_overflow_flag[LMP_OVERFLOW])
|
||||
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_sync_at_pair) sync_coprocessor();
|
||||
if (_sync_at_pair) {
|
||||
sync();
|
||||
sync_coprocessor();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -272,7 +321,8 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
||||
|
||||
void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
|
||||
float *ev_in, const int offload,
|
||||
const int eatom, const int vatom) {
|
||||
const int eatom, const int vatom,
|
||||
const int rflag) {
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
_off_results_eatom = eatom;
|
||||
@ -283,161 +333,28 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
add_results(f_in, ev_in, eatom, vatom, 0);
|
||||
|
||||
_force_array_s = f_in;
|
||||
_ev_array_s = ev_in;
|
||||
_results_eatom = eatom;
|
||||
_results_vatom = vatom;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
|
||||
#endif
|
||||
|
||||
if (_overflow_flag[LMP_OVERFLOW])
|
||||
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_sync_at_pair) sync_coprocessor();
|
||||
if (_sync_at_pair) {
|
||||
sync();
|
||||
sync_coprocessor();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class ft, class acc_t>
|
||||
void FixIntel::add_results(const ft * _noalias const f_in,
|
||||
const acc_t * _noalias const ev_global,
|
||||
const int eatom, const int vatom,
|
||||
const int offload) {
|
||||
start_watch(TIME_PACK);
|
||||
int f_length;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_separate_buffers) {
|
||||
if (offload) {
|
||||
add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
|
||||
if (force->newton_pair) {
|
||||
const acc_t * _noalias const enull = 0;
|
||||
int offset = _offload_nlocal;
|
||||
if (atom->torque) offset *= 2;
|
||||
add_oresults(f_in + offset, enull, eatom, vatom,
|
||||
_offload_min_ghost, _offload_nghost);
|
||||
}
|
||||
} else {
|
||||
add_oresults(f_in, ev_global, eatom, vatom,
|
||||
_host_min_local, _host_used_local);
|
||||
if (force->newton_pair) {
|
||||
const acc_t * _noalias const enull = 0;
|
||||
int offset = _host_used_local;
|
||||
if (atom->torque) offset *= 2;
|
||||
add_oresults(f_in + offset, enull, eatom,
|
||||
vatom, _host_min_ghost, _host_used_ghost);
|
||||
}
|
||||
}
|
||||
stop_watch(TIME_PACK);
|
||||
return;
|
||||
}
|
||||
if (force->newton_pair && (_offload_noghost == 0 || offload == 0))
|
||||
f_length = atom->nlocal + atom->nghost;
|
||||
else
|
||||
f_length = atom->nlocal;
|
||||
#else
|
||||
if (force->newton_pair)
|
||||
f_length = atom->nlocal + atom->nghost;
|
||||
else
|
||||
f_length = atom->nlocal;
|
||||
#endif
|
||||
|
||||
add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
|
||||
stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class ft, class acc_t>
|
||||
void FixIntel::add_oresults(const ft * _noalias const f_in,
|
||||
const acc_t * _noalias const ev_global,
|
||||
const int eatom, const int vatom,
|
||||
const int out_offset, const int nall) {
|
||||
lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
|
||||
if (atom->torque) {
|
||||
if (f_in[1].w)
|
||||
if (f_in[1].w == 1)
|
||||
error->all(FLERR,"Bad matrix inversion in mldivide3");
|
||||
else
|
||||
error->all(FLERR,
|
||||
"Sphere particles not yet supported for gayberne/intel");
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
const int tid = omp_get_thread_num();
|
||||
#else
|
||||
const int tid = 0;
|
||||
#endif
|
||||
int ifrom, ito;
|
||||
IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
|
||||
if (atom->torque) {
|
||||
int ii = ifrom * 2;
|
||||
lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
|
||||
out_offset;
|
||||
if (eatom) {
|
||||
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[ii].x;
|
||||
f[i].y += f_in[ii].y;
|
||||
f[i].z += f_in[ii].z;
|
||||
lmp_eatom[i] += f_in[ii].w;
|
||||
tor[i].x += f_in[ii+1].x;
|
||||
tor[i].y += f_in[ii+1].y;
|
||||
tor[i].z += f_in[ii+1].z;
|
||||
ii += 2;
|
||||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[ii].x;
|
||||
f[i].y += f_in[ii].y;
|
||||
f[i].z += f_in[ii].z;
|
||||
tor[i].x += f_in[ii+1].x;
|
||||
tor[i].y += f_in[ii+1].y;
|
||||
tor[i].z += f_in[ii+1].z;
|
||||
ii += 2;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (eatom) {
|
||||
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[i].x;
|
||||
f[i].y += f_in[i].y;
|
||||
f[i].z += f_in[i].z;
|
||||
lmp_eatom[i] += f_in[i].w;
|
||||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[i].x;
|
||||
f[i].y += f_in[i].y;
|
||||
f[i].z += f_in[i].z;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ev_global != NULL) {
|
||||
force->pair->eng_vdwl += ev_global[0];
|
||||
force->pair->eng_coul += ev_global[1];
|
||||
force->pair->virial[0] += ev_global[2];
|
||||
force->pair->virial[1] += ev_global[3];
|
||||
force->pair->virial[2] += ev_global[4];
|
||||
force->pair->virial[3] += ev_global[5];
|
||||
force->pair->virial[4] += ev_global[6];
|
||||
force->pair->virial[5] += ev_global[7];
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
@ -491,70 +408,6 @@ void FixIntel::set_neighbor_host_sizes() {
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class ft, class acc_t>
|
||||
void FixIntel::add_off_results(const ft * _noalias const f_in,
|
||||
const acc_t * _noalias const ev_global) {
|
||||
if (_offload_balance < 0.0)
|
||||
_balance_other_time = MPI_Wtime() - _balance_other_time;
|
||||
|
||||
start_watch(TIME_OFFLOAD_WAIT);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (neighbor->ago == 0) {
|
||||
#pragma offload_wait target(mic:_cop) wait(atom->tag, f_in)
|
||||
} else {
|
||||
#pragma offload_wait target(mic:_cop) wait(f_in)
|
||||
}
|
||||
#endif
|
||||
double wait_time = stop_watch(TIME_OFFLOAD_WAIT);
|
||||
|
||||
if (neighbor->ago == 0) {
|
||||
if (_off_overflow_flag[LMP_OVERFLOW])
|
||||
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
|
||||
_offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1;
|
||||
_offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN];
|
||||
_offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 -
|
||||
_offload_min_ghost;
|
||||
if (_offload_nghost < 0) _offload_nghost = 0;
|
||||
_offload_nall = _offload_nlocal + _offload_nghost;
|
||||
_offload_nlocal;
|
||||
}
|
||||
|
||||
int nlocal = atom->nlocal;
|
||||
// Load balance?
|
||||
if (_offload_balance < 0.0) {
|
||||
if (neighbor->ago == 0)
|
||||
_balance_pair = _balance_neighbor;
|
||||
double mic_time;
|
||||
mic_time = *_stopwatch_offload_pair;
|
||||
if (_balance_pair_time + _balance_other_time < mic_time) {
|
||||
double ft = _balance_pair_time + _balance_other_time + wait_time -
|
||||
mic_time;
|
||||
_balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed +
|
||||
INTEL_LB_MEAN_WEIGHT * ft;
|
||||
}
|
||||
|
||||
double ctps = _balance_pair_time / (1.0-_balance_pair);
|
||||
double otps = mic_time / _balance_pair;
|
||||
double new_balance = (ctps + _balance_other_time - _balance_fixed) /
|
||||
(otps + ctps);
|
||||
if (new_balance < 0.01) new_balance = 0.01;
|
||||
else if (new_balance > 0.99) new_balance = 0.99;
|
||||
_balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor +
|
||||
INTEL_LB_MEAN_WEIGHT * new_balance;
|
||||
}
|
||||
|
||||
#ifdef TIME_BALANCE
|
||||
start_watch(TIME_IMBALANCE);
|
||||
MPI_Barrier(_real_space_comm);
|
||||
stop_watch(TIME_IMBALANCE);
|
||||
#endif
|
||||
acc_timers();
|
||||
if (atom->torque)
|
||||
if (f_in[1].w < 0.0)
|
||||
error->all(FLERR, "Bad matrix inversion in mldivide3");
|
||||
add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
@ -653,4 +506,13 @@ W: More MPI tasks/OpenMP threads than available cores
|
||||
Using more MPI tasks/OpenMP threads than available cores will typically
|
||||
decrease performance.
|
||||
|
||||
E: USER-INTEL package requires same setting for newton bond and non-bond.
|
||||
|
||||
The newton setting must be the same for both pairwise and bonded forces.
|
||||
|
||||
E: Intel styles for bond/angle/dihedral/improper require intel pair style."
|
||||
|
||||
You cannot use the USER-INTEL package for bond calculations without a
|
||||
USER-INTEL supported pair style.
|
||||
|
||||
*/
|
||||
|
||||
378
src/USER-INTEL/improper_harmonic_intel.cpp
Normal file
378
src/USER-INTEL/improper_harmonic_intel.cpp
Normal file
@ -0,0 +1,378 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <mpi.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include "improper_harmonic_intel.h"
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "neighbor.h"
|
||||
#include "domain.h"
|
||||
#include "force.h"
|
||||
#include "update.h"
|
||||
#include "math_const.h"
|
||||
#include "memory.h"
|
||||
#include "modify.h"
|
||||
#include "suffix.h"
|
||||
#include "error.h"
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
using namespace MathConst;
|
||||
|
||||
#define PTOLERANCE (flt_t)1.05
|
||||
#define MTOLERANCE (flt_t)-1.05
|
||||
#define SMALL (flt_t)0.001
|
||||
typedef struct { int a,b,c,d,t; } int5_t;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) :
|
||||
ImproperHarmonic(lmp)
|
||||
{
|
||||
suffix_flag |= Suffix::INTEL;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
ImproperHarmonicIntel::~ImproperHarmonicIntel()
|
||||
{
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void ImproperHarmonicIntel::compute(int eflag, int vflag)
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_use_base) {
|
||||
ImproperHarmonic::compute(eflag, vflag);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||
force_const_single);
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||
force_const_double);
|
||||
else
|
||||
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
|
||||
force_const_single);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void ImproperHarmonicIntel::compute(int eflag, int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||
else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<0,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void ImproperHarmonicIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
const int inum = neighbor->nimproperlist;
|
||||
if (inum == 0) return;
|
||||
|
||||
ATOM_T * _noalias const x = buffers->get_x(0);
|
||||
const int nlocal = atom->nlocal;
|
||||
const int nall = nlocal + atom->nghost;
|
||||
|
||||
int f_stride;
|
||||
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
|
||||
else f_stride = buffers->get_stride(nlocal);
|
||||
|
||||
int tc;
|
||||
FORCE_T * _noalias f_start;
|
||||
acc_t * _noalias ev_global;
|
||||
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oeimproper = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,fc) \
|
||||
reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
const int5_t * _noalias const improperlist =
|
||||
(int5_t *) neighbor->improperlist[0];
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
const int i1 = improperlist[n].a;
|
||||
const int i2 = improperlist[n].b;
|
||||
const int i3 = improperlist[n].c;
|
||||
const int i4 = improperlist[n].d;
|
||||
const int type = improperlist[n].t;
|
||||
|
||||
// geometry of 4-body
|
||||
|
||||
const flt_t vb1x = x[i1].x - x[i2].x;
|
||||
const flt_t vb1y = x[i1].y - x[i2].y;
|
||||
const flt_t vb1z = x[i1].z - x[i2].z;
|
||||
|
||||
const flt_t vb2x = x[i3].x - x[i2].x;
|
||||
const flt_t vb2y = x[i3].y - x[i2].y;
|
||||
const flt_t vb2z = x[i3].z - x[i2].z;
|
||||
|
||||
const flt_t vb3x = x[i4].x - x[i3].x;
|
||||
const flt_t vb3y = x[i4].y - x[i3].y;
|
||||
const flt_t vb3z = x[i4].z - x[i3].z;
|
||||
|
||||
const flt_t ss1 = (flt_t)1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
|
||||
const flt_t ss2 = (flt_t)1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
|
||||
const flt_t ss3 = (flt_t)1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
|
||||
|
||||
const flt_t r1 = sqrt(ss1);
|
||||
const flt_t r2 = sqrt(ss2);
|
||||
const flt_t r3 = sqrt(ss3);
|
||||
|
||||
// sin and cos of angle
|
||||
|
||||
const flt_t c0 = (vb1x * vb3x + vb1y * vb3y + vb1z * vb3z) * r1 * r3;
|
||||
const flt_t c1 = (vb1x * vb2x + vb1y * vb2y + vb1z * vb2z) * r1 * r2;
|
||||
const flt_t c2 = -(vb3x * vb2x + vb3y * vb2y + vb3z * vb2z) * r3 * r2;
|
||||
|
||||
flt_t s1 = 1.0 - c1*c1;
|
||||
if (s1 < SMALL) s1 = SMALL;
|
||||
s1 = (flt_t)1.0 / s1;
|
||||
|
||||
flt_t s2 = (flt_t)1.0 - c2*c2;
|
||||
if (s2 < SMALL) s2 = SMALL;
|
||||
s2 = (flt_t)1.0 / s2;
|
||||
|
||||
flt_t s12 = sqrt(s1*s2);
|
||||
flt_t c = (c1*c2 + c0) * s12;
|
||||
|
||||
// error check
|
||||
|
||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||
int me;
|
||||
MPI_Comm_rank(world,&me);
|
||||
if (screen) {
|
||||
char str[128];
|
||||
sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
|
||||
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
||||
TAGINT_FORMAT " " TAGINT_FORMAT,
|
||||
me,update->ntimestep,
|
||||
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
||||
error->warning(FLERR,str,0);
|
||||
fprintf(screen," 1st atom: %d %g %g %g\n",
|
||||
me,x[i1].x,x[i1].y,x[i1].z);
|
||||
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
||||
me,x[i2].x,x[i2].y,x[i2].z);
|
||||
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
||||
me,x[i3].x,x[i3].y,x[i3].z);
|
||||
fprintf(screen," 4th atom: %d %g %g %g\n",
|
||||
me,x[i4].x,x[i4].y,x[i4].z);
|
||||
}
|
||||
}
|
||||
|
||||
if (c > (flt_t)1.0) c = (flt_t)1.0;
|
||||
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
|
||||
|
||||
flt_t s = sqrt((flt_t)1.0 - c*c);
|
||||
if (s < SMALL) s = SMALL;
|
||||
|
||||
// force & energy
|
||||
|
||||
const flt_t domega = acos(c) - fc.fc[type].chi;
|
||||
flt_t a;
|
||||
a = fc.fc[type].k * domega;
|
||||
|
||||
flt_t eimproper;
|
||||
if (EFLAG) eimproper = a*domega;
|
||||
|
||||
a = -a * (flt_t)2.0/s;
|
||||
c = c * a;
|
||||
s12 = s12 * a;
|
||||
const flt_t a11 = c*ss1*s1;
|
||||
const flt_t a22 = -ss2 * ((flt_t)2.0*c0*s12 - c*(s1+s2));
|
||||
const flt_t a33 = c*ss3*s2;
|
||||
const flt_t a12 = -r1*r2*(c1*c*s1 + c2*s12);
|
||||
const flt_t a13 = -r1*r3*s12;
|
||||
const flt_t a23 = r2*r3*(c2*c*s2 + c1*s12);
|
||||
|
||||
const flt_t sx2 = a22*vb2x + a23*vb3x + a12*vb1x;
|
||||
const flt_t sy2 = a22*vb2y + a23*vb3y + a12*vb1y;
|
||||
const flt_t sz2 = a22*vb2z + a23*vb3z + a12*vb1z;
|
||||
|
||||
const flt_t f1x = a12*vb2x + a13*vb3x + a11*vb1x;
|
||||
const flt_t f1y = a12*vb2y + a13*vb3y + a11*vb1y;
|
||||
const flt_t f1z = a12*vb2z + a13*vb3z + a11*vb1z;
|
||||
|
||||
const flt_t f2x = -sx2 - f1x;
|
||||
const flt_t f2y = -sy2 - f1y;
|
||||
const flt_t f2z = -sz2 - f1z;
|
||||
|
||||
const flt_t f4x = a23*vb2x + a33*vb3x + a13*vb1x;
|
||||
const flt_t f4y = a23*vb2y + a33*vb3y + a13*vb1y;
|
||||
const flt_t f4z = a23*vb2z + a33*vb3z + a13*vb1z;
|
||||
|
||||
const flt_t f3x = sx2 - f4x;
|
||||
const flt_t f3y = sy2 - f4y;
|
||||
const flt_t f3z = sz2 - f4z;
|
||||
|
||||
// apply force to each of 4 atoms
|
||||
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x += f2x;
|
||||
f[i2].y += f2y;
|
||||
f[i2].z += f2z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i4 < nlocal) {
|
||||
f[i4].x += f4x;
|
||||
f[i4].y += f4y;
|
||||
f[i4].z += f4z;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y,
|
||||
vb3z, oeimproper, f, NEWTON_BOND, nlocal,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
}
|
||||
} // for n
|
||||
} // omp parallel
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
energy += oeimproper;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void ImproperHarmonicIntel::init()
|
||||
{
|
||||
ImproperHarmonic::init();
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
if (ifix < 0)
|
||||
error->all(FLERR,
|
||||
"The 'package intel' command is required for /intel styles");
|
||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
_use_base = 0;
|
||||
if (fix->offload_balance() != 0.0) {
|
||||
_use_base = 1;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
fix->bond_init_check();
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||
else
|
||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
const int bp1 = atom->nimpropertypes + 1;
|
||||
fc.set_ntypes(bp1,memory);
|
||||
|
||||
for (int i = 0; i < bp1; i++) {
|
||||
fc.fc[i].k = k[i];
|
||||
fc.fc[i].chi = chi[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t>
|
||||
void ImproperHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
|
||||
Memory *memory) {
|
||||
if (nimproper != _nimpropertypes) {
|
||||
if (_nimpropertypes > 0)
|
||||
_memory->destroy(fc);
|
||||
|
||||
if (nimproper > 0)
|
||||
_memory->create(fc,nimproper,"improperharmonicintel.fc");
|
||||
}
|
||||
_nimpropertypes = nimproper;
|
||||
_memory = memory;
|
||||
}
|
||||
94
src/USER-INTEL/improper_harmonic_intel.h
Normal file
94
src/USER-INTEL/improper_harmonic_intel.h
Normal file
@ -0,0 +1,94 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef IMPROPER_CLASS
|
||||
|
||||
ImproperStyle(harmonic/intel,ImproperHarmonicIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_IMPROPER_HARMONIC_INTEL_H
|
||||
#define LMP_IMPROPER_HARMONIC_INTEL_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include "improper_harmonic.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class ImproperHarmonicIntel : public ImproperHarmonic {
|
||||
public:
|
||||
ImproperHarmonicIntel(class LAMMPS *);
|
||||
virtual ~ImproperHarmonicIntel();
|
||||
virtual void compute(int, int);
|
||||
virtual void init();
|
||||
|
||||
protected:
|
||||
FixIntel *fix;
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <class flt_t, class acc_t>
|
||||
void pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t, acc_t> *buffers);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _use_base;
|
||||
#endif
|
||||
|
||||
template <class flt_t>
|
||||
class ForceConst {
|
||||
public:
|
||||
typedef struct { flt_t k, chi; } fc_packed1;
|
||||
|
||||
fc_packed1 *fc;
|
||||
|
||||
ForceConst() : _nimpropertypes(0) {}
|
||||
~ForceConst() { set_ntypes(0, NULL); }
|
||||
|
||||
void set_ntypes(const int nimpropertypes, Memory *memory);
|
||||
|
||||
private:
|
||||
int _nimpropertypes;
|
||||
Memory *_memory;
|
||||
};
|
||||
ForceConst<float> force_const_single;
|
||||
ForceConst<double> force_const_double;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* ERROR/WARNING messages:
|
||||
|
||||
W: Improper problem: %d %ld %d %d %d %d
|
||||
|
||||
Conformation of the 4 listed improper atoms is extreme; you may want
|
||||
to check your simulation geometry.
|
||||
|
||||
E: Incorrect args for improper coefficients
|
||||
|
||||
Self-explanatory. Check the input script or data file.
|
||||
|
||||
*/
|
||||
@ -319,13 +319,10 @@ template <class flt_t, class acc_t>
|
||||
void IntelBuffers<flt_t, acc_t>::free_nbor_list()
|
||||
{
|
||||
if (_list_alloc_atoms > 0) {
|
||||
lmp->memory->destroy(_list_alloc);
|
||||
_list_alloc_atoms = 0;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_off_list_alloc) {
|
||||
int * list_alloc = _list_alloc;
|
||||
int * special_flag = lmp->neighbor->special_flag_alloc();
|
||||
int * special_flag = _off_map_special_flag;
|
||||
int * stencil = _off_map_stencil;
|
||||
if (list_alloc != 0 && special_flag != 0 && stencil != 0) {
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
@ -335,6 +332,8 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
|
||||
_off_list_alloc = false;
|
||||
}
|
||||
#endif
|
||||
lmp->memory->destroy(_list_alloc);
|
||||
_list_alloc_atoms = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -350,7 +349,7 @@ void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
|
||||
free_nbor_list();
|
||||
_list_alloc_atoms = 1.10 * nlocal;
|
||||
int nt = MAX(nthreads, _off_threads);
|
||||
int list_alloc_size = (_list_alloc_atoms + nt + pack_width - 1) *
|
||||
int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) *
|
||||
get_max_nbors();
|
||||
lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
@ -365,6 +364,7 @@ void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
|
||||
in(stencil:length(list->maxstencil) alloc_if(1) free_if(0)) \
|
||||
nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0))
|
||||
_off_map_stencil = stencil;
|
||||
_off_map_special_flag = special_flag;
|
||||
_off_list_alloc = true;
|
||||
}
|
||||
}
|
||||
@ -406,6 +406,11 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(ccachex,ccachey,ccachez,ccachew:alloc_if(0) free_if(1)) \
|
||||
nocopy(ccachei,ccachej:alloc_if(0) free_if(1))
|
||||
|
||||
#ifdef LMP_USE_AVXCD
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(ccachef:alloc_if(0) free_if(1))
|
||||
#endif
|
||||
}
|
||||
_off_ccache = 0;
|
||||
#endif
|
||||
@ -451,7 +456,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||
lmp->memory->create(_ccachei, vsize, "_ccachei");
|
||||
lmp->memory->create(_ccachej, vsize, "_ccachej");
|
||||
#ifdef LMP_USE_AVXCD
|
||||
IP_PRE_get_stride(_ccache_stride3, nsize * 3, esize, 0);
|
||||
IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
|
||||
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
|
||||
#endif
|
||||
memset(_ccachej, 0, vsize * sizeof(int));
|
||||
@ -473,6 +478,12 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
|
||||
in(ccachej:length(vsize) alloc_if(1) free_if(0))
|
||||
}
|
||||
#ifdef LMP_USE_AVXCD
|
||||
if (ccachef != NULL) {
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(ccachef:length(_ccache_stride3 * nt) alloc_if(1) free_if(0))
|
||||
}
|
||||
#endif
|
||||
_off_ccache = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -298,7 +298,7 @@ class IntelBuffers {
|
||||
quat_t *_host_quat;
|
||||
vec3_acc_t *_off_f;
|
||||
int _off_map_nmax, _off_map_maxhead, _cop, _off_ccache;
|
||||
int *_off_map_ilist;
|
||||
int *_off_map_ilist, *_off_map_special_flag;
|
||||
int *_off_map_stencil, *_off_map_special, *_off_map_nspecial, *_off_map_tag;
|
||||
int *_off_map_binhead, *_off_map_bins, *_off_map_numneigh;
|
||||
bool _off_list_alloc;
|
||||
|
||||
@ -383,11 +383,78 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_bond(eflag, eatom, vflag, ebond, i1, i2, fbond, \
|
||||
delx, dely, delz, obond, force, newton, \
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5) \
|
||||
{ \
|
||||
flt_t ev_pre; \
|
||||
if (newton) ev_pre = (flt_t)1.0; \
|
||||
else { \
|
||||
ev_pre = (flt_t)0.0; \
|
||||
if (i1 < nlocal) ev_pre += (flt_t)0.5; \
|
||||
if (i2 < nlocal) ev_pre += (flt_t)0.5; \
|
||||
} \
|
||||
\
|
||||
if (eflag) { \
|
||||
oebond += ev_pre * ebond; \
|
||||
if (eatom) { \
|
||||
flt_t halfeng = ebond * (flt_t)0.5; \
|
||||
if (newton || i1 < nlocal) f[i1].w += halfeng; \
|
||||
if (newton || i2 < nlocal) f[i2].w += halfeng; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (vflag) { \
|
||||
ov0 += ev_pre * (delx * delx * fbond); \
|
||||
ov1 += ev_pre * (dely * dely * fbond); \
|
||||
ov2 += ev_pre * (delz * delz * fbond); \
|
||||
ov3 += ev_pre * (delx * dely * fbond); \
|
||||
ov4 += ev_pre * (delx * delz * fbond); \
|
||||
ov5 += ev_pre * (dely * delz * fbond); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_angle(eflag, eatom, vflag, eangle, i1, i2, i3, \
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, delx1, \
|
||||
dely1, delz1, delx2, dely2, delz2, \
|
||||
oeangle, force, newton, nlocal, ov0, ov1, \
|
||||
ov2, ov3, ov4, ov5) \
|
||||
{ \
|
||||
flt_t ev_pre; \
|
||||
if (newton) ev_pre = (flt_t)1.0; \
|
||||
else { \
|
||||
ev_pre = (flt_t)0.0; \
|
||||
if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333; \
|
||||
if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333; \
|
||||
if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333; \
|
||||
} \
|
||||
\
|
||||
if (eflag) { \
|
||||
oeangle += ev_pre * eangle; \
|
||||
if (eatom) { \
|
||||
flt_t thirdeng = eangle * (flt_t)0.3333333333333333; \
|
||||
if (newton || i1 < nlocal) f[i1].w += thirdeng; \
|
||||
if (newton || i2 < nlocal) f[i2].w += thirdeng; \
|
||||
if (newton || i3 < nlocal) f[i3].w += thirdeng; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (vflag) { \
|
||||
ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \
|
||||
ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \
|
||||
ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \
|
||||
ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \
|
||||
ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \
|
||||
ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, \
|
||||
f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, \
|
||||
vb3x, vb3y, vb3z,oedihedral, force, \
|
||||
newton, nlocal) \
|
||||
newton, nlocal, ov0, ov1, ov2, ov3, ov4, \
|
||||
ov5) \
|
||||
{ \
|
||||
flt_t ev_pre; \
|
||||
if (newton) ev_pre = (flt_t)1.0; \
|
||||
@ -411,12 +478,12 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
\
|
||||
if (vflag) { \
|
||||
sv0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \
|
||||
sv1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \
|
||||
sv2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \
|
||||
sv3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \
|
||||
sv4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \
|
||||
sv5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \
|
||||
ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \
|
||||
ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \
|
||||
ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \
|
||||
ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \
|
||||
ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \
|
||||
ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -459,7 +526,7 @@ inline double MIC_Wtime() {
|
||||
|
||||
#define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom, \
|
||||
nall, nlocal, minlocal, nthreads, \
|
||||
f_start, f_stride, x) \
|
||||
f_start, f_stride, x, offload) \
|
||||
{ \
|
||||
int o_range; \
|
||||
if (newton) \
|
||||
|
||||
@ -110,6 +110,16 @@ namespace ip_simd {
|
||||
return _mm512_maskz_mov_pd(m, one);
|
||||
}
|
||||
|
||||
inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m,
|
||||
const SIMD_float &one) {
|
||||
return _mm512_mask_mov_ps(src,m,one);
|
||||
}
|
||||
|
||||
inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m,
|
||||
const SIMD_double &one) {
|
||||
return _mm512_mask_mov_pd(src,m,one);
|
||||
}
|
||||
|
||||
// -------- Load Operations
|
||||
|
||||
inline SIMD_int SIMD_load(const int *p) {
|
||||
@ -352,6 +362,14 @@ namespace ip_simd {
|
||||
return _mm512_rcp28_pd(one);
|
||||
}
|
||||
|
||||
inline SIMD_float SIMD_rcpz(const SIMD_mask &m, const SIMD_float &one) {
|
||||
return _mm512_maskz_rcp28_ps(m, one);
|
||||
}
|
||||
|
||||
inline SIMD_double SIMD_rcpz(const SIMD_mask &m, const SIMD_double &one) {
|
||||
return _mm512_maskz_rcp28_pd(m, one);
|
||||
}
|
||||
|
||||
inline SIMD_float SIMD_sqrt(const SIMD_float &one) {
|
||||
return _mm512_sqrt_ps(one);
|
||||
}
|
||||
@ -456,6 +474,40 @@ namespace ip_simd {
|
||||
return _mm512_cmple_pd_mask(SIMD_set(one), two);
|
||||
}
|
||||
|
||||
inline SIMD_mask operator>(const SIMD_int &one, const SIMD_int &two) {
|
||||
return _mm512_cmpgt_epi32_mask(one,two);
|
||||
}
|
||||
|
||||
inline SIMD_mask operator>(const SIMD_float &one, const SIMD_float &two) {
|
||||
return _mm512_cmplt_ps_mask(two,one);
|
||||
}
|
||||
|
||||
inline SIMD_mask operator>(const SIMD_double &one, const SIMD_double &two) {
|
||||
return _mm512_cmplt_pd_mask(two,one);
|
||||
}
|
||||
|
||||
inline SIMD_mask operator==(const SIMD_int &one, const SIMD_int &two) {
|
||||
return _mm512_cmpeq_epi32_mask(one,two);
|
||||
}
|
||||
|
||||
inline SIMD_mask operator==(const SIMD_float &one, const SIMD_float &two) {
|
||||
return _mm512_cmpeq_ps_mask(one,two);
|
||||
}
|
||||
|
||||
inline SIMD_mask operator==(const SIMD_double &one, const SIMD_double &two) {
|
||||
return _mm512_cmpeq_pd_mask(one,two);
|
||||
}
|
||||
|
||||
// ------- Typecast operations
|
||||
|
||||
inline void SIMD_cast(const SIMD_int &one, SIMD_float &two) {
|
||||
two = _mm512_cvtepi32_ps(one);
|
||||
}
|
||||
|
||||
inline void SIMD_cast(const SIMD_int &one, SIMD_double &two) {
|
||||
two = _mm512_cvtepi32lo_pd(one);
|
||||
}
|
||||
|
||||
// ------- Reduction operations
|
||||
|
||||
inline int SIMD_max(const SIMD_int &i) {
|
||||
@ -482,6 +534,55 @@ namespace ip_simd {
|
||||
return _mm512_reduce_add_pd(i);
|
||||
}
|
||||
|
||||
// i indices should be positive
|
||||
inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
|
||||
SIMD_float &v1) {
|
||||
SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
|
||||
SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
|
||||
SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
|
||||
if (todo_mask) {
|
||||
SIMD_int lz = _mm512_lzcnt_epi32(cd);
|
||||
SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
|
||||
_mm512_lzcnt_epi32(cd));
|
||||
|
||||
while(todo_mask) {
|
||||
SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
|
||||
SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
|
||||
todo_bcast);
|
||||
SIMD_float am_perm;
|
||||
am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
|
||||
now_mask, lid, v1);
|
||||
v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
|
||||
todo_mask = _mm512_kxor(todo_mask, now_mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// i indices should be positive
|
||||
inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
|
||||
SIMD_double &v1) {
|
||||
SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
|
||||
SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
|
||||
SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
|
||||
if (todo_mask) {
|
||||
SIMD_int lz = _mm512_lzcnt_epi32(cd);
|
||||
SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
|
||||
_mm512_lzcnt_epi32(cd));
|
||||
lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid));
|
||||
|
||||
while(todo_mask) {
|
||||
SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
|
||||
SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
|
||||
todo_bcast);
|
||||
SIMD_double am_perm;
|
||||
am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
|
||||
now_mask, lid, v1);
|
||||
v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
|
||||
todo_mask = _mm512_kxor(todo_mask, now_mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// i indices should be positive
|
||||
inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i,
|
||||
SIMD_float &v1, SIMD_float &v2,
|
||||
@ -655,6 +756,166 @@ namespace ip_simd {
|
||||
_MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline SIMD_float SIMD_ev_add(const SIMD_float &one,
|
||||
const SIMD_float &two) {
|
||||
return _mm512_add_ps(one,two);
|
||||
}
|
||||
|
||||
inline SIMD_double SIMD_ev_add(const SIMD_double &one,
|
||||
const SIMD_double &two) {
|
||||
return _mm512_add_pd(one,two);
|
||||
}
|
||||
|
||||
inline SIMD_double SIMD_ev_add(const SIMD_double &one,
|
||||
const SIMD_float &two) {
|
||||
SIMD_double twod = _mm512_cvtps_pd(_mm512_castps512_ps256(two));
|
||||
SIMD_double ans = _mm512_add_pd(one,twod);
|
||||
twod = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||
_mm512_shuffle_f32x4(two,two,238)));
|
||||
return _mm512_add_pd(ans,twod);
|
||||
}
|
||||
|
||||
inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force,
|
||||
const SIMD_int &joffset, SIMD_float &eng) {
|
||||
SIMD_float jeng;
|
||||
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
|
||||
jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset,
|
||||
force, _MM_SCALE_1);
|
||||
jeng = jeng + eng;
|
||||
_mm512_mask_i32scatter_ps(force, rmask, joffset, jeng, _MM_SCALE_1);
|
||||
}
|
||||
|
||||
inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
|
||||
const SIMD_int &joffset, SIMD_double &eng) {
|
||||
SIMD_double jeng;
|
||||
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
|
||||
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
force, _MM_SCALE_2);
|
||||
jeng = jeng + eng;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
|
||||
const SIMD_int &joffset, SIMD_float &eng) {
|
||||
SIMD_double engd, jeng;
|
||||
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
|
||||
SIMD_conflict_pi_reduce1(rmask, joffset, engd);
|
||||
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
force, _MM_SCALE_2);
|
||||
jeng = jeng + engd;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
|
||||
|
||||
SIMD_mask rmask2 = rmask >> 8;
|
||||
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||
_mm512_shuffle_f32x4(eng,eng,238)));
|
||||
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
|
||||
SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
|
||||
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
|
||||
force, _MM_SCALE_2);
|
||||
jeng = jeng + engd;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force,
|
||||
const SIMD_int &joffset1, SIMD_float &eng) {
|
||||
}
|
||||
|
||||
inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force,
|
||||
const SIMD_int &joffset1, SIMD_double &eng) {
|
||||
SIMD_mask rmask = mask >> 8;
|
||||
SIMD_int joffset = _mm512_shuffle_i32x4(joffset1, joffset1, 238);
|
||||
|
||||
SIMD_double jeng;
|
||||
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
|
||||
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
force, _MM_SCALE_2);
|
||||
jeng = jeng + eng;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
|
||||
const SIMD_int &i, SIMD_float &fx,
|
||||
SIMD_float &fy, SIMD_float &fz) {
|
||||
SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
|
||||
SIMD_float jfrc;
|
||||
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
|
||||
_MM_SCALE_1);
|
||||
jfrc = jfrc + fx;
|
||||
_mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
|
||||
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
|
||||
_MM_SCALE_1);
|
||||
jfrc = jfrc + fy;
|
||||
_mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
|
||||
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
|
||||
_MM_SCALE_1);
|
||||
jfrc = jfrc + fz;
|
||||
_mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
|
||||
}
|
||||
|
||||
inline void SIMD_safe_jforce(const SIMD_mask &m, double *force,
|
||||
const SIMD_int &i, SIMD_double &fx,
|
||||
SIMD_double &fy, SIMD_double &fz) {
|
||||
SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
|
||||
SIMD_double jfrc;
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fx;
|
||||
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fy;
|
||||
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fz;
|
||||
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force,
|
||||
const SIMD_int &joffset, SIMD_float &amx,
|
||||
SIMD_float &amy, SIMD_float &amz) {
|
||||
SIMD_double amxd, amyd, amzd;
|
||||
amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
|
||||
amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
|
||||
amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
|
||||
SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
|
||||
SIMD_double jfrc;
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
force, _MM_SCALE_2);
|
||||
jfrc = jfrc + amxd;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
force + 1, _MM_SCALE_2);
|
||||
jfrc = jfrc + amyd;
|
||||
_mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
force + 2, _MM_SCALE_2);
|
||||
jfrc = jfrc + amzd;
|
||||
_mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2);
|
||||
|
||||
SIMD_mask rmask2 = rmask >> 8;
|
||||
amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||
_mm512_shuffle_f32x4(amx,amx,238)));
|
||||
amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||
_mm512_shuffle_f32x4(amy,amy,238)));
|
||||
amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||
_mm512_shuffle_f32x4(amz,amz,238)));
|
||||
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
|
||||
SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
|
||||
force, _MM_SCALE_2);
|
||||
jfrc = jfrc + amxd;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
|
||||
force + 1, _MM_SCALE_2);
|
||||
jfrc = jfrc + amyd;
|
||||
_mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
|
||||
force + 2, _MM_SCALE_2);
|
||||
jfrc = jfrc + amzd;
|
||||
_mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
|
||||
const SIMD_int &i, const SIMD_float &fx,
|
||||
const SIMD_float &fy, const SIMD_float &fz) {
|
||||
@ -1086,37 +1347,48 @@ namespace ip_simd {
|
||||
}
|
||||
}
|
||||
|
||||
inline void SIMD_acc_three(const SIMD_mask &hmask,
|
||||
const SIMD_float &facrad, const int eatom,
|
||||
SIMD_float &sevdwl, SIMD_float &fwtmp,
|
||||
SIMD_float &fjtmp, SIMD_float &fwtmp2,
|
||||
SIMD_float &fjtmp2) {
|
||||
inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad,
|
||||
const int eatom, SIMD_float &sevdwl,
|
||||
SIMD_float &fwtmp, SIMD_float &fjtmp,
|
||||
SIMD_float &fwtmp2, SIMD_float &fjtmp2,
|
||||
const SIMD_int &k, float *force) {
|
||||
sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
|
||||
if (eatom) {
|
||||
SIMD_float hevdwl = facrad * SIMD_set((float)0.33333333);
|
||||
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
|
||||
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
|
||||
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
|
||||
SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask,
|
||||
k, force + 3, _MM_SCALE_1);
|
||||
keng = keng + hevdwl;
|
||||
_mm512_mask_i32scatter_ps(force + 3, hmask, k, keng, _MM_SCALE_1);
|
||||
}
|
||||
}
|
||||
|
||||
inline void SIMD_acc_three(const SIMD_mask &hmask,
|
||||
const SIMD_double &facrad, const int eatom,
|
||||
SIMD_double &sevdwl, SIMD_double &fwtmp,
|
||||
SIMD_double &fjtmp, SIMD_double &fwtmp2,
|
||||
SIMD_double &fjtmp2) {
|
||||
inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_double &facrad,
|
||||
const int eatom, SIMD_double &sevdwl,
|
||||
SIMD_double &fwtmp, SIMD_double &fjtmp,
|
||||
SIMD_double &fwtmp2, SIMD_double &fjtmp2,
|
||||
const SIMD_int &k, double *force) {
|
||||
sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
|
||||
if (eatom) {
|
||||
SIMD_double hevdwl = facrad * SIMD_set((double)0.33333333);
|
||||
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
|
||||
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
|
||||
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
|
||||
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
|
||||
hmask, k, force + 3,
|
||||
_MM_SCALE_2);
|
||||
keng = keng + hevdwl;
|
||||
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
|
||||
}
|
||||
}
|
||||
|
||||
inline void SIMD_acc_three(const SIMD_mask &hmask,
|
||||
const SIMD_float &facrad, const int eatom,
|
||||
SIMD_double &sevdwl, SIMD_double &fwtmp,
|
||||
SIMD_double &fjtmp, SIMD_double &fwtmp2,
|
||||
SIMD_double &fjtmp2) {
|
||||
inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad,
|
||||
const int eatom, SIMD_double &sevdwl,
|
||||
SIMD_double &fwtmp, SIMD_double &fjtmp,
|
||||
SIMD_double &fwtmp2, SIMD_double &fjtmp2,
|
||||
const SIMD_int &k, double *force) {
|
||||
SIMD_double facradd;
|
||||
facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(facrad));
|
||||
sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facradd);
|
||||
@ -1124,6 +1396,12 @@ namespace ip_simd {
|
||||
SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333);
|
||||
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
|
||||
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
|
||||
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
|
||||
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
|
||||
hmask, k, force + 3,
|
||||
_MM_SCALE_2);
|
||||
keng = keng + hevdwl;
|
||||
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
|
||||
}
|
||||
SIMD_mask hmask2 = hmask >> 8;
|
||||
facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||
@ -1133,6 +1411,13 @@ namespace ip_simd {
|
||||
SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333);
|
||||
fwtmp2 = SIMD_add(fwtmp2, hmask2, fwtmp2, hevdwl);
|
||||
fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
|
||||
SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
|
||||
SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
|
||||
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
|
||||
hmask2, k2, force + 3,
|
||||
_MM_SCALE_2);
|
||||
keng = keng + hevdwl;
|
||||
_mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1372,7 +1657,9 @@ namespace ip_simd {
|
||||
|
||||
inline void SIMD_iforce_update(const SIMD_mask &m, float *force,
|
||||
const SIMD_int &i, const SIMD_float &fx,
|
||||
const SIMD_float &fy, const SIMD_float &fz) {
|
||||
const SIMD_float &fy, const SIMD_float &fz,
|
||||
const int EVFLAG, const int eatom,
|
||||
const SIMD_float &fwtmp) {
|
||||
SIMD_float jfrc;
|
||||
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
|
||||
_MM_SCALE_1);
|
||||
@ -1386,11 +1673,21 @@ namespace ip_simd {
|
||||
_MM_SCALE_1);
|
||||
jfrc = jfrc + fz;
|
||||
_mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
|
||||
if (EVFLAG) {
|
||||
if (eatom) {
|
||||
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
|
||||
_MM_SCALE_1);
|
||||
jfrc = jfrc + fwtmp;
|
||||
_mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void SIMD_iforce_update(const SIMD_mask &m, double *force,
|
||||
const SIMD_int &i, const SIMD_double &fx,
|
||||
const SIMD_double &fy, const SIMD_double &fz) {
|
||||
const SIMD_double &fy, const SIMD_double &fz,
|
||||
const int EVFLAG, const int eatom,
|
||||
const SIMD_double &fwtmp) {
|
||||
SIMD_double jfrc;
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
|
||||
_MM_SCALE_2);
|
||||
@ -1404,6 +1701,14 @@ namespace ip_simd {
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fz;
|
||||
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
|
||||
if (EVFLAG) {
|
||||
if (eatom) {
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
|
||||
force + 3, _MM_SCALE_2);
|
||||
jfrc = jfrc + fwtmp;
|
||||
_mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SW_GATHER_TEST
|
||||
|
||||
@ -369,6 +369,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
|
||||
const int list_size = (ito + tid + 1) * maxnbors;
|
||||
int ct = (ifrom + tid) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
int j, k, n, n2, itype, jtype, ibin;
|
||||
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
|
||||
@ -861,22 +862,23 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
int e_ito = ito;
|
||||
if (ito == num) {
|
||||
int imod = ito % swidth;
|
||||
if (imod) e_ito += swidth - e_ito;
|
||||
if (imod) e_ito += swidth - imod;
|
||||
}
|
||||
const int list_size = (e_ito + tid + 1) * maxnbors;
|
||||
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
||||
#else
|
||||
const int swidth = 1;
|
||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
const int list_size = (ito + tid + 1) * maxnbors;
|
||||
const int list_size = (ito + tid * 2 + 2) * maxnbors;
|
||||
#endif
|
||||
|
||||
int which;
|
||||
|
||||
int pack_offset = maxnbors * swidth;
|
||||
int ct = (ifrom + tid) * maxnbors;
|
||||
int ct = (ifrom + tid * 2) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
const int obound = pack_offset + maxnbors * 2;
|
||||
|
||||
int max_chunk = 0;
|
||||
int lane = 0;
|
||||
@ -934,6 +936,8 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
}
|
||||
}
|
||||
|
||||
if (raw_count > obound) *overflow = 1;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||
@ -994,7 +998,6 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
}
|
||||
}
|
||||
int ns = (n - lane) / swidth;
|
||||
if (ns > maxnbors || n2 > list_size) *overflow = 1;
|
||||
for (int u = pack_offset; u < n2; u++) {
|
||||
neighptr[n] = neighptr[u];
|
||||
n += swidth;
|
||||
@ -1011,7 +1014,6 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
#ifdef OUTER_CHUNK
|
||||
if (ns > max_chunk) max_chunk = ns;
|
||||
lane++;
|
||||
pack_offset -= maxnbors;
|
||||
if (lane == swidth) {
|
||||
ct += max_chunk * swidth;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
@ -1021,10 +1023,10 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
max_chunk = 0;
|
||||
pack_offset = maxnbors * swidth;
|
||||
lane = 0;
|
||||
if (ct + pack_offset + maxnbors > list_size) {
|
||||
if (ct + obound > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid) * maxnbors;
|
||||
ct = (ifrom + tid * 2) * maxnbors;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1034,10 +1036,10 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
const int edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
if (ct + pack_offset + maxnbors > list_size) {
|
||||
if (ct + obound > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid) * maxnbors;
|
||||
ct = (ifrom + tid * 2) * maxnbors;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -1965,15 +1967,14 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
int e_ito = ito;
|
||||
if (ito == num) {
|
||||
int imod = ito % pack_width;
|
||||
if (imod) e_ito += pack_width - e_ito;
|
||||
if (imod) e_ito += pack_width - imod;
|
||||
}
|
||||
const int list_size = (e_ito + tid + 1) * maxnbors;
|
||||
|
||||
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
||||
int which;
|
||||
|
||||
int pack_offset = maxnbors * pack_width;
|
||||
int ct = (ifrom + tid) * maxnbors;
|
||||
int ct = (ifrom + tid * 2) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
const int obound = pack_offset + maxnbors * 2;
|
||||
|
||||
int max_chunk = 0;
|
||||
int lane = 0;
|
||||
@ -2011,6 +2012,8 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
}
|
||||
}
|
||||
|
||||
if (raw_count > obound) *overflow = 1;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||
@ -2084,7 +2087,6 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
}
|
||||
}
|
||||
int ns = (n - lane) / pack_width;
|
||||
if (ns > maxnbors || n2 > list_size) *overflow = 1;
|
||||
atombin[i] = ns;
|
||||
for (int u = pack_offset; u < n2; u++) {
|
||||
neighptr[n] = neighptr[u];
|
||||
@ -2098,7 +2100,6 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
|
||||
if (ns > max_chunk) max_chunk = ns;
|
||||
lane++;
|
||||
pack_offset -= maxnbors;
|
||||
if (lane == pack_width) {
|
||||
ct += max_chunk * pack_width;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
@ -2108,10 +2109,10 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
max_chunk = 0;
|
||||
pack_offset = maxnbors * pack_width;
|
||||
lane = 0;
|
||||
if (ct + pack_offset + maxnbors > list_size) {
|
||||
if (ct + obound > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid) * maxnbors;
|
||||
ct = (ifrom + tid * 2) * maxnbors;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -358,12 +358,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
|
||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x);
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
@ -390,7 +396,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
|
||||
@ -410,12 +410,18 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
f[i].z += fztmp;
|
||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x);
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
@ -442,7 +448,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
|
||||
@ -321,12 +321,18 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
f[i].z += fztmp;
|
||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x);
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
@ -353,7 +359,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
|
||||
@ -793,10 +793,6 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
sizeof(FORCE_T));
|
||||
const int two_iito = iito * 2;
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
|
||||
acc_t *facc = &(f_start[0].x);
|
||||
const int sto = two_iito * 4;
|
||||
const int fst4 = f_stride * 4;
|
||||
@ -882,9 +878,9 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload,eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
fix->add_result_array(f_start, 0, offload, 0, 0, 2);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
@ -438,12 +438,17 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x);
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
@ -470,7 +475,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
|
||||
@ -405,12 +405,18 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
f[i].z += fztmp;
|
||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x);
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
@ -437,7 +443,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
|
||||
@ -349,12 +349,17 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x);
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
} // end omp
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
@ -381,7 +386,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
|
||||
@ -559,12 +559,18 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
f[i].z += fztmp;
|
||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x);
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
} // end omp
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
@ -590,7 +596,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
@ -677,7 +683,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
|
||||
in(overflow:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
|
||||
in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
|
||||
in(ccache_stride3) \
|
||||
@ -852,7 +858,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
|
||||
const int ejnum_max = SIMD_max(ejnum);
|
||||
const int ejnumhalf_max = SIMD_max(ejnumhalf);
|
||||
memset(tf, 0, ejnum_max * sizeof(double) * swidth * 3);
|
||||
memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3);
|
||||
for (int jj = 0; jj < ejnum_max; jj++) {
|
||||
SIMD_int ijtype;
|
||||
const int coffset = jj * swidth;
|
||||
@ -1006,9 +1012,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
tf + kcoffset * 3, swidth);
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
if (EFLAG) {
|
||||
SIMD_int k;
|
||||
if (eatom) {
|
||||
k = SIMD_load(tj + kcoffset);
|
||||
k = k << 4;
|
||||
}
|
||||
SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
|
||||
fwtmp2, fjtmp2);
|
||||
fwtmp2, fjtmp2, k, &(f[0].x));
|
||||
}
|
||||
SIMD_ev_tally_nbor3v(kmask, vflag, fjx, fjy, fjz, fkx, fky, fkz,
|
||||
delx, dely, delz, delr2x, delr2y, delr2z,
|
||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
@ -1021,8 +1033,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp,
|
||||
fjxtmp2, fjytmp2, fjztmp2);
|
||||
|
||||
//if (EFLAG)
|
||||
// if (eatom) f[j].w += fjtmp;
|
||||
if (EFLAG) {
|
||||
if (eatom) {
|
||||
SIMD_int j = SIMD_load(tj + coffset);
|
||||
j = j << 4;
|
||||
SIMD_jeng_update(jmask, &(f[0].x) + 3, j, fjtmp);
|
||||
if (is_same<flt_t,acc_t>::value == 0)
|
||||
SIMD_jeng_update_hi(jmask, &(f[0].x) + 3, j, fjtmp2);
|
||||
}
|
||||
}
|
||||
} // for jj first loop
|
||||
|
||||
for (int jj = 0; jj < ejnum_max; jj++) {
|
||||
@ -1066,15 +1085,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
} // for jj second loop
|
||||
|
||||
SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp);
|
||||
SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
|
||||
EVFLAG, eatom, fwtmp);
|
||||
if (is_same<flt_t,acc_t>::value == 0) {
|
||||
imask = imask >> 8;
|
||||
SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2,
|
||||
fztmp2);
|
||||
fztmp2, EVFLAG, eatom, fwtmp2);
|
||||
}
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) oevdwl += SIMD_sum(sevdwl);
|
||||
// f[i].w
|
||||
if (vflag == 1) {
|
||||
ov0 += SIMD_sum(sv0);
|
||||
ov1 += SIMD_sum(sv1);
|
||||
@ -1086,11 +1105,18 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
ilist = ilist + swidth;
|
||||
} // for ii
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, nlocal,
|
||||
minlocal, nthreads, f_start, f_stride, x);
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, nlocal,
|
||||
minlocal, nthreads, f_start, f_stride, x,
|
||||
offload);
|
||||
}
|
||||
} // end omp
|
||||
|
||||
if (EVFLAG) {
|
||||
@ -1117,7 +1143,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
|
||||
@ -384,12 +384,18 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x);
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
@ -419,7 +425,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom);
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
|
||||
|
||||
@ -73,11 +73,10 @@ void VerletIntel::init()
|
||||
if (nvlist_atom)
|
||||
error->all(FLERR,
|
||||
"Cannot currently get per-atom virials with Intel package.");
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
|
||||
ifix = modify->find_fix("package_intel");
|
||||
if (ifix >= 0) fix_intel = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
else fix_intel = 0;
|
||||
#endif
|
||||
|
||||
// set flags for what arrays to clear in force_clear()
|
||||
// need to clear additionals arrays if they exist
|
||||
@ -149,6 +148,8 @@ void VerletIntel::setup()
|
||||
else force->kspace->compute_dummy(eflag,vflag);
|
||||
}
|
||||
|
||||
if (fix_intel) fix_intel->sync();
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
sync_mode = 0;
|
||||
if (fix_intel) {
|
||||
@ -227,6 +228,8 @@ void VerletIntel::setup_minimal(int flag)
|
||||
else force->kspace->compute_dummy(eflag,vflag);
|
||||
}
|
||||
|
||||
if (fix_intel) fix_intel->sync();
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
sync_mode = 0;
|
||||
if (fix_intel) {
|
||||
@ -349,6 +352,8 @@ void VerletIntel::run(int n)
|
||||
timer->stamp(Timer::KSPACE);
|
||||
}
|
||||
|
||||
if (fix_intel) fix_intel->sync();
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (sync_mode == 1) {
|
||||
fix_intel->sync_coprocessor();
|
||||
|
||||
@ -21,9 +21,7 @@ IntegrateStyle(verlet/intel,VerletIntel)
|
||||
#define LMP_VERLET_INTEL_H
|
||||
|
||||
#include "integrate.h"
|
||||
#ifdef LMP_INTEL_OFFLOAD
|
||||
#include "fix_intel.h"
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
@ -42,8 +40,9 @@ class VerletIntel : public Integrate {
|
||||
int torqueflag,extraflag;
|
||||
|
||||
virtual void force_clear();
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
|
||||
FixIntel *fix_intel;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int sync_mode;
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -378,6 +378,8 @@ void VerletSplitIntel::run(int n)
|
||||
timer->stamp(Timer::BOND);
|
||||
}
|
||||
|
||||
if (fix_intel) fix_intel->sync();
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (sync_mode == 1) {
|
||||
fix_intel->sync_coprocessor();
|
||||
|
||||
@ -21,9 +21,6 @@ IntegrateStyle(verlet/split/intel,VerletSplitIntel)
|
||||
#define LMP_VERLET_SPLIT_INTEL_H
|
||||
|
||||
#include "verlet_intel.h"
|
||||
#ifdef LMP_INTEL_OFFLOAD
|
||||
#include "fix_intel.h"
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
|
||||
Reference in New Issue
Block a user