diff --git a/src/USER-INTEL/angle_charmm_intel.cpp b/src/USER-INTEL/angle_charmm_intel.cpp index fd9f6911e1..aafc765c6b 100644 --- a/src/USER-INTEL/angle_charmm_intel.cpp +++ b/src/USER-INTEL/angle_charmm_intel.cpp @@ -31,7 +31,8 @@ using namespace LAMMPS_NS; using namespace MathConst; -#define SMALL (flt_t)0.001 +#define SMALL2 (flt_t)0.000001 +#define INVSMALL (flt_t)1000.0 typedef struct { int a,b,c,t; } int4_t; /* ---------------------------------------------------------------------- */ @@ -162,7 +163,7 @@ void AngleCharmmIntel::eval(const int vflag, const flt_t delz1 = x[i1].z - x[i2].z; const flt_t rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1; - const flt_t r1 = sqrt(rsq1); + flt_t ir12 = (flt_t)1.0/sqrt(rsq1); // 2nd bond @@ -171,7 +172,7 @@ void AngleCharmmIntel::eval(const int vflag, const flt_t delz2 = x[i3].z - x[i2].z; const flt_t rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2; - const flt_t r2 = sqrt(rsq2); + ir12 *= (flt_t)1.0/sqrt(rsq2); // Urey-Bradley bond @@ -180,15 +181,15 @@ void AngleCharmmIntel::eval(const int vflag, const flt_t delzUB = x[i3].z - x[i1].z; const flt_t rsqUB = delxUB*delxUB + delyUB*delyUB + delzUB*delzUB; - const flt_t rUB = sqrt(rsqUB); + const flt_t irUB = (flt_t)1.0/sqrt(rsqUB); // Urey-Bradley force & energy - const flt_t dr = rUB - fc.fc[type].r_ub; + const flt_t dr = (flt_t)1.0/irUB - fc.fc[type].r_ub; const flt_t rk = fc.fc[type].k_ub * dr; flt_t forceUB; - if (rUB > (flt_t)0.0) forceUB = (flt_t)-2.0*rk/rUB; + if (rsqUB > (flt_t)0.0) forceUB = (flt_t)-2.0*rk*irUB; else forceUB = 0.0; flt_t eangle; @@ -197,14 +198,14 @@ void AngleCharmmIntel::eval(const int vflag, // angle (cos and sin) flt_t c = delx1*delx2 + dely1*dely2 + delz1*delz2; - c /= r1*r2; + c *= ir12; if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; - flt_t s = sqrt((flt_t)1.0 - c*c); - if (s < SMALL) s = SMALL; - s = (flt_t)1.0/s; + const flt_t sd = (flt_t)1.0 - c * c; + flt_t s = (flt_t)1.0 / sqrt(sd); + if (sd < SMALL2) s = INVSMALL; // harmonic force & energy @@ -215,7 +216,7 @@ void AngleCharmmIntel::eval(const int vflag, const flt_t a = (flt_t)-2.0 * tk * s; const flt_t a11 = a*c / rsq1; - const flt_t a12 = -a / (r1*r2); + const flt_t a12 = -a * ir12; const flt_t a22 = a*c / rsq2; const flt_t f1x = a11*delx1 + a12*delx2 - delxUB*forceUB; @@ -303,7 +304,7 @@ template void AngleCharmmIntel::pack_force_const(ForceConst &fc, IntelBuffers *buffers) { - const int bp1 = atom->ndihedraltypes + 1; + const int bp1 = atom->nangletypes + 1; fc.set_ntypes(bp1,memory); for (int i = 0; i < bp1; i++) { diff --git a/src/USER-INTEL/angle_harmonic_intel.cpp b/src/USER-INTEL/angle_harmonic_intel.cpp new file mode 100644 index 0000000000..f101fd9e1f --- /dev/null +++ b/src/USER-INTEL/angle_harmonic_intel.cpp @@ -0,0 +1,312 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include +#include +#include "angle_harmonic_intel.h" +#include "atom.h" +#include "neighbor.h" +#include "domain.h" +#include "comm.h" +#include "force.h" +#include "math_const.h" +#include "memory.h" +#include "suffix.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace MathConst; + +#define SMALL2 (flt_t)0.000001 +#define INVSMALL (flt_t)1000.0 +typedef struct { int a,b,c,t; } int4_t; + +/* ---------------------------------------------------------------------- */ + +AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +AngleHarmonicIntel::~AngleHarmonicIntel() +{ +} + +/* ---------------------------------------------------------------------- */ + +void AngleHarmonicIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + AngleHarmonic::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void AngleHarmonicIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = 0; + + if (evflag) { + if (eflag) { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,0,1>(vflag, buffers, fc); + else + eval<1,0,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void AngleHarmonicIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) + +{ + const int inum = neighbor->nanglelist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + if (EFLAG) + oeangle = (acc_t)0.0; + if (vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, nto, tid; + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int4_t * _noalias const anglelist = + (int4_t *) neighbor->anglelist[0]; + + for (int n = nfrom; n < nto; n++) { + const int i1 = anglelist[n].a; + const int i2 = anglelist[n].b; + const int i3 = anglelist[n].c; + const int type = anglelist[n].t; + + // 1st bond + + const flt_t delx1 = x[i1].x - x[i2].x; + const flt_t dely1 = x[i1].y - x[i2].y; + const flt_t delz1 = x[i1].z - x[i2].z; + + const flt_t rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1; + const flt_t r1 = (flt_t)1.0/sqrt(rsq1); + + // 2nd bond + + const flt_t delx2 = x[i3].x - x[i2].x; + const flt_t dely2 = x[i3].y - x[i2].y; + const flt_t delz2 = x[i3].z - x[i2].z; + + const flt_t rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2; + const flt_t r2 = (flt_t)1.0/sqrt(rsq2); + + // angle (cos and sin) + + flt_t c = delx1*delx2 + dely1*dely2 + delz1*delz2; + const flt_t r1r2 = r1 * r2; + c *= r1r2; + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + const flt_t sd = (flt_t)1.0 - c * c; + flt_t s = (flt_t)1.0/sqrt(sd); + if (sd < SMALL2) s = INVSMALL; + + // harmonic force & energy + + const flt_t dtheta = acos(c) - fc.fc[type].theta0; + const flt_t tk = fc.fc[type].k * dtheta; + + flt_t eangle; + if (EFLAG) eangle = tk*dtheta; + + const flt_t a = (flt_t)-2.0 * tk * s; + const flt_t ac = a*c; + const flt_t a11 = ac / rsq1; + const flt_t a12 = -a * (r1r2); + const flt_t a22 = ac / rsq2; + + const flt_t f1x = a11*delx1 + a12*delx2; + const flt_t f1y = a11*dely1 + a12*dely2; + const flt_t f1z = a11*delz1 + a12*delz2; + + const flt_t f3x = a22*delx2 + a12*delx1; + const flt_t f3y = a22*dely2 + a12*dely1; + const flt_t f3z = a22*delz2 + a12*delz1; + + // apply force to each of 3 atoms + + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= f1x + f3x; + f[i2].y -= f1y + f3y; + f[i2].z -= f1z + f3z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (EVFLAG) { + IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x, + f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1, + delx2, dely2, delz2, oeangle, f, NEWTON_BOND, + nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + } + } // for n + } // omp parallel + + if (EVFLAG) { + if (EFLAG) + energy += oeangle; + if (vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void AngleHarmonicIntel::init_style() +{ + AngleHarmonic::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void AngleHarmonicIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->nangletypes + 1; + fc.set_ntypes(bp1,memory); + + for (int i = 0; i < bp1; i++) { + fc.fc[i].k = k[i]; + fc.fc[i].theta0 = theta0[i]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void AngleHarmonicIntel::ForceConst::set_ntypes(const int nangletypes, + Memory *memory) { + if (nangletypes != _nangletypes) { + if (_nangletypes > 0) + _memory->destroy(fc); + + if (nangletypes > 0) + _memory->create(fc,nangletypes,"anglecharmmintel.fc"); + } + _nangletypes = nangletypes; + _memory = memory; +} diff --git a/src/USER-INTEL/angle_harmonic_intel.h b/src/USER-INTEL/angle_harmonic_intel.h new file mode 100644 index 0000000000..340ea4b974 --- /dev/null +++ b/src/USER-INTEL/angle_harmonic_intel.h @@ -0,0 +1,84 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef ANGLE_CLASS + +AngleStyle(harmonic/intel,AngleHarmonicIntel) + +#else + +#ifndef LMP_ANGLE_HARMONIC_INTEL_H +#define LMP_ANGLE_HARMONIC_INTEL_H + +#include +#include "angle_harmonic.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class AngleHarmonicIntel : public AngleHarmonic { + public: + AngleHarmonicIntel(class LAMMPS *); + virtual ~AngleHarmonicIntel(); + virtual void compute(int, int); + virtual void init_style(); + + protected: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t k, theta0; } fc_packed1; + + fc_packed1 *fc; + ForceConst() : _nangletypes(0) {} + ~ForceConst() { set_ntypes(0, NULL); } + + void set_ntypes(const int nangletypes, Memory *memory); + + private: + int _nangletypes; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/USER-INTEL/dihedral_harmonic_intel.cpp b/src/USER-INTEL/dihedral_harmonic_intel.cpp new file mode 100644 index 0000000000..aa6ba9d1df --- /dev/null +++ b/src/USER-INTEL/dihedral_harmonic_intel.cpp @@ -0,0 +1,411 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "math.h" +#include "dihedral_harmonic_intel.h" +#include "atom.h" +#include "comm.h" +#include "memory.h" +#include "neighbor.h" +#include "domain.h" +#include "force.h" +#include "pair.h" +#include "update.h" +#include "error.h" + +#include "suffix.h" +using namespace LAMMPS_NS; + +#define PTOLERANCE (flt_t)1.05 +#define MTOLERANCE (flt_t)-1.05 +typedef struct { int a,b,c,d,t; } int5_t; + +/* ---------------------------------------------------------------------- */ + +DihedralHarmonicIntel::DihedralHarmonicIntel(class LAMMPS *lmp) + : DihedralHarmonic(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +void DihedralHarmonicIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + DihedralHarmonic::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralHarmonicIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag,vflag); + } else evflag = 0; + + if (evflag) { + if (eflag) { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,0,1>(vflag, buffers, fc); + else + eval<1,0,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +template +void DihedralHarmonicIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) + +{ + const int inum = neighbor->ndihedrallist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + if (EFLAG) + oedihedral = (acc_t)0.0; + if (vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, nto, tid; + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int5_t * _noalias const dihedrallist = + (int5_t *) neighbor->dihedrallist[0]; + + acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; + if (EVFLAG) { + if (EFLAG) + sedihedral = (acc_t)0.0; + if (vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + } + + for (int n = nfrom; n < nto; n++) { + const int i1 = dihedrallist[n].a; + const int i2 = dihedrallist[n].b; + const int i3 = dihedrallist[n].c; + const int i4 = dihedrallist[n].d; + const int type = dihedrallist[n].t; + + // 1st bond + + const flt_t vb1x = x[i1].x - x[i2].x; + const flt_t vb1y = x[i1].y - x[i2].y; + const flt_t vb1z = x[i1].z - x[i2].z; + + // 2nd bond + + const flt_t vb2xm = x[i2].x - x[i3].x; + const flt_t vb2ym = x[i2].y - x[i3].y; + const flt_t vb2zm = x[i2].z - x[i3].z; + + // 3rd bond + + const flt_t vb3x = x[i4].x - x[i3].x; + const flt_t vb3y = x[i4].y - x[i3].y; + const flt_t vb3z = x[i4].z - x[i3].z; + + // c,s calculation + + const flt_t ax = vb1y*vb2zm - vb1z*vb2ym; + const flt_t ay = vb1z*vb2xm - vb1x*vb2zm; + const flt_t az = vb1x*vb2ym - vb1y*vb2xm; + const flt_t bx = vb3y*vb2zm - vb3z*vb2ym; + const flt_t by = vb3z*vb2xm - vb3x*vb2zm; + const flt_t bz = vb3x*vb2ym - vb3y*vb2xm; + + const flt_t rasq = ax*ax + ay*ay + az*az; + const flt_t rbsq = bx*bx + by*by + bz*bz; + const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm; + const flt_t rg = sqrt(rgsq); + + flt_t rginv, ra2inv, rb2inv; + rginv = ra2inv = rb2inv = (flt_t)0.0; + if (rg > 0) rginv = (flt_t)1.0/rg; + if (rasq > 0) ra2inv = (flt_t)1.0/rasq; + if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq; + const flt_t rabinv = sqrt(ra2inv*rb2inv); + + flt_t c = (ax*bx + ay*by + az*bz)*rabinv; + const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z); + + // error check + if (c > PTOLERANCE || c < MTOLERANCE) { + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } + } + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + const flt_t tcos_shift = fc.bp[type].cos_shift; + const flt_t tsin_shift = fc.bp[type].sin_shift; + const flt_t tk = fc.bp[type].k; + const int m = fc.bp[type].multiplicity; + + flt_t p = (flt_t)1.0; + flt_t ddf1, df1; + ddf1 = df1 = (flt_t)0.0; + + for (int i = 0; i < m; i++) { + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; + } + + p = p*tcos_shift + df1*tsin_shift; + df1 = df1*tcos_shift - ddf1*tsin_shift; + df1 *= -m; + p += (flt_t)1.0; + + if (m == 0) { + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; + } + + const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; + const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm; + const flt_t fga = fg*ra2inv*rginv; + const flt_t hgb = hg*rb2inv*rginv; + const flt_t gaa = -ra2inv*rg; + const flt_t gbb = rb2inv*rg; + + const flt_t dtfx = gaa*ax; + const flt_t dtfy = gaa*ay; + const flt_t dtfz = gaa*az; + const flt_t dtgx = fga*ax - hgb*bx; + const flt_t dtgy = fga*ay - hgb*by; + const flt_t dtgz = fga*az - hgb*bz; + const flt_t dthx = gbb*bx; + const flt_t dthy = gbb*by; + const flt_t dthz = gbb*bz; + + const flt_t df = -tk * df1; + + const flt_t sx2 = df*dtgx; + const flt_t sy2 = df*dtgy; + const flt_t sz2 = df*dtgz; + + flt_t f1x = df*dtfx; + flt_t f1y = df*dtfy; + flt_t f1z = df*dtfz; + + const flt_t f2x = sx2 - f1x; + const flt_t f2y = sy2 - f1y; + const flt_t f2z = sz2 - f1z; + + flt_t f4x = df*dthx; + flt_t f4y = df*dthy; + flt_t f4z = df*dthz; + + const flt_t f3x = -sx2 - f4x; + const flt_t f3y = -sy2 - f4y; + const flt_t f3z = -sz2 - f4z; + + if (EVFLAG) { + flt_t deng; + if (EFLAG) deng = tk * p; + IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, + f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, + vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, + vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + } + + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } + } + } // for n + if (EVFLAG) { + if (EFLAG) oedihedral += sedihedral; + if (vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + } + } // omp parallel + + if (EVFLAG) { + if (EFLAG) energy += oedihedral; + if (vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void DihedralHarmonicIntel::init_style() +{ + DihedralHarmonic::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralHarmonicIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->ndihedraltypes + 1; + fc.set_ntypes(bp1,memory); + + for (int i = 0; i < bp1; i++) { + fc.bp[i].multiplicity = multiplicity[i]; + fc.bp[i].cos_shift = cos_shift[i]; + fc.bp[i].sin_shift = sin_shift[i]; + fc.bp[i].k = k[i]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralHarmonicIntel::ForceConst::set_ntypes(const int nbondtypes, + Memory *memory) { + if (nbondtypes != _nbondtypes) { + if (_nbondtypes > 0) + _memory->destroy(bp); + + if (nbondtypes > 0) + _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); + } + _nbondtypes = nbondtypes; + _memory = memory; +} diff --git a/src/USER-INTEL/dihedral_harmonic_intel.h b/src/USER-INTEL/dihedral_harmonic_intel.h new file mode 100644 index 0000000000..41e3d20540 --- /dev/null +++ b/src/USER-INTEL/dihedral_harmonic_intel.h @@ -0,0 +1,81 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef DIHEDRAL_CLASS + +DihedralStyle(harmonic/intel,DihedralHarmonicIntel) + +#else + +#ifndef LMP_DIHEDRAL_HARMONIC_INTEL_H +#define LMP_DIHEDRAL_HARMONIC_INTEL_H + +#include "dihedral_harmonic.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class DihedralHarmonicIntel : public DihedralHarmonic { + + public: + DihedralHarmonicIntel(class LAMMPS *lmp); + virtual void compute(int, int); + void init_style(); + + private: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t cos_shift, sin_shift, k; + int multiplicity; } fc_packed1; + + fc_packed1 *bp; + + ForceConst() : _nbondtypes(0) {} + ~ForceConst() { set_ntypes(0, NULL); } + + void set_ntypes(const int nbondtypes, Memory *memory); + + private: + int _nbondtypes; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp new file mode 100644 index 0000000000..1edb52f056 --- /dev/null +++ b/src/USER-INTEL/dihedral_opls_intel.cpp @@ -0,0 +1,438 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "math.h" +#include "dihedral_opls_intel.h" +#include "atom.h" +#include "comm.h" +#include "memory.h" +#include "neighbor.h" +#include "domain.h" +#include "force.h" +#include "pair.h" +#include "update.h" +#include "error.h" + +#include "suffix.h" +using namespace LAMMPS_NS; + +#define PTOLERANCE (flt_t)1.05 +#define MTOLERANCE (flt_t)-1.05 +#define SMALL2 (flt_t)0.000001 +#define INVSMALL (flt_t)1000.0 +#define SMALLER2 (flt_t)0.0000000001 +#define INVSMALLER (flt_t)100000.0 +typedef struct { int a,b,c,d,t; } int5_t; + +/* ---------------------------------------------------------------------- */ + +DihedralOPLSIntel::DihedralOPLSIntel(class LAMMPS *lmp) + : DihedralOPLS(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +void DihedralOPLSIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + DihedralOPLS::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralOPLSIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag,vflag); + } else evflag = 0; + + if (evflag) { + if (eflag) { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,0,1>(vflag, buffers, fc); + else + eval<1,0,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +template +void DihedralOPLSIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) + +{ + const int inum = neighbor->ndihedrallist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + if (EFLAG) + oedihedral = (acc_t)0.0; + if (vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, nto, tid; + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int5_t * _noalias const dihedrallist = + (int5_t *) neighbor->dihedrallist[0]; + + acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; + if (EVFLAG) { + if (EFLAG) + sedihedral = (acc_t)0.0; + if (vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + } + + for (int n = nfrom; n < nto; n++) { + const int i1 = dihedrallist[n].a; + const int i2 = dihedrallist[n].b; + const int i3 = dihedrallist[n].c; + const int i4 = dihedrallist[n].d; + const int type = dihedrallist[n].t; + + // 1st bond + + const flt_t vb1x = x[i1].x - x[i2].x; + const flt_t vb1y = x[i1].y - x[i2].y; + const flt_t vb1z = x[i1].z - x[i2].z; + + // 2nd bond + + const flt_t vb2xm = x[i2].x - x[i3].x; + const flt_t vb2ym = x[i2].y - x[i3].y; + const flt_t vb2zm = x[i2].z - x[i3].z; + + // 3rd bond + + const flt_t vb3x = x[i4].x - x[i3].x; + const flt_t vb3y = x[i4].y - x[i3].y; + const flt_t vb3z = x[i4].z - x[i3].z; + + // 1-4 + + const flt_t delx = x[i1].x - x[i4].x; + const flt_t dely = x[i1].y - x[i4].y; + const flt_t delz = x[i1].z - x[i4].z; + + + // c0 calculation + // 1st and 2nd angle + + const flt_t b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z; + const flt_t rb1 = (flt_t)1.0 / sqrt(b1mag2); + const flt_t sb1 = (flt_t)1.0 / b1mag2; + + const flt_t b2mag2 = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm; + const flt_t rb2 = (flt_t)1.0 / sqrt(b2mag2); + const flt_t sb2 = (flt_t)1.0 / b2mag2; + + const flt_t b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z; + const flt_t rb3 = (flt_t)1.0 / sqrt(b3mag2); + const flt_t sb3 = (flt_t)1.0 / b3mag2; + + const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3; + + flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm; + const flt_t r12c1 = rb1 * rb2; + const flt_t c1mag = ctmp * r12c1; + + ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z; + const flt_t r12c2 = rb2 * rb3; + const flt_t c2mag = ctmp * r12c2; + + // cos and sin of 2 angles and final c + + flt_t sin2 = MAX((flt_t)1.0 - c1mag*c1mag,(flt_t)0.0); + flt_t sc1 = (flt_t)1.0/sqrt(sin2); + if (sin2 < SMALL2) sc1 = INVSMALL; + + sin2 = MAX((flt_t)1.0 - c2mag*c2mag,(flt_t)0.0); + flt_t sc2 = (flt_t)1.0/sqrt(sin2); + if (sin2 < SMALL2) sc2 = INVSMALL; + + const flt_t s1 = sc1 * sc1; + const flt_t s2 = sc2 * sc2; + flt_t s12 = sc1 * sc2; + flt_t c = (c0 + c1mag*c2mag) * s12; + + const flt_t cx = vb1z*vb2ym - vb1y*vb2zm; + const flt_t cy = vb1x*vb2zm - vb1z*vb2xm; + const flt_t cz = vb1y*vb2xm - vb1x*vb2ym; + const flt_t cmag = (flt_t)1.0/sqrt(cx*cx + cy*cy + cz*cz); + const flt_t dx = (cx*vb3x + cy*vb3y + cz*vb3z)*cmag*rb3; + + // error check + if (c > PTOLERANCE || c < MTOLERANCE) { + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } + } + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + // force & energy + // p = sum (i=1,4) k_i * (1 + (-1)**(i+1)*cos(i*phi) ) + // pd = dp/dc + + const flt_t cossq = c * c; + const flt_t sinsq = (flt_t)1.0 - cossq; + flt_t siinv = (flt_t)1.0/sqrt(sinsq); + if (sinsq < SMALLER2 ) siinv = INVSMALLER; + if (dx < (flt_t)0.0) siinv = -siinv; + + const flt_t cos_2phi = cossq - sinsq; + const flt_t sin_2phim = (flt_t)2.0 * c; + const flt_t cos_3phi = (flt_t)2.0 * c * cos_2phi - c; + const flt_t sin_3phim = (flt_t)2.0 * cos_2phi + (flt_t)1.0; + const flt_t cos_4phi = (flt_t)2.0 * cos_2phi * cos_2phi - (flt_t)1.0; + const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim; + + flt_t p, pd; + p = fc.bp[type].k1*((flt_t)1.0 + c) + + fc.bp[type].k2*((flt_t)1.0 - cos_2phi) + + fc.bp[type].k3*((flt_t)1.0 + cos_3phi) + + fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ; + pd = fc.bp[type].k1 - + (flt_t)2.0 * fc.bp[type].k2 * sin_2phim + + (flt_t)3.0 * fc.bp[type].k3 * sin_3phim - + (flt_t)4.0 * fc.bp[type].k4 * sin_4phim; + + flt_t edihed; + if (EFLAG) edihed = p; + + const flt_t a = pd; + c = c * a; + s12 = s12 * a; + const flt_t a11 = c*sb1*s1; + const flt_t a22 = -sb2 * ((flt_t)2.0*c0*s12 - c*(s1+s2)); + const flt_t a33 = c*sb3*s2; + const flt_t a12 = -r12c1 * (c1mag*c*s1 + c2mag*s12); + const flt_t a13 = -rb1*rb3*s12; + const flt_t a23 = r12c2 * (c2mag*c*s2 + c1mag*s12); + + const flt_t sx2 = a12*vb1x - a22*vb2xm + a23*vb3x; + const flt_t sy2 = a12*vb1y - a22*vb2ym + a23*vb3y; + const flt_t sz2 = a12*vb1z - a22*vb2zm + a23*vb3z; + + const flt_t f1x = a11*vb1x - a12*vb2xm + a13*vb3x; + const flt_t f1y = a11*vb1y - a12*vb2ym + a13*vb3y; + const flt_t f1z = a11*vb1z - a12*vb2zm + a13*vb3z; + + const flt_t f2x = -sx2 - f1x; + const flt_t f2y = -sy2 - f1y; + const flt_t f2z = -sz2 - f1z; + + const flt_t f4x = a13*vb1x - a23*vb2xm + a33*vb3x; + const flt_t f4y = a13*vb1y - a23*vb2ym + a33*vb3y; + const flt_t f4z = a13*vb1z - a23*vb2zm + a33*vb3z; + + const flt_t f3x = sx2 - f4x; + const flt_t f3y = sy2 - f4y; + const flt_t f3z = sz2 - f4z; + + if (EVFLAG) { + IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, edihed, i1, i2, i3, i4, f1x, + f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, + vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, + vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + } + + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } + } + } // for n + if (EVFLAG) { + if (EFLAG) oedihedral += sedihedral; + if (vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + } + } // omp parallel + + if (EVFLAG) { + if (EFLAG) energy += oedihedral; + if (vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void DihedralOPLSIntel::init_style() +{ + DihedralOPLS::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralOPLSIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->ndihedraltypes + 1; + fc.set_ntypes(bp1,memory); + + for (int i = 0; i < bp1; i++) { + fc.bp[i].k1 = k1[i]; + fc.bp[i].k2 = k2[i]; + fc.bp[i].k3 = k3[i]; + fc.bp[i].k4 = k4[i]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralOPLSIntel::ForceConst::set_ntypes(const int nbondtypes, + Memory *memory) { + if (nbondtypes != _nbondtypes) { + if (_nbondtypes > 0) + _memory->destroy(bp); + + if (nbondtypes > 0) + _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); + } + _nbondtypes = nbondtypes; + _memory = memory; +} diff --git a/src/USER-INTEL/dihedral_opls_intel.h b/src/USER-INTEL/dihedral_opls_intel.h new file mode 100644 index 0000000000..ea0930f4b8 --- /dev/null +++ b/src/USER-INTEL/dihedral_opls_intel.h @@ -0,0 +1,80 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef DIHEDRAL_CLASS + +DihedralStyle(opls/intel,DihedralOPLSIntel) + +#else + +#ifndef LMP_DIHEDRAL_OPLS_INTEL_H +#define LMP_DIHEDRAL_OPLS_INTEL_H + +#include "dihedral_opls.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class DihedralOPLSIntel : public DihedralOPLS { + + public: + DihedralOPLSIntel(class LAMMPS *lmp); + virtual void compute(int, int); + void init_style(); + + private: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t k1, k2, k3, k4; } fc_packed1; + + fc_packed1 *bp; + + ForceConst() : _nbondtypes(0) {} + ~ForceConst() { set_ntypes(0, NULL); } + + void set_ntypes(const int nbondtypes, Memory *memory); + + private: + int _nbondtypes; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/fix_nvt_intel.cpp b/src/USER-INTEL/fix_nvt_intel.cpp new file mode 100644 index 0000000000..65854b07a8 --- /dev/null +++ b/src/USER-INTEL/fix_nvt_intel.cpp @@ -0,0 +1,50 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include "fix_nvt_intel.h" +#include "group.h" +#include "modify.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +FixNVTIntel::FixNVTIntel(LAMMPS *lmp, int narg, char **arg) : + FixNHIntel(lmp, narg, arg) +{ + if (!tstat_flag) + error->all(FLERR,"Temperature control must be used with fix nvt"); + if (pstat_flag) + error->all(FLERR,"Pressure control can not be used with fix nvt"); + + // create a new compute temp style + // id = fix-ID + temp + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = group->names[igroup]; + newarg[2] = (char *) "temp"; + + modify->add_compute(3,newarg); + delete [] newarg; + tflag = 1; +} + diff --git a/src/USER-INTEL/fix_nvt_intel.h b/src/USER-INTEL/fix_nvt_intel.h new file mode 100644 index 0000000000..7e940191ca --- /dev/null +++ b/src/USER-INTEL/fix_nvt_intel.h @@ -0,0 +1,52 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nvt/intel,FixNVTIntel) + +#else + +#ifndef LMP_FIX_NVT_INTEL_H +#define LMP_FIX_NVT_INTEL_H + +#include "fix_nh_intel.h" + +namespace LAMMPS_NS { + +class FixNVTIntel : public FixNHIntel { + public: + FixNVTIntel(class LAMMPS *, int, char **); + ~FixNVTIntel() {} +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Temperature control must be used with fix nvt + +Self-explanatory. + +E: Pressure control can not be used with fix nvt + +Self-explanatory. + +*/ diff --git a/src/USER-INTEL/fix_nvt_sllod_intel.cpp b/src/USER-INTEL/fix_nvt_sllod_intel.cpp new file mode 100644 index 0000000000..5505abb9ce --- /dev/null +++ b/src/USER-INTEL/fix_nvt_sllod_intel.cpp @@ -0,0 +1,126 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include "fix_nvt_sllod_intel.h" +#include "math_extra.h" +#include "atom.h" +#include "domain.h" +#include "group.h" +#include "modify.h" +#include "fix.h" +#include "fix_deform.h" +#include "compute.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +enum{NO_REMAP,X_REMAP,V_REMAP}; // same as fix_deform.cpp + +/* ---------------------------------------------------------------------- */ + +FixNVTSllodIntel::FixNVTSllodIntel(LAMMPS *lmp, int narg, char **arg) : + FixNHIntel(lmp, narg, arg) +{ + if (!tstat_flag) + error->all(FLERR,"Temperature control must be used with fix nvt/sllod"); + if (pstat_flag) + error->all(FLERR,"Pressure control can not be used with fix nvt/sllod"); + + // default values + + if (mtchain_default_flag) mtchain = 1; + + + // create a new compute temp style + // id = fix-ID + temp + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = group->names[igroup]; + newarg[2] = (char *) "temp/deform"; + + modify->add_compute(3,newarg); + delete [] newarg; + tflag = 1; +} + +/* ---------------------------------------------------------------------- */ + +void FixNVTSllodIntel::init() +{ + FixNHIntel::init(); + + if (!temperature->tempbias) + error->all(FLERR,"Temperature for fix nvt/sllod does not have a bias"); + + nondeformbias = 0; + if (strcmp(temperature->style,"temp/deform") != 0) nondeformbias = 1; + + // check fix deform remap settings + + int i; + for (i = 0; i < modify->nfix; i++) + if (strcmp(modify->fix[i]->style,"deform") == 0) { + if (((FixDeform *) modify->fix[i])->remapflag != V_REMAP) + error->all(FLERR,"Using fix nvt/sllod with inconsistent fix deform " + "remap option"); + break; + } + if (i == modify->nfix) + error->all(FLERR,"Using fix nvt/sllod with no fix deform defined"); +} + +/* ---------------------------------------------------------------------- + perform half-step scaling of velocities +-----------------------------------------------------------------------*/ + +void FixNVTSllodIntel::nh_v_temp() +{ + // remove and restore bias = streaming velocity = Hrate*lamda + Hratelo + // thermostat thermal velocity only + // vdelu = SLLOD correction = Hrate*Hinv*vthermal + // for non temp/deform BIAS: + // calculate temperature since some computes require temp + // computed on current nlocal atoms to remove bias + + if (nondeformbias) temperature->compute_scalar(); + + double **v = atom->v; + int *mask = atom->mask; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + double h_two[6],vdelu[3]; + MathExtra::multiply_shape_shape(domain->h_rate,domain->h_inv,h_two); + + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + vdelu[0] = h_two[0]*v[i][0] + h_two[5]*v[i][1] + h_two[4]*v[i][2]; + vdelu[1] = h_two[1]*v[i][1] + h_two[3]*v[i][2]; + vdelu[2] = h_two[2]*v[i][2]; + temperature->remove_bias(i,v[i]); + v[i][0] = v[i][0]*factor_eta - dthalf*vdelu[0]; + v[i][1] = v[i][1]*factor_eta - dthalf*vdelu[1]; + v[i][2] = v[i][2]*factor_eta - dthalf*vdelu[2]; + temperature->restore_bias(i,v[i]); + } + } +} diff --git a/src/USER-INTEL/fix_nvt_sllod_intel.h b/src/USER-INTEL/fix_nvt_sllod_intel.h new file mode 100644 index 0000000000..81552aa34a --- /dev/null +++ b/src/USER-INTEL/fix_nvt_sllod_intel.h @@ -0,0 +1,72 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nvt/sllod/intel,FixNVTSllodIntel) + +#else + +#ifndef LMP_FIX_NVTSLLOD_INTEL_H +#define LMP_FIX_NVTSLLOD_INTEL_H + +#include "fix_nh_intel.h" + +namespace LAMMPS_NS { + +class FixNVTSllodIntel : public FixNHIntel { + public: + FixNVTSllodIntel(class LAMMPS *, int, char **); + ~FixNVTSllodIntel() {} + void init(); + + private: + int nondeformbias; + + void nh_v_temp(); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Temperature control must be used with fix nvt/sllod + +Self-explanatory. + +E: Pressure control can not be used with fix nvt/sllod + +Self-explanatory. + +E: Temperature for fix nvt/sllod does not have a bias + +The specified compute must compute temperature with a bias. + +E: Using fix nvt/sllod with inconsistent fix deform remap option + +Fix nvt/sllod requires that deforming atoms have a velocity profile +provided by "remap v" as a fix deform option. + +E: Using fix nvt/sllod with no fix deform defined + +Self-explanatory. + +*/ + diff --git a/src/USER-INTEL/improper_cvff_intel.cpp b/src/USER-INTEL/improper_cvff_intel.cpp new file mode 100644 index 0000000000..6c0735a755 --- /dev/null +++ b/src/USER-INTEL/improper_cvff_intel.cpp @@ -0,0 +1,425 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "improper_cvff_intel.h" +#include "atom.h" +#include "comm.h" +#include "neighbor.h" +#include "domain.h" +#include "force.h" +#include "update.h" +#include "math_const.h" +#include "memory.h" +#include "modify.h" +#include "suffix.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace MathConst; + +#define PTOLERANCE (flt_t)1.05 +#define MTOLERANCE (flt_t)-1.05 +#define SMALL2 (flt_t)0.000001 +#define INVSMALL (flt_t)1000.0 +typedef struct { int a,b,c,d,t; } int5_t; + +/* ---------------------------------------------------------------------- */ + +ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) : + ImproperCvff(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +ImproperCvffIntel::~ImproperCvffIntel() +{ +} + +/* ---------------------------------------------------------------------- */ + +void ImproperCvffIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + ImproperCvff::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void ImproperCvffIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = 0; + + if (evflag) { + if (eflag) { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,0,1>(vflag, buffers, fc); + else + eval<1,0,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void ImproperCvffIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + const int inum = neighbor->nimproperlist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + if (EFLAG) + oeimproper = (acc_t)0.0; + if (vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, nto, tid; + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int5_t * _noalias const improperlist = + (int5_t *) neighbor->improperlist[0]; + + for (int n = nfrom; n < nto; n++) { + const int i1 = improperlist[n].a; + const int i2 = improperlist[n].b; + const int i3 = improperlist[n].c; + const int i4 = improperlist[n].d; + const int type = improperlist[n].t; + + // geometry of 4-body + + const flt_t vb1x = x[i1].x - x[i2].x; + const flt_t vb1y = x[i1].y - x[i2].y; + const flt_t vb1z = x[i1].z - x[i2].z; + + const flt_t vb2xm = x[i2].x - x[i3].x; + const flt_t vb2ym = x[i2].y - x[i3].y; + const flt_t vb2zm = x[i2].z - x[i3].z; + + const flt_t vb3x = x[i4].x - x[i3].x; + const flt_t vb3y = x[i4].y - x[i3].y; + const flt_t vb3z = x[i4].z - x[i3].z; + + // 1st and 2nd angle + + const flt_t b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z; + const flt_t rb1 = (flt_t)1.0 / sqrt(b1mag2); + const flt_t sb1 = (flt_t)1.0 / b1mag2; + + const flt_t b2mag2 = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm; + const flt_t rb2 = (flt_t)1.0 / sqrt(b2mag2); + const flt_t sb2 = (flt_t)1.0 / b2mag2; + + const flt_t b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z; + const flt_t rb3 = (flt_t)1.0 / sqrt(b3mag2); + const flt_t sb3 = (flt_t)1.0 / b3mag2; + + const flt_t c0 = (vb1x * vb3x + vb1y * vb3y + vb1z * vb3z) * rb1 * rb3; + + flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm; + const flt_t r12c1 = rb1 * rb2; + const flt_t c1mag = ctmp * r12c1; + + ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z; + const flt_t r12c2 = rb2 * rb3; + const flt_t c2mag = ctmp * r12c2; + + // cos and sin of 2 angles and final c + + const flt_t sd1 = (flt_t)1.0 - c1mag * c1mag; + flt_t sc1 = (flt_t)1.0/sqrt(sd1); + if (sd1 < SMALL2) sc1 = INVSMALL; + + const flt_t sd2 = (flt_t)1.0 - c2mag * c2mag; + flt_t sc2 = (flt_t)1.0/sqrt(sd2); + if (sc2 < SMALL2) sc2 = INVSMALL; + + const flt_t s1 = sc1 * sc1; + const flt_t s2 = sc2 * sc2; + flt_t s12 = sc1 * sc2; + flt_t c = (c0 + c1mag*c2mag) * s12; + + // error check + + if (c > PTOLERANCE || c < MTOLERANCE) { + int me; + MPI_Comm_rank(world,&me); + if (screen) { + char str[128]; + sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } + } + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + // force & energy + // p = 1 + cos(n*phi) for d = 1 + // p = 1 - cos(n*phi) for d = -1 + // pd = dp/dc / 2 + + const int m = fc.fc[type].multiplicity; + + flt_t p, pd; + if (m == 2) { + p = (flt_t)2.0*c*c; + pd = (flt_t)2.0*c; + } else if (m == 3) { + const flt_t rc2 = c*c; + p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0; + pd = (flt_t)6.0*rc2 - (flt_t)1.5; + } else if (m == 4) { + const flt_t rc2 = c*c; + p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0; + pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c; + } else if (m == 6) { + const flt_t rc2 = c*c; + p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2; + pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c; + } else if (m == 1) { + p = c + (flt_t)1.0; + pd = (flt_t)0.5; + } else if (m == 5) { + const flt_t rc2 = c*c; + p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0; + pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5; + } else if (m == 0) { + p = (flt_t)2.0; + pd = (flt_t)0.0; + } + + if (fc.fc[type].sign == -1) { + p = (flt_t)2.0 - p; + pd = -pd; + } + + flt_t eimproper; + if (EFLAG) eimproper = fc.fc[type].k * p; + + const flt_t a = (flt_t)2.0 * fc.fc[type].k * pd; + c = c * a; + s12 = s12 * a; + const flt_t a11 = c*sb1*s1; + const flt_t a22 = -sb2*((flt_t)2.0*c0*s12 - c*(s1+s2)); + const flt_t a33 = c*sb3*s2; + const flt_t a12 = -r12c1*(c1mag*c*s1 + c2mag*s12); + const flt_t a13 = -rb1*rb3*s12; + const flt_t a23 = r12c2*(c2mag*c*s2 + c1mag*s12); + + const flt_t sx2 = a12*vb1x - a22*vb2xm + a23*vb3x; + const flt_t sy2 = a12*vb1y - a22*vb2ym + a23*vb3y; + const flt_t sz2 = a12*vb1z - a22*vb2zm + a23*vb3z; + + const flt_t f1x = a11*vb1x - a12*vb2xm + a13*vb3x; + const flt_t f1y = a11*vb1y - a12*vb2ym + a13*vb3y; + const flt_t f1z = a11*vb1z - a12*vb2zm + a13*vb3z; + + const flt_t f2x = -sx2 - f1x; + const flt_t f2y = -sy2 - f1y; + const flt_t f2z = -sz2 - f1z; + + const flt_t f4x = a13*vb1x - a23*vb2xm + a33*vb3x; + const flt_t f4y = a13*vb1y - a23*vb2ym + a33*vb3y; + const flt_t f4z = a13*vb1z - a23*vb2zm + a33*vb3z; + + const flt_t f3x = sx2 - f4x; + const flt_t f3y = sy2 - f4y; + const flt_t f3z = sz2 - f4z; + + // apply force to each of 4 atoms + + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } + + if (EVFLAG) { + IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oeimproper, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + } + } // for n + } // omp parallel + if (EVFLAG) { + if (EFLAG) + energy += oeimproper; + if (vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void ImproperCvffIntel::init() +{ + ImproperCvff::init(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void ImproperCvffIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->nimpropertypes + 1; + fc.set_ntypes(bp1,memory); + + for (int i = 0; i < bp1; i++) { + fc.fc[i].k = k[i]; + fc.fc[i].sign = sign[i]; + fc.fc[i].multiplicity = multiplicity[i]; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void ImproperCvffIntel::ForceConst::set_ntypes(const int nimproper, + Memory *memory) { + if (nimproper != _nimpropertypes) { + if (_nimpropertypes > 0) + _memory->destroy(fc); + + if (nimproper > 0) + _memory->create(fc,nimproper,"improperharmonicintel.fc"); + } + _nimpropertypes = nimproper; + _memory = memory; +} diff --git a/src/USER-INTEL/improper_cvff_intel.h b/src/USER-INTEL/improper_cvff_intel.h new file mode 100644 index 0000000000..154bb5bd18 --- /dev/null +++ b/src/USER-INTEL/improper_cvff_intel.h @@ -0,0 +1,90 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef IMPROPER_CLASS + +ImproperStyle(cvff/intel,ImproperCvffIntel) + +#else + +#ifndef LMP_IMPROPER_CVFF_INTEL_H +#define LMP_IMPROPER_CVFF_INTEL_H + +#include +#include "improper_cvff.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class ImproperCvffIntel : public ImproperCvff { + public: + ImproperCvffIntel(class LAMMPS *); + virtual ~ImproperCvffIntel(); + virtual void compute(int, int); + virtual void init(); + + protected: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t k; int sign, multiplicity; } fc_packed1; + + fc_packed1 *fc; + + ForceConst() : _nimpropertypes(0) {} + ~ForceConst() { set_ntypes(0, NULL); } + + void set_ntypes(const int nimpropertypes, Memory *memory); + + private: + int _nimpropertypes; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +W: Improper problem: %d %ld %d %d %d %d + +Conformation of the 4 listed improper atoms is extreme; you may want +to check your simulation geometry. + +*/ diff --git a/src/USER-INTEL/improper_harmonic_intel.cpp b/src/USER-INTEL/improper_harmonic_intel.cpp index 40ded62f5c..7693793f02 100644 --- a/src/USER-INTEL/improper_harmonic_intel.cpp +++ b/src/USER-INTEL/improper_harmonic_intel.cpp @@ -37,6 +37,8 @@ using namespace MathConst; #define PTOLERANCE (flt_t)1.05 #define MTOLERANCE (flt_t)-1.05 #define SMALL (flt_t)0.001 +#define SMALL2 (flt_t)0.000001 +#define INVSMALL (flt_t)1000.0 typedef struct { int a,b,c,d,t; } int5_t; /* ---------------------------------------------------------------------- */ @@ -175,13 +177,17 @@ void ImproperHarmonicIntel::eval(const int vflag, const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3z = x[i4].z - x[i3].z; - const flt_t ss1 = (flt_t)1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z); - const flt_t ss2 = (flt_t)1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z); - const flt_t ss3 = (flt_t)1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z); + flt_t ss1 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z; + flt_t ss2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z; + flt_t ss3 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z; - const flt_t r1 = sqrt(ss1); - const flt_t r2 = sqrt(ss2); - const flt_t r3 = sqrt(ss3); + const flt_t r1 = (flt_t)1.0 / sqrt(ss1); + const flt_t r2 = (flt_t)1.0 / sqrt(ss2); + const flt_t r3 = (flt_t)1.0 / sqrt(ss3); + + ss1 = (flt_t)1.0 / ss1; + ss2 = (flt_t)1.0 / ss2; + ss3 = (flt_t)1.0 / ss3; // sin and cos of angle @@ -191,13 +197,13 @@ void ImproperHarmonicIntel::eval(const int vflag, flt_t s1 = 1.0 - c1*c1; if (s1 < SMALL) s1 = SMALL; - s1 = (flt_t)1.0 / s1; flt_t s2 = (flt_t)1.0 - c2*c2; if (s2 < SMALL) s2 = SMALL; - s2 = (flt_t)1.0 / s2; - flt_t s12 = sqrt(s1*s2); + flt_t s12 = (flt_t)1.0 / sqrt(s1*s2); + s1 = (flt_t)1.0 / s1; + s2 = (flt_t)1.0 / s2; flt_t c = (c1*c2 + c0) * s12; // error check @@ -227,8 +233,9 @@ void ImproperHarmonicIntel::eval(const int vflag, if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; - flt_t s = sqrt((flt_t)1.0 - c*c); - if (s < SMALL) s = SMALL; + const flt_t sd = (flt_t)1.0 - c * c; + flt_t s = (flt_t)1.0 / sqrt(sd); + if (sd < SMALL2) s = INVSMALL; // force & energy @@ -239,7 +246,7 @@ void ImproperHarmonicIntel::eval(const int vflag, flt_t eimproper; if (EFLAG) eimproper = a*domega; - a = -a * (flt_t)2.0/s; + a = -a * (flt_t)2.0 * s; c = c * a; s12 = s12 * a; const flt_t a11 = c*ss1*s1; diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp index 6fa1d48e48..8a6e3983b1 100644 --- a/src/USER-INTEL/neigh_half_bin_intel.cpp +++ b/src/USER-INTEL/neigh_half_bin_intel.cpp @@ -1454,28 +1454,24 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; #endif - const int num = aend-astart; + const int num = aend - astart; int tid, ifrom, ito; - IP_PRE_omp_range_id(ifrom,ito,tid,num,nthreads); + IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); ifrom += astart; ito += astart; int which; - const int list_size = (ito + tid + 1) * maxnbors; - int ct = (ifrom + tid) * maxnbors; + const int list_size = (ito + tid * 2 + 2) * maxnbors; + int ct = (ifrom + tid * 2) * maxnbors; int *neighptr = firstneigh + ct; + const int obound = maxnbors * 3; + for (int i = ifrom; i < ito; i++) { - int j, k, n, n2, itype, jtype, ibin; - double xtmp, ytmp, ztmp, delx, dely, delz, rsq; - - n = 0; - n2 = maxnbors; - - xtmp = x[i].x; - ytmp = x[i].y; - ztmp = x[i].z; - itype = x[i].w; + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const int itype = x[i].w; const int ioffset = ntypes * itype; // loop over all atoms in bins in stencil @@ -1484,10 +1480,11 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, // (equal zyx and j <= i) // latter excludes self-self interaction but allows superposed atoms - ibin = atombin[i]; + const int ibin = atombin[i]; - for (k = 0; k < nstencil; k++) { - for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) { + int raw_count = maxnbors; + for (int k = 0; k < nstencil; k++) { + for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) { if (offload_noghost) { if (j < nlocal) { if (i < offload_end) continue; @@ -1503,62 +1500,93 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, } } - jtype = x[j].w; #ifndef _LMP_INTEL_OFFLOAD - if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; + if (exclude) { + const int jtype = x[j].w; + if (exclusion(i,j,itype,jtype,mask,molecule)) continue; + } #endif - delx = xtmp - x[j].x; - dely = ytmp - x[j].y; - delz = ztmp - x[j].z; - rsq = delx * delx + dely * dely + delz * delz; - if (rsq <= cutneighsq[ioffset + jtype]) { - if (j < nlocal) { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[n++] = -j - 1; - else - neighptr[n++] = j; - } else - neighptr[n++] = j; - #ifdef _LMP_INTEL_OFFLOAD - if (j < lmin) lmin = j; - if (j > lmax) lmax = j; - #endif - } else { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[n2++] = -j - 1; - else - neighptr[n2++] = j; - } else - neighptr[n2++] = j; - #ifdef _LMP_INTEL_OFFLOAD - if (j < gmin) gmin = j; - if (j > gmax) gmax = j; - #endif - } + neighptr[raw_count++] = j; + } + } + if (raw_count > obound) + *overflow = 1; + + #if defined(LMP_SIMD_COMPILER) + #ifdef _LMP_INTEL_OFFLOAD + int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; + #pragma vector aligned + #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) + #else + #pragma vector aligned + #pragma simd + #endif + #endif + for (int u = maxnbors; u < raw_count; u++) { + int j = neighptr[u]; + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + const int jtype = x[j].w; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + if (rsq > cutneighsq[ioffset + jtype]) + neighptr[u] = e_nall; + else { + if (need_ic) { + int no_special; + ominimum_image_check(no_special, delx, dely, delz); + if (no_special) + neighptr[u] = -j - 1; + } + + #ifdef _LMP_INTEL_OFFLOAD + if (j < nlocal) { + if (j < vlmin) vlmin = j; + if (j > vlmax) vlmax = j; + } else { + if (j < vgmin) vgmin = j; + if (j > vgmax) vgmax = j; } + #endif } } - ilist[i] = i; + int n = 0, n2 = maxnbors; + for (int u = maxnbors; u < raw_count; u++) { + const int j = neighptr[u]; + int pj = j; + if (pj < e_nall) { + if (need_ic) + if (pj < 0) pj = -pj - 1; + + if (pj < nlocal) + neighptr[n++] = j; + else + neighptr[n2++] = j; + } + } + int ns = n; + for (int u = maxnbors; u < n2; u++) + neighptr[n++] = neighptr[u]; + + ilist[i] = i; cnumneigh[i] = ct; - if (n > maxnbors) *overflow = 1; - for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k]; - while( (n % pad_width) != 0 ) neighptr[n++] = e_nall; - numneigh[i] = n; - while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++; - ct += n; - neighptr += n; - if (ct + n + maxnbors > list_size) { - *overflow = 1; - ct = (ifrom + tid) * maxnbors; - } + ns += n2 - maxnbors; + while( (ns % pad_width) != 0 ) neighptr[ns++] = e_nall; + numneigh[i] = ns; + + ct += ns; + const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); + const int edge = (ct % alignb); + if (edge) ct += alignb - edge; + neighptr = firstneigh + ct; + if (ct + obound > list_size) { + if (i < ito - 1) { + *overflow = 1; + ct = (ifrom + tid * 2) * maxnbors; + } + } } if (*overflow == 1) @@ -1597,6 +1625,10 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, for (int i = ifrom; i < ito; ++i) { int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj]; if (need_ic && j < 0) { diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp index 7f2cc654e5..98b926b28e 100644 --- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp @@ -284,9 +284,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, const flt_t delz = ztmp - x[j].z; const int jtype = x[j].w; const flt_t rsq = delx * delx + dely * dely + delz * delz; - const flt_t r = sqrt(rsq); - const flt_t r2inv = (flt_t)1.0 / rsq; + const flt_t r = (flt_t)1.0 / sqrt(r2inv); #ifdef INTEL_VMASK if (rsq < c_forcei[jtype].cutsq) { @@ -309,12 +308,12 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, const flt_t prefactor = qqrd2e * qtmp * q[j] / r; forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); if (EFLAG) ecoul = prefactor * erfc; - if (sbindex) { - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } + + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + #ifdef INTEL_ALLOW_TABLE } else { float rsq_lookup = rsq; diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp index 1f985a8d36..ecf2840ff3 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -303,7 +303,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, const flt_t EWALD_F = 1.12837917; const flt_t INV_EWALD_P = 1.0 / 0.3275911; - const flt_t r = sqrt(rsq); + const flt_t r = (flt_t)1.0 / sqrt(r2inv); const flt_t grij = g_ewald * r; const flt_t expm2 = exp(-grij * grij); const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); @@ -311,12 +311,12 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, const flt_t prefactor = qqrd2e * qtmp * q[j] / r; forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); if (EFLAG) ecoul = prefactor * erfc; - if (sbindex) { - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } + + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + #ifdef INTEL_ALLOW_TABLE } else { float rsq_lookup = rsq; diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp index a158ead327..861d3108f2 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -297,7 +297,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const flt_t EWALD_F = 1.12837917; const flt_t INV_EWALD_P = 1.0 / 0.3275911; - const flt_t r = sqrt(rsq); + const flt_t r = (flt_t)1.0 / sqrt(r2inv); const flt_t grij = g_ewald * r; const flt_t expm2 = exp(-grij * grij); const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); @@ -305,12 +305,12 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const flt_t prefactor = qqrd2e * qtmp * q[j] / r; forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); if (EFLAG) ecoul = prefactor * erfc; - if (sbindex) { - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } + + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + #ifdef INTEL_ALLOW_TABLE } else { float rsq_lookup = rsq;