/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Markus Höhnerbach (RWTH)
------------------------------------------------------------------------- */

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "pair_tersoff_intel.h"
#include "atom.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "force.h"
#include "comm.h"
#include "memory.h"
#include "error.h"

// Currently the Intel compiler is required for this pair style.
// For convenience, the base class routines are called when another compiler is used.
#ifndef __INTEL_COMPILER

using namespace LAMMPS_NS;

PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
{
}

void PairTersoffIntel::compute(int eflag, int vflag)
{
  PairTersoff::compute(eflag, vflag);
}

void PairTersoffIntel::init_style()
{
  if (comm->me == 0) {
    error->warning(FLERR, "Tersoff/intel currently requires intel compiler. "
                   "Using MANYBODY version.");
  }
  PairTersoff::init_style();
}

#else

#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif

#include "intel_intrinsics.h"
#include "math_const.h"

#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif

#include "group.h"
#include "kspace.h"
#include "modify.h"
#include "suffix.h"

using namespace LAMMPS_NS;
using namespace MathConst;

/* ---------------------------------------------------------------------- */

PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
{
  suffix_flag |= Suffix::INTEL;
  respa_enable = 0;
}

/* ---------------------------------------------------------------------- */

// Dispatch the requested precision
void PairTersoffIntel::compute(int eflag, int vflag)
{
  if (fix->precision()==FixIntel::PREC_MODE_MIXED) {
    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                          force_const_single);
  } else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) {
    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                           force_const_double);
  } else {
    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                         force_const_single);
  }
  fix->balance_stamp();
  vflag_fdotr = 0;
}
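// In the calls above, the first template argument (flt_t) is the type used
// for positions and intermediate arithmetic and the second (acc_t) is the
// type used to accumulate forces, energies and virials; mixed precision
// therefore computes in single precision but accumulates in double.
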
// Dispatch on the extent of the computation:
// do we need to calculate energy and/or virial terms?
template <class flt_t, class acc_t>
void PairTersoffIntel::compute(int eflag, int vflag,
                               IntelBuffers<flt_t,acc_t> *buffers,
                               const ForceConst<flt_t> &fc)
{
  if (eflag || vflag) {
    ev_setup(eflag,vflag);
  } else evflag = vflag_fdotr = 0;

  const int inum = list->inum;
  const int nthreads = comm->nthreads;
  const int host_start = fix->host_start_pair();
  const int offload_end = fix->offload_end_pair();
  const int ago = neighbor->ago;

  if (ago != 0 && fix->separate_buffers() == 0) {
    fix->start_watch(TIME_PACK);
    #if defined(_OPENMP)
    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
    #endif
    {
      int ifrom, ito, tid;
      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                nthreads, sizeof(ATOM_T));
      buffers->thr_pack(ifrom,ito,ago);
    }
    fix->stop_watch(TIME_PACK);
  }

  if (evflag || vflag_fdotr) {
    int ovflag = 0;
    if (vflag_fdotr) ovflag = 2;
    else if (vflag) ovflag = 1;
    if (eflag) {
      eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
      eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
    } else {
      eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
      eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
    }
  } else {
    eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
    eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
  }
}

#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif

// The complete Tersoff computation kernel is encapsulated here.
// Everything is static; the class just serves as a unit of organization.
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> {
  // instantiate the vector library and import the types
  typedef typename lmp_intel::vector_routines<flt_t, acc_t, mic> v;
  typedef typename v::fvec fvec;
  typedef typename v::ivec ivec;
  typedef typename v::bvec bvec;
  typedef typename v::avec avec;
  typedef typename v::iarr iarr;
  typedef typename v::farr farr;
  typedef typename v::aarr aarr;
  typedef typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t c_inner_t;
  typedef typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t c_outer_t;

  // For descriptions of these methods, please have a look at the original
  // code in the base Tersoff style; here they are inlined and vectorized.
  // attractive() also provides an option to compute zeta as well.
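  // In Tersoff terms (see the scalar style for the full definitions):
  //   zeta_vector()       one k-contribution to zeta_ij, i.e.
  //                       fC(r_ik) * g(theta_ijk) * exp[(lam3*(r_ij-r_ik))^m]
  //                       with m = powermint,
  //   force_zeta_vector() pair force/energy from zeta_ij via the bond order
  //                       b_ij, plus the prefactor reused by the k loops,
  //   attractive_vector() the three-body force contributions on i, j and k.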
  static fvec zeta_vector(
      const c_inner_t * param,
      ivec xjw, bvec mask,
      fvec vrij, fvec rsq2,
      fvec vdijx, fvec vdijy, fvec vdijz,
      fvec dikx, fvec diky, fvec dikz
  );
  static void force_zeta_vector(
      const c_outer_t * param,
      ivec xjw,
      bvec mask,
      fvec vrijsq, fvec vzeta_ij,
      fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
      bvec vmask_repulsive
  );
  template<bool ZETA>
  static void attractive_vector(
      const c_inner_t * param,
      ivec xjw,
      bvec mask,
      fvec vprefactor,
      fvec vrijsq, fvec rsq2,
      fvec vdijx, fvec vdijy, fvec vdijz,
      fvec dikx, fvec diky, fvec dikz,
      fvec *fix, fvec *fiy, fvec *fiz,
      fvec *fjx, fvec *fjy, fvec *fjz,
      fvec *fkx, fvec *fky, fvec *fkz,
      fvec *zeta
  );

  // perform the actual computation
  template<bool EVFLAG, bool EFLAG>
  static void kernel(
      int iito, int iifrom, int eatom, int vflag,
      const int * _noalias const numneigh,
      const int * _noalias const numneighhalf,
      const int * _noalias const cnumneigh,
      const int * _noalias const firstneigh, int ntypes,
      typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
      const c_inner_t * _noalias const c_inner,
      const c_outer_t * _noalias const c_outer,
      typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
      acc_t *evdwl, acc_t *ov0, acc_t *ov1, acc_t *ov2, acc_t *ov3, acc_t *ov4, acc_t *ov5
  );

  // perform one step of the calculation, passing in i-j pairs of atoms (is, js)
  template<int EVFLAG, int EFLAG>
  static void kernel_step(
      int eatom, int vflag,
      const int * _noalias const numneigh,
      const int * _noalias const cnumneigh,
      const int * _noalias const firstneigh,
      int ntypes,
      typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
      const c_inner_t * _noalias const c_inner,
      const c_outer_t * _noalias const c_outer,
      typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
      avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5,
      int compress_idx, iarr is, iarr js, bvec vmask_repulsive
  );

  // perform one step of the calculation; unlike the previous method,
  // i is fixed here and only a number of js are passed in
  template<int EVFLAG, int EFLAG>
  static void kernel_step_const_i(
      int eatom, int vflag,
      const int * _noalias const numneigh, const int * _noalias const cnumneigh,
      const int * _noalias const firstneigh, int ntypes,
      typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
      const c_inner_t * _noalias const c_inner,
      const c_outer_t * _noalias const c_outer,
      typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
      avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5,
      int compress_idx, int i, iarr js, bvec vmask_repulsive
  );
};

#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif

/* ---------------------------------------------------------------------- */

// Dispatch to the correct kernel instantiation and perform all the work
// necessary for offloading. In this routine we enter the Phi.
// This method is nearly identical to what happens in the other /intel styles.
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairTersoffIntel::eval(const int offload, const int vflag,
                            IntelBuffers<flt_t,acc_t> *buffers,
                            const ForceConst<flt_t> &fc,
                            const int astart, const int aend)
{
  const int inum = aend - astart;
  if (inum == 0) return;
  int nlocal, nall, minlocal;
  fix->get_buffern(offload, nlocal, nall, minlocal);

  const int ago = neighbor->ago;
  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);

  ATOM_T * _noalias const x = buffers->get_x(offload);
  tagint * _noalias tag = this->atom->tag;
  flt_t * _noalias const q = buffers->get_q(offload);

  const int * _noalias const numneigh = list->numneigh;
  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
  const int * _noalias const numneighhalf = buffers->get_atombin();
  const int * _noalias const firstneigh = buffers->firstneigh(list);

  typedef typename ForceConst<flt_t>::c_inner_t c_inner_t;
  typedef typename ForceConst<flt_t>::c_outer_t c_outer_t;
  typedef typename ForceConst<flt_t>::c_cutoff_t c_cutoff_t;
  const c_outer_t * _noalias const c_outer = fc.c_outer[0];
  const c_inner_t * _noalias const c_inner = fc.c_inner[0][0];
  const c_cutoff_t * _noalias const c_inner_cutoff = fc.c_cutoff_inner[0][0];

  const int ntypes = atom->ntypes + 1;
  const int eatom = this->eflag_atom;

  // Determine how much data to transfer
  int x_size, q_size, f_stride, ev_size, separate_flag;
  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
                       buffers, offload, fix, separate_flag,
                       x_size, q_size, ev_size, f_stride);

  int tc;
  FORCE_T * _noalias f_start;
  acc_t * _noalias ev_global;
  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);

  const int nthreads = tc;

#ifdef _LMP_INTEL_OFFLOAD
  int *overflow = fix->get_off_overflow_flag();
  double *timer_compute = fix->off_watch_pair();
  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
  #pragma offload target(mic:_cop) if(offload) \
    in(c_inner, c_outer :length(0) alloc_if(0) free_if(0)) \
    in(c_inner_cutoff :length(0) alloc_if(0) free_if(0)) \
    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
    in(numneigh:length(0) alloc_if(0) free_if(0)) \
    in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
    in(x:length(x_size) alloc_if(0) free_if(0)) \
    in(overflow:length(0) alloc_if(0) free_if(0)) \
    in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
    in(f_stride,nlocal,minlocal,separate_flag,offload) \
    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
    signal(f_start)
#endif
  {
#ifdef _LMP_INTEL_OFFLOAD
#ifdef __MIC__
    *timer_compute = MIC_Wtime();
#endif
#endif

    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                              f_stride, x, 0);

    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
    if (EVFLAG) {
      oevdwl = oecoul = (acc_t)0;
      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
    }

    // loop over neighbors of my atoms
    #if defined(_OPENMP)
    #pragma omp parallel default(none) \
      shared(f_start,f_stride,nlocal,nall,minlocal) \
      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
    #endif
    {
      int iifrom, iito, tid;
      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
      iifrom += astart;
      iito += astart;

      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));

      {
        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
        sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.;
        #define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \
          firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5
        // Pick the variable-i algorithm only under specific conditions;
        // use the scalar algorithm for very short vectors.
        int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
        bool pack_i = VL >= 8 &&
          lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
        bool use_scalar = VL < 4;
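        // pack_i == true  : i-j pairs from different i atoms are packed into
        //                   one vector and processed by kernel_step().
        // pack_i == false : each vector step handles the neighbors of a single
        //                   i (kernel_step_const_i()), so no gathers are
        //                   needed for the i atom data.
        // use_scalar      : fall back to the sequential code path where the
        //                   vectors are too short for this scheme to pay off.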
        if (use_scalar) {
          IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EVFLAG,EFLAG>(ARGS);
        } else if (pack_i) {
          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EVFLAG,EFLAG>(ARGS);
        } else {
          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EVFLAG,EFLAG>(ARGS);
        }
        if (EVFLAG) {
          if (EFLAG) oevdwl += sevdwl;
          if (vflag == 1) {
            ov0 += sv0;
            ov1 += sv1;
            ov2 += sv2;
            ov3 += sv3;
            ov4 += sv4;
            ov5 += sv5;
          }
        }
      }

#ifndef _LMP_INTEL_OFFLOAD
      if (vflag == 2)
#endif
      {
        #if defined(_OPENMP)
        #pragma omp barrier
        #endif
        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
                               nlocal, minlocal, nthreads, f_start, f_stride,
                               x, offload);
      }
    } // end of omp parallel region
    if (EVFLAG) {
      if (EFLAG) {
        ev_global[0] = oevdwl;
        ev_global[1] = 0.0;
      }
      if (vflag) {
        ev_global[2] = ov0;
        ev_global[3] = ov1;
        ev_global[4] = ov2;
        ev_global[5] = ov3;
        ev_global[6] = ov4;
        ev_global[7] = ov5;
      }
    }

#ifdef _LMP_INTEL_OFFLOAD
#ifdef __MIC__
    *timer_compute = MIC_Wtime() - *timer_compute;
#endif
#endif
  } // end of offload region

  if (offload)
    fix->stop_watch(TIME_OFFLOAD_LATENCY);
  else
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
}

/* ----------------------------------------------------------------------
   init specific to this pair style
------------------------------------------------------------------------- */

// As in any other /intel pair style
void PairTersoffIntel::init_style()
{
  if (atom->tag_enable == 0)
    error->all(FLERR,"Pair style Tersoff requires atom IDs");
  if (force->newton_pair == 0)
    error->all(FLERR,"Pair style Tersoff requires newton pair on");

  // need a full neighbor list

  int irequest = neighbor->request(this);
  neighbor->requests[irequest]->half = 0;
  neighbor->requests[irequest]->full = 1;
  neighbor->requests[irequest]->intel = 1;

  int ifix = modify->find_fix("package_intel");
  if (ifix < 0)
    error->all(FLERR,
               "The 'package intel' command is required for /intel styles");
  fix = static_cast<FixIntel *>(modify->fix[ifix]);

  fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
  _cop = fix->coprocessor_number();
#endif
  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
    pack_force_const(force_const_single, fix->get_mixed_buffers());
    fix->get_mixed_buffers()->need_tag(1);
  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
    fix->get_double_buffers()->need_tag(1);
    pack_force_const(force_const_double, fix->get_double_buffers());
  } else {
    pack_force_const(force_const_single, fix->get_single_buffers());
    fix->get_single_buffers()->need_tag(1);
  }
}

// As in any other /intel pair style
template <class flt_t, class acc_t>
void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
                                        IntelBuffers<flt_t,acc_t> *buffers)
{
  int tp1 = atom->ntypes + 1;

  fc.set_ntypes(tp1, memory, _cop);
  buffers->set_ntypes(tp1);
  flt_t **cutneighsq = buffers->get_cutneighsq();

  // Repeat the cutsq calculation because this is done after the call to init_style
  double cut, cutneigh;
  for (int i = 1; i <= atom->ntypes; i++) {
    for (int j = i; j <= atom->ntypes; j++) {
      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
        cut = init_one(i, j);
        cutneigh = cut + neighbor->skin;
        cutsq[i][j] = cutsq[j][i] = cut*cut;
        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
      }
    }
  }

  for (int i = 1; i < tp1; i++) {
    for (int j = 1; j < tp1; j++) {
      fc.c_inner_loop[i][j][0].d2 = 1.0;
      fc.c_inner_loop[i][0][j].d2 = 1.0;
      fc.c_inner_loop[0][i][j].d2 = 1.0;
      for (int k = 1; k < tp1; k++) {
        Param * param = &params[elem2param[map[i]][map[j]][map[k]]];
        fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq);
        fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
        fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr);
        fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd);
        fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
        fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
        fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h);
        fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma);
        fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);

        fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq);
        fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
        fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr);
        fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd);
        fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
        fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
        fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h);
        fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma);
        fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);
      }
      Param * param = &params[elem2param[map[i]][map[j]][map[j]]];
      fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
      fc.c_first_loop[i][j].bigr = static_cast<flt_t>(param->bigr);
      fc.c_first_loop[i][j].bigd = static_cast<flt_t>(param->bigd);
      fc.c_first_loop[i][j].lam1 = static_cast<flt_t>(param->lam1);
      fc.c_first_loop[i][j].biga = static_cast<flt_t>(param->biga);
      fc.c_second_loop[i][j].lam2 = static_cast<flt_t>(param->lam2);
      fc.c_second_loop[i][j].beta = static_cast<flt_t>(param->beta);
      fc.c_second_loop[i][j].bigb = static_cast<flt_t>(param->bigb);
      fc.c_second_loop[i][j].powern = static_cast<flt_t>(param->powern);
      fc.c_second_loop[i][j].c1 = static_cast<flt_t>(param->c1);
      fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2);
      fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3);
      fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4);

      fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
      fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr);
      fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd);
      fc.c_outer[i][j].lam1 = static_cast<flt_t>(param->lam1);
      fc.c_outer[i][j].biga = static_cast<flt_t>(param->biga);
      fc.c_outer[i][j].lam2 = static_cast<flt_t>(param->lam2);
      fc.c_outer[i][j].beta = static_cast<flt_t>(param->beta);
      fc.c_outer[i][j].bigb = static_cast<flt_t>(param->bigb);
      fc.c_outer[i][j].powern = static_cast<flt_t>(param->powern);
      fc.c_outer[i][j].c1 = static_cast<flt_t>(param->c1);
      fc.c_outer[i][j].c2 = static_cast<flt_t>(param->c2);
      fc.c_outer[i][j].c3 = static_cast<flt_t>(param->c3);
      fc.c_outer[i][j].c4 = static_cast<flt_t>(param->c4);
    }
    fc.c_outer[i][0].cutsq = 0.;
  }

#ifdef _LMP_INTEL_OFFLOAD
  if (_cop < 0) return;
  typename ForceConst<flt_t>::c_first_loop_t * c_first_loop = fc.c_first_loop[0];
  typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_outer = fc.c_cutoff_outer[0];
  typename ForceConst<flt_t>::c_outer_t * c_outer = fc.c_outer[0];
  typename ForceConst<flt_t>::c_second_loop_t * c_second_loop = fc.c_second_loop[0];
  typename ForceConst<flt_t>::c_inner_loop_t * c_inner_loop = fc.c_inner_loop[0][0];
  typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_inner = fc.c_cutoff_inner[0][0];
  typename ForceConst<flt_t>::c_inner_t * c_inner = fc.c_inner[0][0];
  flt_t * ocutneighsq = cutneighsq[0];
  size_t VL = 512 / 8 / sizeof(flt_t);
  int ntypes = tp1;
  int ntypes_pad = ntypes + VL - ntypes % VL;
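  // Example: with flt_t = float, VL = 512/8/4 = 16 lanes, so e.g. ntypes = 4
  // (3 species + 1) is padded to ntypes_pad = 4 + 16 - 4 % 16 = 16; the padded
  // innermost dimension keeps each (i,j) row of c_cutoff_inner a whole number
  // of vectors.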
  int tp1sq = tp1 * tp1;
  int tp1cb = tp1 * tp1 * tp1;
  int tp1cb_pad = tp1 * tp1 * ntypes_pad;
  #pragma offload_transfer target(mic:_cop) \
    in(c_first_loop, c_second_loop, c_cutoff_outer, c_outer : length(tp1sq) alloc_if(0) free_if(0)) \
    in(c_inner : length(tp1cb) alloc_if(0) free_if(0)) \
    in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) \
    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
#endif
}

/* ---------------------------------------------------------------------- */

// As in any other /intel pair style
template <class flt_t>
void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                     Memory *memory,
                                                     const int cop) {
  if ( (ntypes != _ntypes) ) {
    if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD
      c_first_loop_t * oc_first_loop = c_first_loop[0];
      c_second_loop_t * oc_second_loop = c_second_loop[0];
      c_inner_loop_t * oc_inner_loop = c_inner_loop[0][0];
      c_cutoff_t * oc_cutoff_inner = c_cutoff_inner[0][0];
      c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
      c_inner_t * oc_inner = c_inner[0][0];
      c_outer_t * oc_outer = c_outer[0];
      if (c_first_loop != NULL && c_second_loop != NULL &&
          c_inner_loop != NULL && _cop >= 0) {

        #pragma offload_transfer target(mic:cop) \
          nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
          nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
          nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0))
      }
#endif
      _memory->destroy(c_first_loop);
      _memory->destroy(c_second_loop);
      _memory->destroy(c_inner_loop);
      _memory->destroy(c_cutoff_outer);
      _memory->destroy(c_cutoff_inner);
      _memory->destroy(c_inner);
      _memory->destroy(c_outer);
    }
    if (ntypes > 0) {
      _cop = cop;
      size_t VL = 512 / 8 / sizeof(flt_t);
      int ntypes_pad = ntypes + VL - ntypes % VL;
      memory->create(c_first_loop,ntypes,ntypes,"fc.c_first_loop");
      memory->create(c_second_loop,ntypes,ntypes,"fc.c_second_loop");
      memory->create(c_cutoff_outer,ntypes,ntypes,"fc.c_cutoff_outer");
      memory->create(c_inner_loop,ntypes,ntypes,ntypes,"fc.c_inner_loop");
      memory->create(c_cutoff_inner,ntypes,ntypes,ntypes_pad,"fc.c_cutoff_inner");
      memory->create(c_inner,ntypes,ntypes,ntypes,"fc.c_inner");
      memory->create(c_outer,ntypes,ntypes,"fc.c_outer");
#ifdef _LMP_INTEL_OFFLOAD
      c_first_loop_t * oc_first_loop = c_first_loop[0];
      c_second_loop_t * oc_second_loop = c_second_loop[0];
      c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
      c_inner_loop_t * oc_inner_loop = c_inner_loop[0][0];
      c_cutoff_t * oc_cutoff_inner = c_cutoff_inner[0][0];
      c_inner_t * oc_inner = c_inner[0][0];
      c_outer_t * oc_outer = c_outer[0];
      int tp1sq = ntypes * ntypes;
      int tp1cb = ntypes * ntypes * ntypes;
      int tp1cb_pad = ntypes * ntypes * ntypes_pad;
      if (oc_first_loop != NULL && oc_second_loop != NULL &&
          oc_inner_loop != NULL && cop >= 0) {
        #pragma offload_transfer target(mic:cop) \
          nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \
          nocopy(oc_second_loop: length(tp1sq) alloc_if(1) free_if(0)) \
          nocopy(oc_cutoff_outer: length(tp1sq) alloc_if(1) free_if(0)) \
          nocopy(oc_outer: length(tp1sq) alloc_if(1) free_if(0)) \
          nocopy(oc_inner_loop: length(tp1cb) alloc_if(1) free_if(0)) \
          nocopy(oc_inner: length(tp1cb) alloc_if(1) free_if(0)) \
          nocopy(oc_cutoff_inner: length(tp1cb_pad) alloc_if(1) free_if(0))
      }
#endif
    }
  }
  _ntypes=ntypes;
  _memory=memory;
}

#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif

// The number of k iterations per (i,j) pair whose force contributions we cache
static const int N_CACHE = 8;
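
// Why cache: the three-body force terms for each k must be scaled by a
// prefactor that depends on the full zeta sum over all k, which is only known
// after the zeta loop finishes. The first N_CACHE contributions are therefore
// stored and replayed (scaled) afterwards; any further k iterations are
// recomputed in a second pass.
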
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EVFLAG, int EFLAG>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
    int eatom, int vflag,
    const int * _noalias const numneigh, const int * _noalias const cnumneigh,
    const int * _noalias const firstneigh, int ntypes,
    typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
    const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
    const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
    typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
    avec *vsevdwl,
    avec *vsv0,
    avec *vsv1,
    avec *vsv2,
    avec *vsv3,
    avec *vsv4,
    avec *vsv5,
    int compress_idx,
    iarr is,
    iarr js,
    bvec vmask_repulsive
) {
  ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
  ivec v_i1(1);
  fvec v_2(0.);
  fvec v_0_5(0.5);
  ivec v_i0(0);
  ivec v_i_ntypes(ntypes);
  ivec v_i_NEIGHMASK(NEIGHMASK);

  farr fx, fy, fz, fw;
  int cache_idx = 0;
  fvec vfkx_cache[N_CACHE];
  fvec vfky_cache[N_CACHE];
  fvec vfkz_cache[N_CACHE];
  ivec vks_cache[N_CACHE];
  bvec vmask_cache[N_CACHE];
  ivec vkks_final_cache;
  bvec vmask_final_cache;
  iarr ts;
  // compute all the quantities we already know from i and j
  // TODO: We could extract this from the driver routine
  ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
  ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
  bvec vmask = v::mask_enable_lower(compress_idx);
  fvec vx_i = v::zero(), vy_i = v::zero(), vz_i = v::zero();
  ivec vw_i = v_i0;
  v::gather_x(vis, vmask, x, &vx_i, &vy_i, &vz_i, &vw_i);
  fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
  ivec vw_j = v_i0;
  v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);
  fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
  fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
  fvec vrij = sqrt(vrijsq);
  ivec vis_orig = v::int_load_vl(is);
  ivec vcnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, cnumneigh);
  ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
  ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i));

  fvec vzeta = v::zero();
  fvec vfxtmp = v::zero(), vfytmp = v::zero(), vfztmp = v::zero();
  fvec vfjxtmp = v::zero(), vfjytmp = v::zero(), vfjztmp = v::zero();
  // This piece of code facilitates the traversal of the k loop assuming
  // nothing about i. As such, it uses masking to avoid superfluous loads
  // and fast-forwards each lane until work is available.
  // This is useful because we can not make assumptions as to where in the
  // neighbor list the atoms within the cutoff might be.
  // We also implement the caching in here, i.e. collect force contributions
  // due to zeta.
  // This means that you will see four loops:
  // 1. the loop that does the zeta calculation and caches the force contributions
  // 2. the loop that processes the remaining zeta calculations
  // 3. the loop that updates the force based on the cached force contributions
  // 4. the loop that computes force contributions for the remainder
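  // In scalar terms, each lane l does roughly
  //   while (kk < numneigh_i) {
  //     k = firstneigh[cnumneigh_i + kk] & NEIGHMASK;
  //     if (rsq_ik < cutsq && k != j) break;   // found work
  //     kk++;                                  // fast-forward
  //   }
  // and the expensive zeta/force math is only executed as a full vector step
  // once every still-active lane has found a valid k (or has run out of
  // neighbors and dropped out of the active mask).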
  {
    ivec vkks = v_i0;
    bvec vactive_mask = vmask;
    bvec veff_old_mask(0);
    ivec vks, vw_k;
    fvec vx_k, vy_k, vz_k, vcutsq;
    while (! v::mask_testz(vactive_mask) && cache_idx < N_CACHE) {
      bvec vnew_mask = vactive_mask & ~ veff_old_mask;
      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
          v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
      v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
      fvec vdx_ik = (vx_k - vx_i);
      fvec vdy_ik = (vy_k - vy_i);
      fvec vdz_ik = (vz_k - vz_i);
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
      ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
      vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, vks);
      bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
      if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
        fvec vzeta_contrib;
        fvec vfix, vfiy, vfiz;
        fvec vfjx, vfjy, vfjz;
        fvec vfkx, vfky, vfkz;

        attractive_vector<true>(c_inner,vc_idx,veff_mask,fvec(1.),
            vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
            &vfix,&vfiy,&vfiz,
            &vfjx,&vfjy,&vfjz,
            &vfkx,&vfky,&vfkz,
            &vzeta_contrib);
        vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
        vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
        vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
        vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
        vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
        vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);

        vfkx_cache[cache_idx] = vfkx;
        vfky_cache[cache_idx] = vfky;
        vfkz_cache[cache_idx] = vfkz;
        vks_cache[cache_idx] = vks;
        vmask_cache[cache_idx] = veff_mask;
        cache_idx += 1;

        vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
        vkks = vkks + v_i1;
        veff_old_mask = bvec(0);
      } else {
        vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
        veff_old_mask = veff_mask;
      }
      vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
    }
    vkks_final_cache = vkks;
    vmask_final_cache = vactive_mask;
    while (! v::mask_testz(vactive_mask)) {
      bvec vnew_mask = vactive_mask & ~ veff_old_mask;
      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
          v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
      v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
      fvec vdx_ik = (vx_k - vx_i);
      fvec vdy_ik = (vy_k - vy_i);
      fvec vdz_ik = (vz_k - vz_i);
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
      ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
      vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, vks);
      bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
      if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
        fvec vzeta_contrib;
        vzeta_contrib = zeta_vector(c_inner,vc_idx,veff_mask,vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
        vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
        vkks = vkks + v_i1;
        veff_old_mask = bvec(0);
      } else {
        vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
        veff_old_mask = veff_mask;
      }
      vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
    }
  }
  fvec vfpair, vevdwl, vprefactor, vfwtmp, vfjwtmp;
  force_zeta_vector(c_outer, vc_idx_ij, vmask, vrij, vzeta, &vfpair, &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
  vfxtmp = vfxtmp * vprefactor + vdx_ij * vfpair;
  vfytmp = vfytmp * vprefactor + vdy_ij * vfpair;
  vfztmp = vfztmp * vprefactor + vdz_ij * vfpair;
  vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair;
  vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
  vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;

  if (EVFLAG) {
    if (EFLAG) {
      *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
      if (eatom) {
        v::store(fw, (v_0_5 * vevdwl));
      }
    }
    if (vflag == 1) {
      *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
      *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
      *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
      *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
      *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
      *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
    }
  }
  {
    while (cache_idx-- > 0) {
      fvec vfkx = vprefactor * vfkx_cache[cache_idx];
      fvec vfky = vprefactor * vfky_cache[cache_idx];
      fvec vfkz = vprefactor * vfkz_cache[cache_idx];
      ivec vks = vks_cache[cache_idx];
      bvec veff_mask = vmask_cache[cache_idx];
      v::store(fx, vfkx);
      v::store(fy, vfky);
      v::store(fz, vfkz);
      v::int_store(ts, vks);
      for (int t = 0; t < v::VL; t++) {
        if (v::mask_test_at(veff_mask, t)) {
          int t_ = ts[t] / (4 * sizeof(typename v::fscal));
          f[t_].x += fx[t];
          f[t_].y += fy[t];
          f[t_].z += fz[t];
        }
      }
    }
    ivec vkks = vkks_final_cache;
    bvec vactive_mask = vmask_final_cache;
    bvec veff_old_mask(0);
    ivec vks, vw_k;
    fvec vx_k, vy_k, vz_k, vcutsq;
    while (! v::mask_testz(vactive_mask)) {
      bvec vnew_mask = vactive_mask & ~ veff_old_mask;
      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
          v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
      v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
      fvec vdx_ik = vx_k - vx_i;
      fvec vdy_ik = vy_k - vy_i;
      fvec vdz_ik = vz_k - vz_i;
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
      ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
      vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, vks);
      bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
      if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
        fvec vfix, vfiy, vfiz;
        fvec vfjx, vfjy, vfjz;
        fvec vfkx, vfky, vfkz;

        attractive_vector<false>(c_inner,vc_idx,veff_mask,vprefactor,
            vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
            &vfix,&vfiy,&vfiz,
            &vfjx,&vfjy,&vfjz,
            &vfkx,&vfky,&vfkz,
            0);
        vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
        vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
        vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
        vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
        vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
        vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
        v::store(fx, vfkx);
        v::store(fy, vfky);
        v::store(fz, vfkz);
        v::int_store(ts, vks);
        for (int t = 0; t < v::VL; t++) {
          if (v::mask_test_at(veff_mask, t)) {
            int t_ = ts[t] / (4 * sizeof(typename v::fscal));
            f[t_].x += fx[t];
            f[t_].y += fy[t];
            f[t_].z += fz[t];
          }
        }
        vkks = vkks + v_i1;
        veff_old_mask = bvec(0);
      } else {
        vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
        veff_old_mask = veff_mask;
      }
      vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
    } // while (vactive_mask != 0)
  } // section k
  // We can not make any assumptions regarding conflicts, so the scatter of the
  // per-lane i and j forces below is sequentialized.
  // TODO: Once AVX-512 is around, check out VPCONFLICT.
  v::store(fx, vfjxtmp);
  v::store(fy, vfjytmp);
  v::store(fz, vfjztmp);
  for (int t = 0; t < compress_idx; t++) {
    int t_ = js[t];
    f[t_].x += fx[t];
    f[t_].y += fy[t];
    f[t_].z += fz[t];
    if (EVFLAG && EFLAG && eatom) {
      f[t_].w += fw[t];
    }
  }
  v::store(fx, vfxtmp);
  v::store(fy, vfytmp);
  v::store(fz, vfztmp);
  for (int t = 0; t < compress_idx; t++) {
    int t_ = is[t];
    f[t_].x += fx[t];
    f[t_].y += fy[t];
    f[t_].z += fz[t];
    if (EVFLAG && EFLAG && eatom) {
      f[t_].w += fw[t];
    }
  }
}

// Specialized kernel step for fixed i: we don't have to use the convoluted
// iteration scheme above, as the loop variables are uniform across lanes.
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EVFLAG, int EFLAG>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
    int eatom, int vflag,
    const int * _noalias const numneigh, const int * _noalias const cnumneigh,
    const int * _noalias const firstneigh, int ntypes,
    typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
    const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
    const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
    typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
    avec *vsevdwl,
    avec *vsv0,
    avec *vsv1,
    avec *vsv2,
    avec *vsv3,
    avec *vsv4,
    avec *vsv5,
    int compress_idx,
    int i,
    iarr js,
    bvec vmask_repulsive
) {
  typedef typename v::fvec fvec;
  typedef typename v::ivec ivec;
  typedef typename v::bvec bvec;
  typedef typename v::farr farr;
  typedef typename v::iarr iarr;
  typedef typename v::avec avec;
  typedef typename v::aarr aarr;

  ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
  ivec v_i1(1), v_i0(0), v_i_ntypes(ntypes), v_i_NEIGHMASK(NEIGHMASK);
  fvec v_0_5(0.5);

  int cache_idx = 0;
  fvec vfkx_cache[N_CACHE];
  fvec vfky_cache[N_CACHE];
  fvec vfkz_cache[N_CACHE];
  int k_cache[N_CACHE];
  bvec vmask_cache[N_CACHE];
  int kk_final_cache;

  aarr fx, fy, fz, fw;
  iarr ts;

  bvec vmask = v::mask_enable_lower(compress_idx);
  fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z);
  int w_i = x[i].w;

  ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
  fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
  ivec vw_j = v_i0;
  v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);

  fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
  fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
  fvec vrij = sqrt(vrijsq);

  int cnumneigh_i = cnumneigh[i];
  int numneigh_i = numneigh[i];
  ivec vc_idx_j = v::int_mullo(v_i4floats, vw_j);
  ivec vc_idx_j_ntypes = v::int_mullo(v_i_ntypes, vc_idx_j);

  avec vzeta = v::acc_zero();
  avec vfxtmp = v::acc_zero(), vfytmp = v::acc_zero(), vfztmp = v::acc_zero();
  avec vfjxtmp = v::acc_zero(), vfjytmp = v::acc_zero(), vfjztmp = v::acc_zero();

  // Same structure as kernel_step, just simpler as the loops all iterate over
  // the same k
  int kk = 0;
  for (; kk < numneigh_i && cache_idx < N_CACHE; kk++) {
    int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
    fvec vx_k(x[k].x);
    fvec vy_k(x[k].y);
    fvec vz_k(x[k].z);
    int w_k = x[k].w;
    fvec vdx_ik = vx_k - vx_i;
    fvec vdy_ik = vy_k - vy_i;
    fvec vdz_ik = vz_k - vz_i;
    fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
    fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
    bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
    bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
    bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
    if (! v::mask_testz(veff_mask)) {
      fvec vzeta_contrib;
      fvec vfix, vfiy, vfiz;
      fvec vfjx, vfjy, vfjz;
      fvec vfkx, vfky, vfkz;

      attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.),
          vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
          &vfix,&vfiy,&vfiz,
          &vfjx,&vfjy,&vfjz,
          &vfkx,&vfky,&vfkz,
          &vzeta_contrib);
      vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
      vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
      vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
      vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
      vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
      vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);

      vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero());
      vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero());
      vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero());
      vmask_cache[cache_idx] = veff_mask;
      k_cache[cache_idx] = k;
      cache_idx += 1;

      vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
    }
  }
  kk_final_cache = kk;
  for (; kk < numneigh_i; kk++) {
    int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
    fvec vx_k(x[k].x);
    fvec vy_k(x[k].y);
    fvec vz_k(x[k].z);
    int w_k = x[k].w;
    fvec vdx_ik = vx_k - vx_i;
    fvec vdy_ik = vy_k - vy_i;
    fvec vdz_ik = vz_k - vz_i;
    fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
    fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
    bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
    bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
    bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
    if (! v::mask_testz(veff_mask)) {
      fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq,
          vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
      vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
    }
  }
  fvec vfpair, vevdwl, vprefactor, vfwtmp;
  force_zeta_vector(&c_outer[ntypes * w_i], vc_idx_j, vmask, vrij, vzeta, &vfpair, &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
  avec vaprefactor(vprefactor);
  vfxtmp = vfxtmp * vaprefactor + avec(vdx_ij * vfpair);
  vfytmp = vfytmp * vaprefactor + avec(vdy_ij * vfpair);
  vfztmp = vfztmp * vaprefactor + avec(vdz_ij * vfpair);
  vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair);
  vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
  vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
  if (EVFLAG) {
    if (EFLAG) {
      *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
      if (eatom) {
        vfwtmp = v_0_5 * vevdwl;
        v::store(fw, vfwtmp);
      }
    }
    if (vflag == 1) {
      *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
      *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
      *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
      *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
      *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
      *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
    }
  }
  while (cache_idx-- > 0) {
    fvec vfkx = vprefactor * vfkx_cache[cache_idx];
    fvec vfky = vprefactor * vfky_cache[cache_idx];
    fvec vfkz = vprefactor * vfkz_cache[cache_idx];
    int k = k_cache[cache_idx];
    bvec veff_mask = vmask_cache[cache_idx];
    f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
    f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
    f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
  }
  for (int kk = kk_final_cache; kk < numneigh_i; kk++) {
    int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
    fvec vx_k(x[k].x);
    fvec vy_k(x[k].y);
    fvec vz_k(x[k].z);
    int w_k = x[k].w;
    fvec vdx_ik = vx_k - vx_i;
    fvec vdy_ik = vy_k - vy_i;
    fvec vdz_ik = vz_k - vz_i;
    fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
    fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq);
    bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
    bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
    bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
    if (! v::mask_testz(veff_mask)) {
      fvec vfix, vfiy, vfiz;
      fvec vfjx, vfjy, vfjz;
      fvec vfkx, vfky, vfkz;

      attractive_vector<false>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,vprefactor,
          vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
          &vfix,&vfiy,&vfiz,
          &vfjx,&vfjy,&vfjz,
          &vfkx,&vfky,&vfkz,
          0);
      vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
      vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
      vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
      vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
      vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
      vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
      f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
      f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
      f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
    }
  }
  // TODO: This could be a scatter
  v::acc_store(fx, vfjxtmp);
  v::acc_store(fy, vfjytmp);
  v::acc_store(fz, vfjztmp);
  for (int t = 0; t < compress_idx; t++) {
    int t_ = js[t];
    f[t_].x += fx[t];
    f[t_].y += fy[t];
    f[t_].z += fz[t];
    if (EVFLAG && EFLAG && eatom) {
      f[t_].w += fw[t];
    }
  }
  f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero()));
  f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero()));
  f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero()));
  if (EVFLAG && EFLAG && eatom) {
    f[i].w += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero()));
  }
}

template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool EVFLAG, bool EFLAG>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
    int iito, int iifrom, int eatom, int vflag,
    const int * _noalias const numneigh,
    const int * _noalias const numneighhalf,
    const int * _noalias const cnumneigh,
    const int * _noalias const firstneigh, int ntypes,
    typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
    const c_inner_t * _noalias const c_inner,
    const c_outer_t * _noalias const c_outer,
    typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
    acc_t *evdwl, acc_t *ov0, acc_t *ov1, acc_t *ov2, acc_t *ov3, acc_t *ov4, acc_t *ov5
) {
  int compress_idx = 0;
  int ii, jj;
  iarr is, js;
  avec vsevdwl = v::acc_zero();
  avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero();
  avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero();
  ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
  ivec vj, v_NEIGHMASK(NEIGHMASK);
  bvec vmask_repulsive(0);
  iarr repulsive_flag = {0};
  // If you want to get the very most out of this, please uncomment.
  // Consider getting a coffee or doing something else.
  // Also good for heating.
  //#pragma forceinline recursive
  for (ii = iifrom; ii < iito; ii++) {
    // Right now this loop is scalar, to allow for the compiler to do
    // its prefetching magic.
    int i = ii;
    int w_i = x[i].w;
    flt_t x_i = x[i].x;
    flt_t y_i = x[i].y;
    flt_t z_i = x[i].z;
    int jlist_off_i = cnumneigh[i];
    int jnum = numneigh[ii];
    for (jj = 0; jj < jnum; jj++) {
      int j = firstneigh[jlist_off_i + jj] & NEIGHMASK;
      int w_j = x[j].w;
      flt_t dx_ij = x[j].x - x_i;
      flt_t dy_ij = x[j].y - y_i;
      flt_t dz_ij = x[j].z - z_i;
      flt_t rsq = dx_ij*dx_ij + dy_ij*dy_ij + dz_ij*dz_ij;
      flt_t cutsq = c_outer[w_i * ntypes + w_j].cutsq;
      if (rsq < cutsq) {
        is[compress_idx] = ii;
        js[compress_idx] = j;
        if (jj < numneighhalf[i])
          repulsive_flag[compress_idx] = 1;
        compress_idx += 1;
      }
      if (pack_i) {
        if (compress_idx == v::VL) {
          vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
          kernel_step<EVFLAG,EFLAG>(
              eatom, vflag,
              numneigh, cnumneigh, firstneigh, ntypes,
              x, c_inner, c_outer, f,
              &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
              is, js, vmask_repulsive
          );
          compress_idx = 0;
          v::int_clear_arr(repulsive_flag);
        }
      } else {
        if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
          vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
          kernel_step_const_i<EVFLAG,EFLAG>(
              eatom, vflag,
              numneigh, cnumneigh, firstneigh, ntypes,
              x, c_inner, c_outer, f,
              &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
              i, js, vmask_repulsive
          );
          compress_idx = 0;
          v::int_clear_arr(repulsive_flag);
        }
      }
    }
  }
  if (compress_idx > 0) {
    vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
    IntelKernelTersoff::kernel_step<EVFLAG,EFLAG>(
        eatom, vflag,
        numneigh, cnumneigh, firstneigh, ntypes,
        x, c_inner, c_outer, f,
        &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
        is, js, vmask_repulsive
    );
  }
  if (EVFLAG) {
    if (EFLAG) {
      *evdwl += v::acc_reduce_add(vsevdwl);
    }
    if (vflag == 1) {
      *ov0 += v::acc_reduce_add(vsv0);
      *ov1 += v::acc_reduce_add(vsv1);
      *ov2 += v::acc_reduce_add(vsv2);
      *ov3 += v::acc_reduce_add(vsv3);
      *ov4 += v::acc_reduce_add(vsv4);
      *ov5 += v::acc_reduce_add(vsv5);
    }
  }
}

template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
typename IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
    const c_inner_t * param,
    ivec xjw, bvec mask,
    fvec vrij, fvec rsq2,
    fvec vdijx, fvec vdijy, fvec vdijz,
    fvec dikx, fvec diky, fvec dikz
) {
  fvec v_1_0(1.0);
  fvec v_0_5(0.5);
  fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = v::zero(), vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(), vpbigr = v::zero(), vpbigd = v::zero();
  // TODO: Specialize on the number of species
  v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr, &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
  fvec vrik = sqrt(rsq2);
  fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) * v::recip(vrij * vrik);
  fvec vhcth = vph - vcostheta;
  fvec vgijk_a = vhcth * vhcth;
  fvec vgijk = vpgamma * (v_1_0 + vpc2 * vgijk_a * v::recip(vpd2 * (vpd2 + vgijk_a)));
  fvec varg1 = vplam3 * (vrij - vrik);
  fvec varg3 = varg1 * varg1 * varg1;
  bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
  fvec varg = v::blend(mask_ex, varg1, varg3);
  fvec vex_delr = v::min(fvec(1.e30), exp(varg));
  bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
  fvec vfc = v_1_0;
  // It is quite important to check the mask here:
  // some simulations never or only rarely take this branch.
  if (! v::mask_testz(vmask_need_sine)) {
    vfc = v::blend(vmask_need_sine, vfc,
        v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd))));
  }
  return vgijk * vex_delr * vfc;
}

template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
    const c_outer_t * param,
    ivec xjw,
    bvec mask,
    fvec vrij, fvec vzeta_ij,
    fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
    bvec vmask_repulsive
) {
  fvec v_0_0(0.0);
  fvec v_0_5(0.5);
  fvec v_m0_5(-0.5);
  fvec v_1_0(1.0);
  fvec v_m1_0(-1.0);
  fvec v_2_0(2.0);
  fvec vpbigr = v::zero(), vpbigd = v::zero(), vplam1 = v::zero(), vpbiga = v::zero(), vplam2 = v::zero(), vpbeta = v::zero(), vpbigb = v::zero(), vppowern = v::zero();
  v::gather_8(xjw, mask, &param[0].bigr, &vpbigr, &vpbigd, &vplam1, &vpbiga, &vplam2, &vpbeta, &vpbigb, &vppowern);
  fvec vfccos;

  // This is pretty much a literal translation.
  bvec vmask_need_sine = v::cmpnle(vrij, vpbigr - vpbigd) & mask;
  fvec vfc = v_1_0;
  fvec vfc_d = v_0_0;
  if (! v::mask_testz(vmask_need_sine)) {
    fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
    vfc = v::blend(vmask_need_sine, vfc,
        v_0_5 * (v_1_0 - v::sincos(&vfccos, vtmp * (vrij - vpbigr))));
    vfc_d = v::blend(vmask_need_sine, vfc_d, v_m0_5 * vtmp * vfccos);
  }
  fvec vpminus_lam2 = - vplam2;

  fvec vpminus_bigb = -vpbigb;
  fvec vexp = exp(vpminus_lam2 * vrij);
  fvec vfa = vpminus_bigb * vexp * vfc;
  fvec vfa_d = vpminus_lam2 * vfa + vpminus_bigb * vexp * vfc_d;

  fvec vpc1 = v::zero(), vpc2 = v::zero(), vpc3 = v::zero(), vpc4 = v::zero();
  v::gather_4(xjw, mask, &param[0].c1, &vpc1, &vpc2, &vpc3, &vpc4);
  fvec vpminus_powern = - vppowern;
  fvec vbij(0.), vbij_d(0.);
  fvec vtmp = vpbeta * vzeta_ij;
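  // The masked branches below mirror the piecewise evaluation of the Tersoff
  // bond order b_ij = (1 + (beta*zeta_ij)^n)^(-1/(2n)) and its derivative in
  // the scalar style: asymptotic forms when beta*zeta_ij is very large or very
  // small (the c1..c4 thresholds), and the full expression otherwise.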
  bvec vmc1 = v::cmple(vpc1, vtmp) & mask;
  if (! v::mask_testz(vmc1)) {
    vbij = v::invsqrt(vtmp);
    vbij_d = vpbeta * v_m0_5 * vbij * v::recip(vtmp);
  }
  bvec vmc2 = v::cmple(vpc2, vtmp) & ~ vmc1 & mask;
  if (! v::mask_testz(vmc2)) {
    fvec vpowminus_powern = pow(vtmp, vpminus_powern);
    fvec vinvsqrt = v::invsqrt(vtmp);
    fvec vrcp2powern = v::recip(v_2_0 * vppowern);
    fvec va = (v_1_0 - vpowminus_powern * vrcp2powern) * vinvsqrt;
    fvec va_d = vpbeta * v_m0_5 * vinvsqrt * v::recip(vtmp) *
        (v_1_0 + v_m0_5 * vpowminus_powern * (v_1_0 + vrcp2powern));
    vbij = v::blend(vmc2, vbij, va);
    vbij_d = v::blend(vmc2, vbij_d, va_d);
  }
  bvec vmc3 = v::cmplt(vtmp, vpc4) & ~vmc2 & ~vmc1 & mask;
  if (! v::mask_testz(vmc3)) {
    vbij = v::blend(vmc3, vbij, v_1_0);
    vbij_d = v::blend(vmc3, vbij_d, v_0_0);
  }
  bvec vmc4 = v::cmple(vtmp, vpc3) & ~vmc3 & ~vmc2 & ~ vmc1 & mask;
  if (! v::mask_testz(vmc4)) {
    fvec vpowm1 = pow(vtmp, vppowern - v_1_0);
    fvec vrcp2powern = v::recip(v_2_0 * vppowern);
    fvec va = v_1_0 - vtmp * vrcp2powern * vpowm1;
    fvec va_d = v_m0_5 * vpbeta * vpowm1;
    vbij = v::blend(vmc4, vbij, va);
    vbij_d = v::blend(vmc4, vbij_d, va_d);
  }
  bvec vmc5 = mask & ~vmc1 & ~vmc2 & ~vmc3 & ~vmc4;
  if (! v::mask_testz(vmc5)) {
    fvec vtmp_n = pow(vtmp, vppowern);
    fvec vpow2 = pow(v_1_0 + vtmp_n, v_m1_0 - v::recip(v_2_0 * vppowern));
    fvec va = (v_1_0 + vtmp_n) * vpow2;
    fvec va_d = v_m0_5 * vpow2 * vtmp_n * v::recip(vzeta_ij);
    vbij = v::blend(vmc5, vbij, va);
    vbij_d = v::blend(vmc5, vbij_d, va_d);
  }
  fvec vtmp_exp = exp(-vplam1 * vrij);
  fvec vrep_fforce = vpbiga * vtmp_exp * (vfc_d - vfc * vplam1);
  fvec vfz_fforce = v_0_5 * vbij * vfa_d;

  *vfpair = v::mask_add(vfz_fforce, vmask_repulsive, vfz_fforce, vrep_fforce) * v::recip(vrij);
  *vprefactor = v_m0_5 * vfa * vbij_d;
  if (EVDWL) {
    fvec vrep_eng = vfc * vpbiga * vtmp_exp;
    fvec vfz_eng = v_0_5 * vfa * vbij;
    *vevdwl = v::mask_add(vfz_eng, vmask_repulsive, vfz_eng, vrep_eng);
  }
}

template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool ZETA>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
    const c_inner_t * param,
    ivec xjw,
    bvec mask,
    fvec vprefactor,
    fvec vrij, fvec rsq2,
    fvec vdijx, fvec vdijy, fvec vdijz,
    fvec dikx, fvec diky, fvec dikz,
    fvec *fix, fvec *fiy, fvec *fiz,
    fvec *fjx, fvec *fjy, fvec *fjz,
    fvec *fkx, fvec *fky, fvec *fkz,
    fvec *zeta
) {
  fvec v_1_0 = fvec(1.0);

  fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = fvec(1.0), vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(), vpbigr = v::zero(), vpbigd = fvec(1.0);
  v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr, &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
  fvec vrijinv = v::recip(vrij);
  fvec vrij_hatx = vrijinv * vdijx;
  fvec vrij_haty = vrijinv * vdijy;
  fvec vrij_hatz = vrijinv * vdijz;
  fvec rikinv = invsqrt(rsq2);
  fvec rik_hatx = rikinv * dikx;
  fvec rik_haty = rikinv * diky;
  fvec rik_hatz = rikinv * dikz;

  fvec vrik = sqrt(rsq2);
  fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) * v::recip(vrij * vrik);
  fvec vhcth = vph - vcostheta;
  fvec vdenominator = v::recip(vpd2 + vhcth * vhcth);
  fvec vgijk = vpgamma * (v_1_0 + vpc2 * v::recip(vpd2) - vpc2 * vdenominator);
  fvec vnumerator = fvec(-2.) * vpc2 * vhcth;
  fvec vgijk_d = vpgamma * vnumerator * vdenominator * vdenominator;
  fvec varg1 = vplam3 * (vrij - vrik);
  fvec varg3 = varg1 * varg1 * varg1;
  bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
  fvec varg = v::blend(mask_ex, varg1, varg3);
  fvec vex_delr = min(fvec(1.e30), exp(varg));
  fvec vex_delr_d_factor = v::blend(mask_ex, v_1_0, fvec(3.0) * varg1 * varg1);
  fvec vex_delr_d = vplam3 * vex_delr_d_factor * vex_delr;
  bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
  fvec vfccos;
  fvec vfc = v_1_0;
  fvec vfc_d = v::zero();
  if (! v::mask_testz(vmask_need_sine)) {
    fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
    vfc = v::blend(vmask_need_sine, vfc,
        fvec(0.5) * (v_1_0 - v::sincos(&vfccos, vtmp * (vrik - vpbigr))));
    vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos);
  }

  fvec vzeta_d_fc = vfc_d * vgijk * vex_delr;
  fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr;
  fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d;
  if (ZETA) *zeta = vfc * vgijk * vex_delr;

  fvec vminus_costheta = - vcostheta;
  fvec vdcosdrjx = vrijinv * fmadd(vminus_costheta, vrij_hatx, rik_hatx);
  fvec vdcosdrjy = vrijinv * fmadd(vminus_costheta, vrij_haty, rik_haty);
  fvec vdcosdrjz = vrijinv * fmadd(vminus_costheta, vrij_hatz, rik_hatz);
  fvec vdcosdrkx = rikinv * fmadd(vminus_costheta, rik_hatx, vrij_hatx);
  fvec vdcosdrky = rikinv * fmadd(vminus_costheta, rik_haty, vrij_haty);
  fvec vdcosdrkz = rikinv * fmadd(vminus_costheta, rik_hatz, vrij_hatz);
  fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
  fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
  fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);

  *fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx);
  *fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty);
  *fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz);
  *fjx = vprefactor * (vzeta_d_gijk * vdcosdrjx + vzeta_d_ex_delr * vrij_hatx);
  *fjy = vprefactor * (vzeta_d_gijk * vdcosdrjy + vzeta_d_ex_delr * vrij_haty);
  *fjz = vprefactor * (vzeta_d_gijk * vdcosdrjz + vzeta_d_ex_delr * vrij_hatz);
  *fkx = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatx + vzeta_d_gijk * vdcosdrkx);
  *fky = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_haty + vzeta_d_gijk * vdcosdrky);
  *fkz = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatz + vzeta_d_gijk * vdcosdrkz);
}

#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif

#endif