git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14386 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2015-12-19 18:30:53 +00:00
parent fb2cb218c8
commit 72ebb1dd9b
17 changed files with 1499 additions and 402 deletions

View File

@ -26,14 +26,12 @@ action () {
# do not install child files if parent does not exist
for file in *_intel.cpp; do
test $file = thr_intel.cpp && continue
dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \
sed 's/_offload_intel//g' | sed 's/_intel//g'`
action $file $dep
done
for file in *_intel.h; do
test $file = thr_intel.h && continue
dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'`
action $file $dep
done
@ -42,6 +40,8 @@ action intel_preprocess.h
action intel_buffers.h
action intel_buffers.cpp
action math_extra_intel.h
action intel_simd.h pair_sw_intel.cpp
action intel_intrinsics.h pair_tersoff_intel.cpp
# step 2: handle cases and tasks not handled in step 1.

View File

@ -3,9 +3,11 @@
LAMMPS Intel(R) Package
--------------------------------
W. Michael Brown (Intel)
michael.w.brown at intel.com
W. Michael Brown (Intel) michael.w.brown at intel.com
Rodrigo Canales (RWTH Aachen University)
Markus Höhnerbach (RWTH Aachen University)
Ahmed E. Ismail (RWTH Aachen University)
Paolo Bientinesi (RWTH Aachen University)
Anupama Kurpad (Intel)
Biswajit Mishra (Shell)
@ -53,3 +55,12 @@ By default, when running with offload to Intel(R) coprocessors, affinity
for host MPI tasks and OpenMP threads is set automatically within the code.
This currently requires the use of system calls. To disable at build time,
compile with -DINTEL_OFFLOAD_NOAFFINITY.
-----------------------------------------------------------------------------
Vector intrinsics are temporarily being used for the Stillinger-Weber
potential to allow for advanced features in the AVX512 instruction set to
be exploited on early hardware. We hope to see compiler improvements for
AVX512 that will eliminate this requirement, so it is not recommended to
develop code based on the intrinsics implementation. Please e-mail the
authors for more details.

View File

@ -7,7 +7,7 @@
package intel 1 mode mixed balance $b
package omp 0
suffix $s
processors * * * grid numa
# processors * * * grid numa
variable x index 4
variable y index 2

View File

@ -60,6 +60,8 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
int ncops = force->inumeric(FLERR,arg[3]);
_nbor_pack_width = 1;
_precision_mode = PREC_MODE_MIXED;
_offload_balance = 1.0;
_overflow_flag[LMP_OVERFLOW] = 0;
@ -307,12 +309,14 @@ void FixIntel::setup(int vflag)
/* ---------------------------------------------------------------------- */
void FixIntel::pair_init_check()
void FixIntel::pair_init_check(const bool cdmessage)
{
#ifdef INTEL_VMASK
atom->sortfreq = 1;
#endif
_nbor_pack_width = 1;
#ifdef _LMP_INTEL_OFFLOAD
if (_offload_balance != 0.0) atom->sortfreq = 1;
@ -371,15 +375,12 @@ void FixIntel::pair_init_check()
char kmode[80];
if (_precision_mode == PREC_MODE_SINGLE) {
strcpy(kmode, "single");
get_single_buffers()->free_all_nbor_buffers();
get_single_buffers()->need_tag(need_tag);
} else if (_precision_mode == PREC_MODE_MIXED) {
strcpy(kmode, "mixed");
get_mixed_buffers()->free_all_nbor_buffers();
get_mixed_buffers()->need_tag(need_tag);
} else {
strcpy(kmode, "double");
get_double_buffers()->free_all_nbor_buffers();
get_double_buffers()->need_tag(need_tag);
}
@ -399,6 +400,13 @@ void FixIntel::pair_init_check()
fprintf(screen,"Using Intel Package without Coprocessor.\n");
}
fprintf(screen,"Precision: %s\n",kmode);
if (cdmessage) {
#ifdef LMP_USE_AVXCD
fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
#else
fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
#endif
}
fprintf(screen,
"----------------------------------------------------------\n");
}

View File

@ -11,6 +11,10 @@
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#ifdef FIX_CLASS
FixStyle(INTEL,FixIntel)
@ -39,7 +43,7 @@ class FixIntel : public Fix {
virtual int setmask();
virtual void init();
virtual void setup(int);
void pair_init_check();
void pair_init_check(const bool cdmessage=false);
// Get all forces, calculation results from coprocessor
void sync_coprocessor();
@ -58,12 +62,15 @@ class FixIntel : public Fix {
inline IntelBuffers<double,double> * get_double_buffers()
{ return _double_buffers; }
inline int nbor_pack_width() const { return _nbor_pack_width; }
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
protected:
IntelBuffers<float,float> *_single_buffers;
IntelBuffers<float,double> *_mixed_buffers;
IntelBuffers<double,double> *_double_buffers;
int _precision_mode, _nthreads;
int _precision_mode, _nthreads, _nbor_pack_width;
public:
inline int* get_overflow_flag() { return _overflow_flag; }

View File

@ -343,11 +343,15 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
const int nlocal,
const int offload_end)
const int nthreads,
const int offload_end,
const int pack_width)
{
free_nbor_list();
_list_alloc_atoms = 1.10 * nlocal;
int list_alloc_size = (_list_alloc_atoms + _off_threads) * get_max_nbors();
int nt = MAX(nthreads, _off_threads);
int list_alloc_size = (_list_alloc_atoms + nt + pack_width - 1) *
get_max_nbors();
lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
#ifdef _LMP_INTEL_OFFLOAD
if (offload_end > 0) {
@ -393,6 +397,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
flt_t *ccachew = _ccachew;
int *ccachei = _ccachei;
int *ccachej = _ccachej;
#ifdef LMP_USE_AVXCD
acc_t *ccachef = _ccachef;
#endif
#ifdef _LMP_INTEL_OFFLOAD
if (_off_ccache) {
@ -409,6 +416,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
lmp->memory->destroy(ccachew);
lmp->memory->destroy(ccachei);
lmp->memory->destroy(ccachej);
#ifdef LMP_USE_AVXCD
lmp->memory->destroy(ccachef);
#endif
_ccachex = 0;
}
@ -418,7 +428,8 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
const int nthreads)
const int nthreads,
const int width)
{
#ifdef _LMP_INTEL_OFFLOAD
if (_ccachex && off_flag && _off_ccache == 0)
@ -427,7 +438,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
if (_ccachex)
return;
const int nsize = get_max_nbors();
const int nsize = get_max_nbors() * width;
int esize = MIN(sizeof(int), sizeof(flt_t));
IP_PRE_get_stride(_ccache_stride, nsize, esize, 0);
int nt = MAX(nthreads, _off_threads);
@ -439,6 +450,11 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
lmp->memory->create(_ccachew, vsize, "_ccachew");
lmp->memory->create(_ccachei, vsize, "_ccachei");
lmp->memory->create(_ccachej, vsize, "_ccachej");
#ifdef LMP_USE_AVXCD
IP_PRE_get_stride(_ccache_stride3, nsize * 3, esize, 0);
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
#endif
memset(_ccachej, 0, vsize * sizeof(int));
#ifdef _LMP_INTEL_OFFLOAD
if (off_flag) {
@ -454,7 +470,8 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
#pragma offload_transfer target(mic:_cop) \
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachei,ccachej:length(vsize) alloc_if(1) free_if(0))
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
in(ccachej:length(vsize) alloc_if(1) free_if(0))
}
_off_ccache = 1;
}

View File

@ -75,14 +75,14 @@ class IntelBuffers {
free_local();
}
inline void grow_nbor(NeighList *list, const int nlocal,
const int offload_end) {
inline void grow_nbor(NeighList *list, const int nlocal, const int nthreads,
const int offload_end, const int pack_width=1) {
grow_local(list, offload_end);
if (offload_end) {
grow_nmax();
grow_binhead();
}
grow_nbor_list(list, nlocal, offload_end);
grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
}
void free_nmax();
@ -111,7 +111,7 @@ class IntelBuffers {
}
void free_ccache();
void grow_ccache(const int off_flag, const int nthreads);
void grow_ccache(const int off_flag, const int nthreads, const int width=1);
inline int ccache_stride() { return _ccache_stride; }
inline flt_t * get_ccachex() { return _ccachex; }
inline flt_t * get_ccachey() { return _ccachey; }
@ -119,6 +119,10 @@ class IntelBuffers {
inline flt_t * get_ccachew() { return _ccachew; }
inline int * get_ccachei() { return _ccachei; }
inline int * get_ccachej() { return _ccachej; }
#ifdef LMP_USE_AVXCD
inline int ccache_stride3() { return _ccache_stride3; }
inline acc_t * get_ccachef() { return _ccachef; }
#endif
inline int get_max_nbors() {
int mn = lmp->neighbor->oneatom * sizeof(int) /
@ -129,9 +133,10 @@ class IntelBuffers {
void free_nbor_list();
inline void grow_nbor_list(NeighList *list, const int nlocal,
const int offload_end) {
const int nthreads, const int offload_end,
const int pack_width) {
if (nlocal > _list_alloc_atoms)
_grow_nbor_list(list, nlocal, offload_end);
_grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
#ifdef _LMP_INTEL_OFFLOAD
else if (offload_end > 0 && _off_map_stencil != list->stencil)
_grow_stencil(list);
@ -281,6 +286,10 @@ class IntelBuffers {
int _ccache_stride;
flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
int *_ccachei, *_ccachej;
#ifdef LMP_USE_AVXCD
int _ccache_stride3;
acc_t * _ccachef;
#endif
#ifdef _LMP_INTEL_OFFLOAD
int _separate_buffers;
@ -305,8 +314,8 @@ class IntelBuffers {
void _grow_nmax();
void _grow_local(NeighList *list, const int offload_end);
void _grow_binhead();
void _grow_nbor_list(NeighList *list, const int nlocal,
const int offload_end);
void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
const int offload_end, const int pack_width);
void _grow_stencil(NeighList *list);
};

View File

@ -55,30 +55,37 @@ enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
TIME_IMBALANCE};
#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
// Compile-time SIMD configuration, selected from the instruction-set macros
// predefined by the compiler.
// INTEL_MIC_VECTOR_WIDTH is always 16. INTEL_VECTOR_WIDTH starts at 4 and is
// redefined by every matching test below, so the last matching test wins.
// NOTE(review): the unit of these widths (presumably 32-bit lanes) is not
// visible in this header excerpt - confirm against the code that uses them.
#define INTEL_MIC_VECTOR_WIDTH 16
#define INTEL_VECTOR_WIDTH 4
// AVX: width 8
#ifdef __AVX__
#undef INTEL_VECTOR_WIDTH
#define INTEL_VECTOR_WIDTH 8
#endif
// AVX2: width stays 8 (same value as the AVX case)
#ifdef __AVX2__
#undef INTEL_VECTOR_WIDTH
#define INTEL_VECTOR_WIDTH 8
#endif
// AVX512F: width 16; also defines the INTEL_V512 and INTEL_VMASK feature flags
#ifdef __AVX512F__
#undef INTEL_VECTOR_WIDTH
#define INTEL_VECTOR_WIDTH 16
#define INTEL_V512 1
#define INTEL_VMASK 1
#else
// Without AVX512F, a __MIC__ build still gets INTEL_V512 and INTEL_VMASK,
// but INTEL_VECTOR_WIDTH is not changed in this branch.
#ifdef __MIC__
#define INTEL_V512 1
#define INTEL_VMASK 1
#endif
#endif
// LMP_USE_AVXCD is defined only when the compiler targets AVX512CD and the
// build is not an offload build (_LMP_INTEL_OFFLOAD undefined).
#ifdef __AVX512CD__
#ifndef _LMP_INTEL_OFFLOAD
#define LMP_USE_AVXCD
#endif
#endif
#define INTEL_DATA_ALIGN 64
@ -134,6 +141,18 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
datasize); \
}
// Assign the calling OpenMP thread a contiguous sub-range [ifrom, ito) of
// [0, inum) whose length is a multiple of vecsize (except where it is
// clipped at inum).
//   ifrom, ito - int lvalues, written: start / one-past-end of the range
//   tid        - int lvalue, written: omp_get_thread_num()
//   inum       - total number of items to split
//   nthreads   - number of threads sharing the work
//   vecsize    - chunk granularity; every thread gets the same multiple of it
// The per-thread length is vecsize * ceil(inum / (vecsize * nthreads)).
// The ceiling is computed with exact integer arithmetic: the former
// single-precision float division rounded large inum values and could leave
// the last items unassigned.
// For high thread ids ifrom may end up larger than ito; callers looping
// "for (i = ifrom; i < ito; i++)" then simply do no work.
#define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,                  \
                                nthreads, vecsize)                      \
  {                                                                     \
    tid = omp_get_thread_num();                                         \
    const int ip_pre_stride = (vecsize) * (nthreads);                   \
    int idelta = (inum) / ip_pre_stride;                                \
    if ((inum) % ip_pre_stride) idelta++;                               \
    idelta *= (vecsize);                                                \
    ifrom = tid * idelta;                                               \
    ito = ifrom + idelta;                                               \
    if (ito > (inum)) ito = (inum);                                     \
  }
#else
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
@ -364,6 +383,43 @@ inline double MIC_Wtime() {
} \
}
/* ----------------------------------------------------------------------
   Tally energy and virial for one dihedral acting on atoms i1, i2, i3, i4.

   eflag / eatom / vflag : accumulate global energy / per-atom energy / virial
   deng                  : energy of this dihedral
   f1*, f3*, f4*         : force components on atoms 1, 3 and 4
   vb1*, vb2*, vb3*      : bond vectors used for the virial
   oedihedral            : lvalue, global dihedral energy accumulator
   force                 : array whose elements have a .w member; a quarter
                           of deng is added to .w of each eligible atom
   newton                : if nonzero every atom counts; otherwise only
                           atoms with index < nlocal count, each with
                           weight 0.25
   nlocal                : number of owned atoms

   Not passed as arguments but taken from the expansion scope:
   the type flt_t and the virial accumulators sv0..sv5.

   The per-atom energy is written through the "force" argument (it was
   previously written to a caller variable that had to be named "f").
------------------------------------------------------------------------- */
#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,   \
                              f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,  \
                              vb3x, vb3y, vb3z,oedihedral, force,       \
                              newton, nlocal)                           \
{                                                                       \
  flt_t ev_pre;                                                         \
  if (newton) ev_pre = (flt_t)1.0;                                      \
  else {                                                                \
    ev_pre = (flt_t)0.0;                                                \
    if ((i1) < (nlocal)) ev_pre += (flt_t)0.25;                         \
    if ((i2) < (nlocal)) ev_pre += (flt_t)0.25;                         \
    if ((i3) < (nlocal)) ev_pre += (flt_t)0.25;                         \
    if ((i4) < (nlocal)) ev_pre += (flt_t)0.25;                         \
  }                                                                     \
                                                                        \
  if (eflag) {                                                          \
    oedihedral += ev_pre * (deng);                                      \
    if (eatom) {                                                        \
      flt_t qdeng = (deng) * (flt_t)0.25;                               \
      if ((newton) || (i1) < (nlocal)) (force)[i1].w += qdeng;          \
      if ((newton) || (i2) < (nlocal)) (force)[i2].w += qdeng;          \
      if ((newton) || (i3) < (nlocal)) (force)[i3].w += qdeng;          \
      if ((newton) || (i4) < (nlocal)) (force)[i4].w += qdeng;          \
    }                                                                   \
  }                                                                     \
                                                                        \
  if (vflag) {                                                          \
    sv0 += ev_pre * ((vb1x)*(f1x) + (vb2x)*(f3x) +                      \
                     ((vb3x)+(vb2x))*(f4x));                            \
    sv1 += ev_pre * ((vb1y)*(f1y) + (vb2y)*(f3y) +                      \
                     ((vb3y)+(vb2y))*(f4y));                            \
    sv2 += ev_pre * ((vb1z)*(f1z) + (vb2z)*(f3z) +                      \
                     ((vb3z)+(vb2z))*(f4z));                            \
    sv3 += ev_pre * ((vb1x)*(f1y) + (vb2x)*(f3y) +                      \
                     ((vb3x)+(vb2x))*(f4y));                            \
    sv4 += ev_pre * ((vb1x)*(f1z) + (vb2x)*(f3z) +                      \
                     ((vb3x)+(vb2x))*(f4z));                            \
    sv5 += ev_pre * ((vb1y)*(f1z) + (vb2y)*(f3z) +                      \
                     ((vb3y)+(vb2y))*(f4z));                            \
  }                                                                     \
}
#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp) \
{ \
if (evflag) { \

View File

@ -351,4 +351,128 @@
ans##_0 = (aug_3 - t) / aug_0; \
}
/* ----------------------------------------------------------------------
   rescale a quaternion to unit length, in place
   q names four scalar variables q_w, q_i, q_j, q_k
   a zero-length quaternion is not guarded against
------------------------------------------------------------------------- */
#define ME_qnormalize(q)                                                \
{                                                                       \
  const double me_qn_scale = 1.0 /                                      \
    sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k);        \
  q##_w = q##_w * me_qn_scale;                                          \
  q##_i = q##_i * me_qn_scale;                                          \
  q##_j = q##_j * me_qn_scale;                                          \
  q##_k = q##_k * me_qn_scale;                                          \
}
/* ----------------------------------------------------------------------
   compute omega from angular momentum
   w = omega = angular velocity in space frame
   wbody = angular velocity in body frame
   project space-frame angular momentum onto body axes,
     scale each body component by moments_0/1/2, rotate back to space frame

   m, quat, w are name prefixes: the macro reads m_0..m_2 and
     quat_w, quat_i, quat_j, quat_k and writes w_0..w_2
   moments_0/1/2 are MULTIPLIED into the body-frame components
     NOTE(review): callers presumably pass reciprocal principal moments;
     confirm at the call sites
   all temporaries carry a me_ prefix so that arguments with short names
     (e.g. a moment variable called i2) are not shadowed inside the macro
------------------------------------------------------------------------- */
#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w)     \
{                                                                       \
  const double me_w2 = quat##_w * quat##_w;                             \
  const double me_i2 = quat##_i * quat##_i;                             \
  const double me_j2 = quat##_j * quat##_j;                             \
  const double me_k2 = quat##_k * quat##_k;                             \
  const double me_twoij = 2.0 * quat##_i * quat##_j;                    \
  const double me_twoik = 2.0 * quat##_i * quat##_k;                    \
  const double me_twojk = 2.0 * quat##_j * quat##_k;                    \
  const double me_twoiw = 2.0 * quat##_i * quat##_w;                    \
  const double me_twojw = 2.0 * quat##_j * quat##_w;                    \
  const double me_twokw = 2.0 * quat##_k * quat##_w;                    \
                                                                        \
  /* row-major 3x3 rotation matrix built from the quaternion */         \
  const double me_rot_0 = me_w2 + me_i2 - me_j2 - me_k2;                \
  const double me_rot_1 = me_twoij - me_twokw;                          \
  const double me_rot_2 = me_twojw + me_twoik;                          \
                                                                        \
  const double me_rot_3 = me_twoij + me_twokw;                          \
  const double me_rot_4 = me_w2 - me_i2 + me_j2 - me_k2;                \
  const double me_rot_5 = me_twojk - me_twoiw;                          \
                                                                        \
  const double me_rot_6 = me_twoik - me_twojw;                          \
  const double me_rot_7 = me_twojk + me_twoiw;                          \
  const double me_rot_8 = me_w2 - me_i2 - me_j2 + me_k2;                \
                                                                        \
  /* transpose(rot) * m : angular momentum in the body frame */         \
  double me_wbody_0 = me_rot_0*m##_0 + me_rot_3*m##_1 + me_rot_6*m##_2; \
  double me_wbody_1 = me_rot_1*m##_0 + me_rot_4*m##_1 + me_rot_7*m##_2; \
  double me_wbody_2 = me_rot_2*m##_0 + me_rot_5*m##_1 + me_rot_8*m##_2; \
                                                                        \
  me_wbody_0 *= (moments_0);                                            \
  me_wbody_1 *= (moments_1);                                            \
  me_wbody_2 *= (moments_2);                                            \
                                                                        \
  /* rot * wbody : back to the space frame */                           \
  w##_0 = me_rot_0*me_wbody_0 + me_rot_1*me_wbody_1 +                   \
          me_rot_2*me_wbody_2;                                          \
  w##_1 = me_rot_3*me_wbody_0 + me_rot_4*me_wbody_1 +                   \
          me_rot_5*me_wbody_2;                                          \
  w##_2 = me_rot_6*me_wbody_0 + me_rot_7*me_wbody_1 +                   \
          me_rot_8*me_wbody_2;                                          \
}
/* ----------------------------------------------------------------------
   advance angular momentum and orientation quaternion of one body
   dtf      = factor applied to torque when updating the angular momentum
   dtq      = factor applied to the quaternion increment wq
   angmomin = 3-element array, updated in place: angmomin += dtf * torque
   quatin   = 4-element array (w,i,j,k), overwritten with the new quaternion
   torque   = 3-element array, read only
   i0,i1,i2 = forwarded unchanged as the moments arguments of ME_mq_to_omega
   steps, in order:
     1. update angmomin and copy it and quatin into local scalars
     2. omega from (angmom, quat); wq from omega and quat
     3. qfull = normalized (quat + dtq*wq)            one full step
     4. qhalf = normalized (quat + 0.5*dtq*wq)        first half step
     5. omega and wq recomputed at qhalf, then
        qhalf = normalized (qhalf + 0.5*dtq*wq)       second half step
     6. quat = normalized (2*qhalf - qfull), stored back into quatin
        (this combination of the full-step and two-half-step results is
         presumably the Richardson step the macro is named after)
   the local names angmom_*, quat_*, omega_*, qfull_*, qhalf_* must keep
     these exact spellings: ME_mq_to_omega and ME_qnormalize build the
     variable names by token pasting from the prefixes passed to them
   NOTE(review): array and scalar arguments are used unparenthesized, so
     they should be plain identifiers rather than expressions
------------------------------------------------------------------------- */
#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \
{ \
angmomin[0] += dtf * torque[0]; \
double angmom_0 = angmomin[0]; \
angmomin[1] += dtf * torque[1]; \
double angmom_1 = angmomin[1]; \
angmomin[2] += dtf * torque[2]; \
double angmom_2 = angmomin[2]; \
\
double quat_w = quatin[0]; \
double quat_i = quatin[1]; \
double quat_j = quatin[2]; \
double quat_k = quatin[3]; \
\
double omega_0, omega_1, omega_2; \
ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \
\
double wq_0, wq_1, wq_2, wq_3; \
wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \
wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \
wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \
wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \
\
double qfull_w, qfull_i, qfull_j, qfull_k; \
qfull_w = quat_w + dtq * wq_0; \
qfull_i = quat_i + dtq * wq_1; \
qfull_j = quat_j + dtq * wq_2; \
qfull_k = quat_k + dtq * wq_3; \
ME_qnormalize(qfull); \
\
double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \
qhalf_w = quat_w + 0.5*dtq * wq_0; \
qhalf_i = quat_i + 0.5*dtq * wq_1; \
qhalf_j = quat_j + 0.5*dtq * wq_2; \
qhalf_k = quat_k + 0.5*dtq * wq_3; \
ME_qnormalize(qhalf); \
\
ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \
wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \
wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \
wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \
wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \
\
qhalf_w += 0.5*dtq * wq_0; \
qhalf_i += 0.5*dtq * wq_1; \
qhalf_j += 0.5*dtq * wq_2; \
qhalf_k += 0.5*dtq * wq_3; \
ME_qnormalize(qhalf); \
\
quat_w = 2.0*qhalf_w - qfull_w; \
quat_i = 2.0*qhalf_i - qfull_i; \
quat_j = 2.0*qhalf_j - qfull_j; \
quat_k = 2.0*qhalf_k - qfull_k; \
ME_qnormalize(quat); \
\
quatin[0] = quat_w; \
quatin[1] = quat_i; \
quatin[2] = quat_j; \
quatin[3] = quat_k; \
}
#endif

View File

@ -15,6 +15,8 @@
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
//#define OUTER_CHUNK 1
#include "neighbor.h"
#include "neigh_list.h"
#include "atom.h"
@ -26,6 +28,14 @@
#include <omp.h>
#endif
#ifdef LMP_USE_AVXCD
#include "intel_simd.h"
#endif
#ifdef OUTER_CHUNK
#include "intel_simd.h"
#endif
using namespace LAMMPS_NS;
#ifdef _LMP_INTEL_OFFLOAD
@ -42,17 +52,11 @@ using namespace LAMMPS_NS;
for (int s = 0; s < n3; s++) { \
if (sptr[s] == tag) { \
if (s < n1) { \
if (special_flag[1] == 0) which = -1; \
else if (special_flag[1] == 1) which = 0; \
else which = 1; \
which = 1; \
} else if (s < n2) { \
if (special_flag[2] == 0) which = -1; \
else if (special_flag[2] == 1) which = 0; \
else which = 2; \
which = 2; \
} else { \
if (special_flag[3] == 0) which = -1; \
else if (special_flag[3] == 1) which = 0; \
else which = 3; \
which = 3; \
} \
} \
} \
@ -199,7 +203,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
if (offload) {
fix->start_watch(TIME_PACK);
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
buffers->grow_nbor(list, atom->nlocal, aend);
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
ATOM_T biga;
biga.x = INTEL_BIGP;
@ -335,7 +339,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
signal(tag)
#endif
{
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
@ -494,12 +498,12 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
if (j >= nlocal) {
if (j == nall)
jlist[jj] = nall_offset;
else if (which > 0)
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
if (which) jlist[jj] = j ^ (which << SBBITS);
}
}
}
@ -520,7 +524,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
}
#endif
} // end omp
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
@ -688,7 +692,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
if (offload) {
fix->start_watch(TIME_PACK);
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
buffers->grow_nbor(list, atom->nlocal, aend);
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
ATOM_T biga;
biga.x = INTEL_BIGP;
@ -827,7 +831,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
signal(tag)
#endif
{
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
@ -848,33 +852,47 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
const int num = aend - astart;
int tid, ifrom, ito;
#ifdef OUTER_CHUNK
const int swidth = ip_simd::SIMD_type<flt_t>::width();
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth);
ifrom += astart;
ito += astart;
int e_ito = ito;
if (ito == num) {
int imod = ito % swidth;
if (imod) e_ito += swidth - e_ito;
}
const int list_size = (e_ito + tid + 1) * maxnbors;
#else
const int swidth = 1;
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
ifrom += astart;
ito += astart;
const int list_size = (ito + tid + 1) * maxnbors;
#endif
int which;
const int list_size = (ito + tid + 1) * maxnbors;
int pack_offset = maxnbors * swidth;
int ct = (ifrom + tid) * maxnbors;
int *neighptr = firstneigh + ct;
int max_chunk = 0;
int lane = 0;
for (int i = ifrom; i < ito; i++) {
int j, k, n, n2, itype, jtype, ibin;
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
n = 0;
n2 = maxnbors;
xtmp = x[i].x;
ytmp = x[i].y;
ztmp = x[i].z;
itype = x[i].w;
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
const int itype = x[i].w;
const int ioffset = ntypes * itype;
// loop over rest of atoms in i's bin, ghosts are at end of linked list
// if j is owned atom, store it, since j is beyond i in linked list
// if j is ghost, only store if j coords are "above/to the right" of i
for (j = bins[i]; j >= 0; j = bins[j]) {
int raw_count = pack_offset;
for (int j = bins[i]; j >= 0; j = bins[j]) {
if (j >= nlocal) {
if (offload_noghost && offload) continue;
if (x[j].z < ztmp) continue;
@ -884,116 +902,145 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
}
} else if (offload_noghost && i < offload_end) continue;
jtype = x[j].w;
#ifndef _LMP_INTEL_OFFLOAD
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
if (exclude) {
const int jtype = x[j].w;
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
}
#endif
delx = xtmp - x[j].x;
dely = ytmp - x[j].y;
delz = ztmp - x[j].z;
rsq = delx * delx + dely * dely + delz * delz;
neighptr[raw_count++] = j;
}
if (rsq <= cutneighsq[ioffset + jtype]) {
if (j < nlocal) {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[n++] = -j - 1;
else
neighptr[n++] = j;
} else
neighptr[n++] = j;
#ifdef _LMP_INTEL_OFFLOAD
if (j < lmin) lmin = j;
if (j > lmax) lmax = j;
#endif
} else {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[n2++] = -j - 1;
else
neighptr[n2++] = j;
} else
neighptr[n2++] = j;
#ifdef _LMP_INTEL_OFFLOAD
if (j < gmin) gmin = j;
if (j > gmax) gmax = j;
#endif
}
}
}
// loop over all atoms in other bins in stencil, store every pair
ibin = atombin[i];
for (k = 0; k < nstencil; k++) {
for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
const int ibin = atombin[i];
for (int k = 0; k < nstencil; k++) {
for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
if (offload_noghost) {
if (j < nlocal) {
if (i < offload_end) continue;
} else if (offload) continue;
}
jtype = x[j].w;
#ifndef _LMP_INTEL_OFFLOAD
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
if (exclude) {
const int jtype = x[j].w;
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
}
#endif
delx = xtmp - x[j].x;
dely = ytmp - x[j].y;
delz = ztmp - x[j].z;
rsq = delx * delx + dely * dely + delz * delz;
if (rsq <= cutneighsq[ioffset + jtype]) {
if (j < nlocal) {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[n++] = -j - 1;
else
neighptr[n++] = j;
} else
neighptr[n++] = j;
#ifdef _LMP_INTEL_OFFLOAD
if (j < lmin) lmin = j;
if (j > lmax) lmax = j;
#endif
} else {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[n2++] = -j - 1;
else
neighptr[n2++] = j;
} else
neighptr[n2++] = j;
#ifdef _LMP_INTEL_OFFLOAD
if (j < gmin) gmin = j;
if (j > gmax) gmax = j;
#endif
}
}
}
}
ilist[i] = i;
cnumneigh[i] = ct;
if (n > maxnbors) *overflow = 1;
for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
while( (n % pad_width) != 0 ) neighptr[n++] = e_nall;
numneigh[i] = n;
while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
ct += n;
neighptr += n;
if (ct + n + maxnbors > list_size) {
*overflow = 1;
ct = (ifrom + tid) * maxnbors;
neighptr[raw_count++] = j;
}
}
#if defined(LMP_SIMD_COMPILER)
#ifdef _LMP_INTEL_OFFLOAD
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#else
#pragma vector aligned
#pragma simd
#endif
#endif
for (int u = pack_offset; u < raw_count; u++) {
int j = neighptr[u];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq > cutneighsq[ioffset + jtype])
neighptr[u] = e_nall;
else {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[u] = -j - 1;
}
#ifdef _LMP_INTEL_OFFLOAD
if (j < nlocal) {
if (j < vlmin) vlmin = j;
if (j > vlmax) vlmax = j;
} else {
if (j < vgmin) vgmin = j;
if (j > vgmax) vgmax = j;
}
#endif
}
}
#ifdef _LMP_INTEL_OFFLOAD
lmin = MIN(lmin,vlmin);
gmin = MIN(gmin,vgmin);
lmax = MAX(lmax,vlmax);
gmax = MAX(gmax,vgmax);
#endif
int n = lane, n2 = pack_offset;
for (int u = pack_offset; u < raw_count; u++) {
const int j = neighptr[u];
int pj = j;
if (pj < e_nall) {
if (need_ic)
if (pj < 0) pj = -pj - 1;
if (pj < nlocal) {
neighptr[n] = j;
n += swidth;
} else
neighptr[n2++] = j;
}
}
int ns = (n - lane) / swidth;
if (ns > maxnbors || n2 > list_size) *overflow = 1;
for (int u = pack_offset; u < n2; u++) {
neighptr[n] = neighptr[u];
n += swidth;
}
ilist[i] = i;
cnumneigh[i] = ct + lane;
ns += n2 - pack_offset;
#ifndef OUTER_CHUNK
while( (ns % pad_width) != 0 ) neighptr[ns++] = e_nall;
#endif
numneigh[i] = ns;
#ifdef OUTER_CHUNK
if (ns > max_chunk) max_chunk = ns;
lane++;
pack_offset -= maxnbors;
if (lane == swidth) {
ct += max_chunk * swidth;
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
const int edge = (ct % alignb);
if (edge) ct += alignb - edge;
neighptr = firstneigh + ct;
max_chunk = 0;
pack_offset = maxnbors * swidth;
lane = 0;
if (ct + pack_offset + maxnbors > list_size) {
if (i < ito - 1) {
*overflow = 1;
ct = (ifrom + tid) * maxnbors;
}
}
}
#else
ct += ns;
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
const int edge = (ct % alignb);
if (edge) ct += alignb - edge;
neighptr = firstneigh + ct;
if (ct + pack_offset + maxnbors > list_size) {
if (i < ito - 1) {
*overflow = 1;
ct = (ifrom + tid) * maxnbors;
}
}
#endif
}
if (*overflow == 1)
@ -1032,7 +1079,16 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
#ifndef OUTER_CHUNK
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = 0; jj < jnum; jj++) {
#else
const int trip = jnum * swidth;
for (int jj = 0; jj < trip; jj+= swidth) {
#endif
const int j = jlist[jj];
if (need_ic && j < 0) {
which = 0;
@ -1044,12 +1100,12 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
if (j >= nlocal) {
if (j == e_nall)
jlist[jj] = nall_offset;
else if (which > 0)
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
if (which) jlist[jj] = j ^ (which << SBBITS);
}
}
}
@ -1070,7 +1126,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
}
#endif
} // end omp
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
@ -1238,7 +1294,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
if (offload) {
fix->start_watch(TIME_PACK);
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
buffers->grow_nbor(list, atom->nlocal, aend);
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
ATOM_T biga;
biga.x = INTEL_BIGP;
@ -1377,7 +1433,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
signal(tag)
#endif
{
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
@ -1550,12 +1606,12 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
if (j >= nlocal) {
if (j == e_nall)
jlist[jj] = nall_offset;
else if (which > 0)
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
if (which) jlist[jj] = j ^ (which << SBBITS);
}
}
}
@ -1576,7 +1632,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
}
#endif
} // end omp
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
@ -1741,10 +1797,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
const int nall = atom->nlocal + atom->nghost;
int pad = 1;
const int pack_width = fix->nbor_pack_width();
if (offload) {
fix->start_watch(TIME_PACK);
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
buffers->grow_nbor(list, atom->nlocal, aend);
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend, pack_width);
ATOM_T biga;
biga.x = INTEL_BIGP;
@ -1871,7 +1929,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
in(atombin:length(aend) alloc_if(0) free_if(0)) \
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
in(special_flag:length(0) alloc_if(0) free_if(0)) \
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width) \
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
@ -1879,7 +1937,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
signal(tag)
#endif
{
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
@ -1900,36 +1958,40 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
const int num = aend - astart;
int tid, ifrom, ito;
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
ifrom += astart;
ito += astart;
int e_ito = ito;
if (ito == num) {
int imod = ito % pack_width;
if (imod) e_ito += pack_width - e_ito;
}
const int list_size = (e_ito + tid + 1) * maxnbors;
int which;
const int list_size = (ito + tid + 1) * maxnbors;
int pack_offset = maxnbors * pack_width;
int ct = (ifrom + tid) * maxnbors;
int *neighptr = firstneigh + ct;
int max_chunk = 0;
int lane = 0;
for (int i = ifrom; i < ito; i++) {
int j, k, n, n2, itype, jtype, ibin;
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
n = 0;
n2 = maxnbors;
xtmp = x[i].x;
ytmp = x[i].y;
ztmp = x[i].z;
itype = x[i].w;
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
const int itype = x[i].w;
const tagint itag = tag[i];
const int ioffset = ntypes * itype;
const int ibin = atombin[i];
int raw_count = pack_offset;
// loop over all atoms in surrounding bins in stencil including self
// skip i = j
ibin = atombin[i];
for (k = 0; k < nstencil; k++) {
for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
for (int k = 0; k < nstencil; k++) {
for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
if (i == j) continue;
if (offload_noghost) {
@ -1938,76 +2000,121 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
} else if (offload) continue;
}
jtype = x[j].w;
#ifndef _LMP_INTEL_OFFLOAD
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
#endif
delx = xtmp - x[j].x;
dely = ytmp - x[j].y;
delz = ztmp - x[j].z;
rsq = delx * delx + dely * dely + delz * delz;
if (rsq <= cutneighsq[ioffset + jtype]) {
const int jtag = tag[j];
int flist = 0;
if (itag > jtag) {
if ((itag+jtag) % 2 == 0) flist = 1;
} else if (itag < jtag) {
if ((itag+jtag) % 2 == 1) flist = 1;
} else {
if (x[j].z < ztmp) flist = 1;
else if (x[j].z == ztmp && x[j].y < ytmp) flist = 1;
else if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp)
flist = 1;
}
if (flist) {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[n2++] = -j - 1;
else
neighptr[n2++] = j;
} else
neighptr[n2++] = j;
} else {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[n++] = -j - 1;
else
neighptr[n++] = j;
} else
neighptr[n++] = j;
}
#ifdef _LMP_INTEL_OFFLOAD
if (j < nlocal) {
if (j < lmin) lmin = j;
if (j > lmax) lmax = j;
} else {
if (j < gmin) gmin = j;
if (j > gmax) gmax = j;
}
#endif
if (exclude) {
const int jtype = x[j].w;
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
}
#endif
neighptr[raw_count++] = j;
}
}
ilist[i] = i;
cnumneigh[i] = ct;
if (n > maxnbors) *overflow = 1;
atombin[i] = n;
for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
numneigh[i] = n;
while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
ct += n;
neighptr += n;
if (ct + n + maxnbors > list_size) {
*overflow = 1;
ct = (ifrom + tid) * maxnbors;
#if defined(LMP_SIMD_COMPILER)
#ifdef _LMP_INTEL_OFFLOAD
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#else
#pragma vector aligned
#pragma simd
#endif
#endif
// First pass over the raw candidates collected for atom i (slots
// pack_offset .. raw_count-1 of neighptr): apply the type-pair cutoff and,
// when need_ic is set, the minimum-image check.
for (int u = pack_offset; u < raw_count; u++) {
int j = neighptr[u];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
const flt_t rsq = delx * delx + dely * dely + delz * delz;
// Candidates beyond the cutoff are overwritten with e_nall; the
// partitioning loop that follows only keeps entries that are < e_nall.
if (rsq > cutneighsq[ioffset + jtype])
neighptr[u] = e_nall;
else {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
// A set no_special flag is recorded by storing the index as -j - 1.
if (no_special)
neighptr[u] = -j - 1;
}
#ifdef _LMP_INTEL_OFFLOAD
// Track the smallest/largest kept index, separately for j < nlocal and
// j >= nlocal, in the v* variables named in the simd reduction clause.
if (j < nlocal) {
if (j < vlmin) vlmin = j;
if (j > vlmax) vlmax = j;
} else {
if (j < vgmin) vgmin = j;
if (j > vgmax) vgmax = j;
}
#endif
}
}
#ifdef _LMP_INTEL_OFFLOAD
lmin = MIN(lmin,vlmin);
gmin = MIN(gmin,vgmin);
lmax = MAX(lmax,vlmax);
gmax = MAX(gmax,vgmax);
#endif
// Second pass: split the surviving candidates of atom i into two groups.
// n walks the interleaved slots (starts at lane, advances by pack_width);
// n2 walks a contiguous run that starts at pack_offset.
int n = lane, n2 = pack_offset;
for (int u = pack_offset; u < raw_count; u++) {
const int j = neighptr[u];
int pj = j;
// Entries that are not below e_nall (e.g. those set to e_nall by the
// cutoff pass) are dropped.
if (pj < e_nall) {
// With need_ic a negative entry encodes the index as -pj - 1; decode it
// before using it to look up tag[] and x[].
if (need_ic)
if (pj < 0) pj = -pj - 1;
const int jtag = tag[pj];
// Choose the group: for differing tags by the parity of itag + jtag,
// for equal tags by comparing coordinates (z first, then y, then x).
int flist = 0;
if (itag > jtag) {
if ((itag+jtag) % 2 == 0) flist = 1;
} else if (itag < jtag) {
if ((itag+jtag) % 2 == 1) flist = 1;
} else {
if (x[pj].z < ztmp) flist = 1;
else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1;
else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp)
flist = 1;
}
// The stored value is j (still in its encoded form), not the decoded pj.
if (flist) {
neighptr[n2++] = j;
} else {
neighptr[n] = j;
n += pack_width;
}
}
}
int ns = (n - lane) / pack_width;
if (ns > maxnbors || n2 > list_size) *overflow = 1;
atombin[i] = ns;
for (int u = pack_offset; u < n2; u++) {
neighptr[n] = neighptr[u];
n += pack_width;
}
ilist[i] = i;
cnumneigh[i] = ct + lane;
ns += n2 - pack_offset;
numneigh[i] = ns;
if (ns > max_chunk) max_chunk = ns;
lane++;
pack_offset -= maxnbors;
if (lane == pack_width) {
ct += max_chunk * pack_width;
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
const int edge = (ct % alignb);
if (edge) ct += alignb - edge;
neighptr = firstneigh + ct;
max_chunk = 0;
pack_offset = maxnbors * pack_width;
lane = 0;
if (ct + pack_offset + maxnbors > list_size) {
if (i < ito - 1) {
*overflow = 1;
ct = (ifrom + tid) * maxnbors;
}
}
}
}
if (*overflow == 1)
@ -2046,7 +2153,9 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
for (int jj = 0; jj < jnum; jj++) {
const int trip = jnum * pack_width;
for (int jj = 0; jj < trip; jj+=pack_width) {
const int j = jlist[jj];
if (need_ic && j < 0) {
which = 0;
@ -2058,12 +2167,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
if (j >= nlocal) {
if (j == e_nall)
jlist[jj] = nall_offset;
else if (which > 0)
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
if (which) jlist[jj] = j ^ (which << SBBITS);
}
}
}
@ -2083,7 +2192,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
}
#endif
} // end omp
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
@ -2113,3 +2222,4 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
#endif
}
}

View File

@ -12,9 +12,17 @@
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include <math.h>
#include "pair_gayberne_intel.h"
#include "math_extra_intel.h"
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
#include <cmath>
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#include "atom.h"
#include "comm.h"
#include "atom_vec_ellipsoid.h"
@ -295,7 +303,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
signal(f_start)
#endif
{
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute=MIC_Wtime();
#endif
@ -335,8 +343,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
oevdwl = (acc_t)0.0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
}
// loop over neighbors of my atoms
@ -394,8 +402,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
if (EVFLAG) {
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
bool multiple_forms = false;
@ -485,14 +493,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
tempv_1 = kappa_1 * inv_r;
tempv_2 = kappa_2 * inv_r;
flt_t sigma12 = ME_dot3(r12hat, tempv);
sigma12 = pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
flt_t h12 = r - sigma12;
// energy
// compute u_r
flt_t varrho = sigma / (h12 + gamma * sigma);
flt_t varrho6 = pow(varrho, (flt_t)6.0);
flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
flt_t varrho12 = varrho6 * varrho6;
flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
@ -500,7 +508,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
flt_t det_g12 = ME_det3(g12);
eta = pow(eta / det_g12, upsilon);
eta = std::pow(eta / det_g12, upsilon);
// compute chi_12
@ -516,7 +524,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
tempv_1 = iota_1 * inv_r;
tempv_2 = iota_2 * inv_r;
flt_t chi = ME_dot3(r12hat, tempv);
chi = pow(chi * (flt_t)2.0, mu);
chi = std::pow(chi * (flt_t)2.0, mu);
// force
// compute dUr/dr
@ -524,7 +532,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
sigma;
temp1 = temp1 * (flt_t)24.0 * epsilon;
flt_t u_slj = temp1 * pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
flt_t dUr_0, dUr_1, dUr_2;
temp2 = ME_dot3(kappa, r12hat);
flt_t uslj_rsq = u_slj / rsq_form[jj];
@ -536,8 +544,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
flt_t dchi_0, dchi_1, dchi_2;
temp1 = ME_dot3(iota, r12hat);
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
pow(chi, (mu - (flt_t)1.0) / mu);
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
std::pow(chi, (mu - (flt_t)1.0) / mu);
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
@ -714,7 +722,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
}
if (EVFLAG) {
flt_t ev_pre = (flt_t)0;
flt_t ev_pre = (flt_t)0.0;
if (NEWTON_PAIR || i < nlocal)
ev_pre += (flt_t)0.5;
if (NEWTON_PAIR || j < nlocal)
@ -863,7 +871,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
}
}
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // offload

View File

@ -217,7 +217,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
ITABLE_IN signal(f_start)
#endif
{
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
@ -459,7 +459,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
ev_global[7] = ov5;
}
}
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end of offload region

View File

@ -212,7 +212,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
ITABLE_IN signal(f_start)
#endif
{
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
@ -263,7 +263,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
}
#if defined(__INTEL_COMPILER)
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
@ -283,7 +283,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
const flt_t r2inv = (flt_t)1.0 / rsq;
#ifdef __MIC__
#ifdef INTEL_VMASK
if (rsq < c_forcei[jtype].cutsq) {
#endif
#ifdef INTEL_ALLOW_TABLE
@ -335,11 +335,11 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
}
}
#endif
#ifdef __MIC__
#ifdef INTEL_VMASK
}
#endif
#ifdef __MIC__
#ifdef INTEL_VMASK
if (rsq < c_forcei[jtype].cut_ljsq) {
#endif
flt_t r6inv = r2inv * r2inv * r2inv;
@ -354,7 +354,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
forcelj *= factor_lj;
if (EFLAG) evdwl *= factor_lj;
}
#ifdef __MIC__
#ifdef INTEL_VMASK
}
#else
if (rsq > c_forcei[jtype].cutsq)
@ -363,7 +363,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
{ forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
#endif
#ifdef __MIC__
#ifdef INTEL_VMASK
if (rsq < c_forcei[jtype].cutsq) {
#endif
const flt_t fpair = (forcecoul + forcelj) * r2inv;
@ -395,7 +395,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
}
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
}
#ifdef __MIC__
#ifdef INTEL_VMASK
}
#endif
} // for jj
@ -426,7 +426,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
ev_global[7] = ov5;
}
}
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end of offload region

View File

@ -88,39 +88,73 @@ void PairLJCutIntel::compute(int eflag, int vflag,
fix->stop_watch(TIME_PACK);
}
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (eflag) {
if (force->newton_pair) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
if (_onetype) {
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (eflag) {
if (force->newton_pair) {
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
if (force->newton_pair) {
eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (force->newton_pair) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end);
eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum);
} else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end);
eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum);
}
}
} else {
if (force->newton_pair) {
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (eflag) {
if (force->newton_pair) {
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
if (force->newton_pair) {
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
} else {
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
}
}
}
}
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
class acc_t>
void PairLJCutIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
@ -159,7 +193,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
const int nthreads = tc;
int *overflow = fix->get_off_overflow_flag();
{
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
@ -187,12 +221,25 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
// Pair coefficients used by the neighbor loop. When ONETYPE is set they are
// loaded once here; otherwise they are reloaded per neighbor from the
// per-type tables inside the loop.
// NOTE(review): index 3 presumably selects the (type 1, type 1) entry of a
// table with row stride 2 (ntypes + 1 with a single atom type) -- confirm.
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
if (ONETYPE) {
cutsq = ljc12o[3].cutsq;
lj1 = ljc12o[3].lj1;
lj2 = ljc12o[3].lj2;
lj3 = lj34[3].lj3;
lj4 = lj34[3].lj4;
offset = ljc12o[3].offset;
}
for (int i = iifrom; i < iito; ++i) {
const int itype = x[i].w;
const int ptr_off = itype * ntypes;
const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off;
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
int itype, ptr_off;
const FC_PACKED1_T * _noalias ljc12oi;
const FC_PACKED2_T * _noalias lj34i;
if (!ONETYPE) {
itype = x[i].w;
ptr_off = itype * ntypes;
ljc12oi = ljc12o + ptr_off;
lj34i = lj34 + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
@ -218,25 +265,42 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
flt_t forcelj, evdwl;
forcelj = evdwl = (flt_t)0.0;
const int sbindex = jlist[jj] >> SBBITS & 3;
const int j = jlist[jj] & NEIGHMASK;
int j, jtype, sbindex;
if (!ONETYPE) {
sbindex = jlist[jj] >> SBBITS & 3;
j = jlist[jj] & NEIGHMASK;
} else
j = jlist[jj];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
if (!ONETYPE) {
jtype = x[j].w;
cutsq = ljc12oi[jtype].cutsq;
}
const flt_t rsq = delx * delx + dely * dely + delz * delz;
#ifdef INTEL_VMASK
if (rsq < ljc12oi[jtype].cutsq) {
if (rsq < cutsq) {
#endif
flt_t factor_lj = special_lj[sbindex];
flt_t factor_lj;
if (!ONETYPE) factor_lj = special_lj[sbindex];
flt_t r2inv = 1.0 / rsq;
flt_t r6inv = r2inv * r2inv * r2inv;
#ifndef INTEL_VMASK
if (rsq > ljc12oi[jtype].cutsq) r6inv = (flt_t)0.0;
if (rsq > cutsq) r6inv = (flt_t)0.0;
#endif
forcelj = r6inv * (ljc12oi[jtype].lj1 * r6inv - ljc12oi[jtype].lj2);
flt_t fpair = factor_lj * forcelj * r2inv;
if (!ONETYPE) {
lj1 = ljc12oi[jtype].lj1;
lj2 = ljc12oi[jtype].lj2;
}
forcelj = r6inv * (lj1 * r6inv - lj2);
flt_t fpair;
if (!ONETYPE)
fpair = factor_lj * forcelj * r2inv;
else
fpair = forcelj * r2inv;
fxtmp += delx * fpair;
fytmp += dely * fpair;
@ -255,9 +319,13 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
ev_pre += (flt_t)0.5;
if (EFLAG) {
evdwl = r6inv * (lj34i[jtype].lj3 * r6inv-lj34i[jtype].lj4) -
ljc12oi[jtype].offset;
evdwl *= factor_lj;
if (!ONETYPE) {
lj3 = lj34i[jtype].lj3;
lj4 = lj34i[jtype].lj4;
offset = ljc12oi[jtype].offset;
}
evdwl = r6inv * (lj3 * r6inv - lj4) - offset;
if (!ONETYPE) evdwl *= factor_lj;
sevdwl += ev_pre*evdwl;
if (eatom) {
if (NEWTON_PAIR || i < nlocal)
@ -302,7 +370,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
ev_global[7] = ov5;
}
}
#ifdef __MIC__
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
@ -352,6 +420,9 @@ template <class flt_t, class acc_t>
void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
// _onetype is set to 1 only when there is exactly one atom type and
// atom->molecular is zero; compute() tests it to pick the ONETYPE
// instantiations of eval().
_onetype = 0;
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,memory,_cop);
buffers->set_ntypes(tp1);

View File

@ -39,13 +39,14 @@ class PairLJCutIntel : public PairLJCut {
private:
FixIntel *fix;
int _cop;
int _cop, _onetype;
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
class acc_t>
void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend);

File diff suppressed because it is too large Load Diff

View File

@ -11,6 +11,10 @@
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(sw/intel,PairSWIntel)
@ -42,7 +46,7 @@ class PairSWIntel : public PairSW {
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int SPQ, int EVFLAG, int EFLAG, class flt_t, class acc_t>
template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
const int astart, const int aend, const int pad_width);
@ -51,7 +55,10 @@ class PairSWIntel : public PairSW {
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
int _ccache_stride, _host_pad, _offload_pad, _spq;
int _ccache_stride, _host_pad, _offload_pad, _spq, _onetype;
#ifdef LMP_USE_AVXCD
int _ccache_stride3;
#endif
// ----------------------------------------------------------------------
@ -62,8 +69,11 @@ class PairSWIntel : public PairSW {
flt_t cutsq, cut, sigma_gamma, pad;
} fc_packed0;
typedef struct {
flt_t powerp, powerq, cut, sigma, c1, c2, c3, c4;
flt_t powerp, powerq, cut, sigma;
} fc_packed1;
typedef struct {
flt_t c1, c2, c3, c4;
} fc_packed1p2;
typedef struct {
flt_t c5, c6;
} fc_packed2;
@ -73,6 +83,7 @@ class PairSWIntel : public PairSW {
fc_packed0 **p2;
fc_packed1 **p2f;
fc_packed1p2 **p2f2;
fc_packed2 **p2e;
fc_packed3 ***p3;