git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14386 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -26,14 +26,12 @@ action () {
|
||||
# do not install child files if parent does not exist
|
||||
|
||||
for file in *_intel.cpp; do
|
||||
test $file = thr_intel.cpp && continue
|
||||
dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \
|
||||
sed 's/_offload_intel//g' | sed 's/_intel//g'`
|
||||
action $file $dep
|
||||
done
|
||||
|
||||
for file in *_intel.h; do
|
||||
test $file = thr_intel.h && continue
|
||||
dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'`
|
||||
action $file $dep
|
||||
done
|
||||
@ -42,6 +40,8 @@ action intel_preprocess.h
|
||||
action intel_buffers.h
|
||||
action intel_buffers.cpp
|
||||
action math_extra_intel.h
|
||||
action intel_simd.h pair_sw_intel.cpp
|
||||
action intel_intrinsics.h pair_tersoff_intel.cpp
|
||||
|
||||
# step 2: handle cases and tasks not handled in step 1.
|
||||
|
||||
|
||||
@ -3,9 +3,11 @@
|
||||
LAMMPS Intel(R) Package
|
||||
--------------------------------
|
||||
|
||||
W. Michael Brown (Intel)
|
||||
michael.w.brown at intel.com
|
||||
|
||||
W. Michael Brown (Intel) michael.w.brown at intel.com
|
||||
Rodrigo Canales (RWTH Aachen University)
|
||||
Markus Höhnerbach (RWTH Aachen University)
|
||||
Ahmed E. Ismail (RWTH Aachen University)
|
||||
Paolo Bientinesi (RWTH Aachen University)
|
||||
Anupama Kurpad (Intel)
|
||||
Biswajit Mishra (Shell)
|
||||
|
||||
@ -53,3 +55,12 @@ By default, when running with offload to Intel(R) coprocessors, affinity
|
||||
for host MPI tasks and OpenMP threads is set automatically within the code.
|
||||
This currently requires the use of system calls. To disable at build time,
|
||||
compile with -DINTEL_OFFLOAD_NOAFFINITY.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
Vector intrinsics are temporarily being used for the Stillinger-Weber
|
||||
potential to allow for advanced features in the AVX512 instruction set to
|
||||
be exploited on early hardware. We hope to see compiler improvements for
|
||||
AVX512 that will eliminate this requirement, so it is not recommended to
|
||||
develop code based on the intrinsics implementation. Please e-mail the
|
||||
authors for more details.
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
package intel 1 mode mixed balance $b
|
||||
package omp 0
|
||||
suffix $s
|
||||
processors * * * grid numa
|
||||
# processors * * * grid numa
|
||||
|
||||
variable x index 4
|
||||
variable y index 2
|
||||
|
||||
@ -60,6 +60,8 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
||||
|
||||
int ncops = force->inumeric(FLERR,arg[3]);
|
||||
|
||||
_nbor_pack_width = 1;
|
||||
|
||||
_precision_mode = PREC_MODE_MIXED;
|
||||
_offload_balance = 1.0;
|
||||
_overflow_flag[LMP_OVERFLOW] = 0;
|
||||
@ -307,12 +309,14 @@ void FixIntel::setup(int vflag)
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void FixIntel::pair_init_check()
|
||||
void FixIntel::pair_init_check(const bool cdmessage)
|
||||
{
|
||||
#ifdef INTEL_VMASK
|
||||
atom->sortfreq = 1;
|
||||
#endif
|
||||
|
||||
_nbor_pack_width = 1;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_offload_balance != 0.0) atom->sortfreq = 1;
|
||||
|
||||
@ -371,15 +375,12 @@ void FixIntel::pair_init_check()
|
||||
char kmode[80];
|
||||
if (_precision_mode == PREC_MODE_SINGLE) {
|
||||
strcpy(kmode, "single");
|
||||
get_single_buffers()->free_all_nbor_buffers();
|
||||
get_single_buffers()->need_tag(need_tag);
|
||||
} else if (_precision_mode == PREC_MODE_MIXED) {
|
||||
strcpy(kmode, "mixed");
|
||||
get_mixed_buffers()->free_all_nbor_buffers();
|
||||
get_mixed_buffers()->need_tag(need_tag);
|
||||
} else {
|
||||
strcpy(kmode, "double");
|
||||
get_double_buffers()->free_all_nbor_buffers();
|
||||
get_double_buffers()->need_tag(need_tag);
|
||||
}
|
||||
|
||||
@ -399,6 +400,13 @@ void FixIntel::pair_init_check()
|
||||
fprintf(screen,"Using Intel Package without Coprocessor.\n");
|
||||
}
|
||||
fprintf(screen,"Precision: %s\n",kmode);
|
||||
if (cdmessage) {
|
||||
#ifdef LMP_USE_AVXCD
|
||||
fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
|
||||
#else
|
||||
fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
|
||||
#endif
|
||||
}
|
||||
fprintf(screen,
|
||||
"----------------------------------------------------------\n");
|
||||
}
|
||||
|
||||
@ -11,6 +11,10 @@
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef FIX_CLASS
|
||||
|
||||
FixStyle(INTEL,FixIntel)
|
||||
@ -39,7 +43,7 @@ class FixIntel : public Fix {
|
||||
virtual int setmask();
|
||||
virtual void init();
|
||||
virtual void setup(int);
|
||||
void pair_init_check();
|
||||
void pair_init_check(const bool cdmessage=false);
|
||||
|
||||
// Get all forces, calculation results from coprocessor
|
||||
void sync_coprocessor();
|
||||
@ -58,12 +62,15 @@ class FixIntel : public Fix {
|
||||
inline IntelBuffers<double,double> * get_double_buffers()
|
||||
{ return _double_buffers; }
|
||||
|
||||
inline int nbor_pack_width() const { return _nbor_pack_width; }
|
||||
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
|
||||
|
||||
protected:
|
||||
IntelBuffers<float,float> *_single_buffers;
|
||||
IntelBuffers<float,double> *_mixed_buffers;
|
||||
IntelBuffers<double,double> *_double_buffers;
|
||||
|
||||
int _precision_mode, _nthreads;
|
||||
int _precision_mode, _nthreads, _nbor_pack_width;
|
||||
|
||||
public:
|
||||
inline int* get_overflow_flag() { return _overflow_flag; }
|
||||
|
||||
@ -343,11 +343,15 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
|
||||
template <class flt_t, class acc_t>
|
||||
void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
|
||||
const int nlocal,
|
||||
const int offload_end)
|
||||
const int nthreads,
|
||||
const int offload_end,
|
||||
const int pack_width)
|
||||
{
|
||||
free_nbor_list();
|
||||
_list_alloc_atoms = 1.10 * nlocal;
|
||||
int list_alloc_size = (_list_alloc_atoms + _off_threads) * get_max_nbors();
|
||||
int nt = MAX(nthreads, _off_threads);
|
||||
int list_alloc_size = (_list_alloc_atoms + nt + pack_width - 1) *
|
||||
get_max_nbors();
|
||||
lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_end > 0) {
|
||||
@ -393,6 +397,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
|
||||
flt_t *ccachew = _ccachew;
|
||||
int *ccachei = _ccachei;
|
||||
int *ccachej = _ccachej;
|
||||
#ifdef LMP_USE_AVXCD
|
||||
acc_t *ccachef = _ccachef;
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_off_ccache) {
|
||||
@ -409,6 +416,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
|
||||
lmp->memory->destroy(ccachew);
|
||||
lmp->memory->destroy(ccachei);
|
||||
lmp->memory->destroy(ccachej);
|
||||
#ifdef LMP_USE_AVXCD
|
||||
lmp->memory->destroy(ccachef);
|
||||
#endif
|
||||
|
||||
_ccachex = 0;
|
||||
}
|
||||
@ -418,7 +428,8 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||
const int nthreads)
|
||||
const int nthreads,
|
||||
const int width)
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_ccachex && off_flag && _off_ccache == 0)
|
||||
@ -427,7 +438,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||
if (_ccachex)
|
||||
return;
|
||||
|
||||
const int nsize = get_max_nbors();
|
||||
const int nsize = get_max_nbors() * width;
|
||||
int esize = MIN(sizeof(int), sizeof(flt_t));
|
||||
IP_PRE_get_stride(_ccache_stride, nsize, esize, 0);
|
||||
int nt = MAX(nthreads, _off_threads);
|
||||
@ -439,6 +450,11 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||
lmp->memory->create(_ccachew, vsize, "_ccachew");
|
||||
lmp->memory->create(_ccachei, vsize, "_ccachei");
|
||||
lmp->memory->create(_ccachej, vsize, "_ccachej");
|
||||
#ifdef LMP_USE_AVXCD
|
||||
IP_PRE_get_stride(_ccache_stride3, nsize * 3, esize, 0);
|
||||
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
|
||||
#endif
|
||||
memset(_ccachej, 0, vsize * sizeof(int));
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (off_flag) {
|
||||
@ -454,7 +470,8 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
|
||||
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
|
||||
nocopy(ccachei,ccachej:length(vsize) alloc_if(1) free_if(0))
|
||||
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
|
||||
in(ccachej:length(vsize) alloc_if(1) free_if(0))
|
||||
}
|
||||
_off_ccache = 1;
|
||||
}
|
||||
|
||||
@ -75,14 +75,14 @@ class IntelBuffers {
|
||||
free_local();
|
||||
}
|
||||
|
||||
inline void grow_nbor(NeighList *list, const int nlocal,
|
||||
const int offload_end) {
|
||||
inline void grow_nbor(NeighList *list, const int nlocal, const int nthreads,
|
||||
const int offload_end, const int pack_width=1) {
|
||||
grow_local(list, offload_end);
|
||||
if (offload_end) {
|
||||
grow_nmax();
|
||||
grow_binhead();
|
||||
}
|
||||
grow_nbor_list(list, nlocal, offload_end);
|
||||
grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
|
||||
}
|
||||
|
||||
void free_nmax();
|
||||
@ -111,7 +111,7 @@ class IntelBuffers {
|
||||
}
|
||||
|
||||
void free_ccache();
|
||||
void grow_ccache(const int off_flag, const int nthreads);
|
||||
void grow_ccache(const int off_flag, const int nthreads, const int width=1);
|
||||
inline int ccache_stride() { return _ccache_stride; }
|
||||
inline flt_t * get_ccachex() { return _ccachex; }
|
||||
inline flt_t * get_ccachey() { return _ccachey; }
|
||||
@ -119,6 +119,10 @@ class IntelBuffers {
|
||||
inline flt_t * get_ccachew() { return _ccachew; }
|
||||
inline int * get_ccachei() { return _ccachei; }
|
||||
inline int * get_ccachej() { return _ccachej; }
|
||||
#ifdef LMP_USE_AVXCD
|
||||
inline int ccache_stride3() { return _ccache_stride3; }
|
||||
inline acc_t * get_ccachef() { return _ccachef; }
|
||||
#endif
|
||||
|
||||
inline int get_max_nbors() {
|
||||
int mn = lmp->neighbor->oneatom * sizeof(int) /
|
||||
@ -129,9 +133,10 @@ class IntelBuffers {
|
||||
void free_nbor_list();
|
||||
|
||||
inline void grow_nbor_list(NeighList *list, const int nlocal,
|
||||
const int offload_end) {
|
||||
const int nthreads, const int offload_end,
|
||||
const int pack_width) {
|
||||
if (nlocal > _list_alloc_atoms)
|
||||
_grow_nbor_list(list, nlocal, offload_end);
|
||||
_grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
else if (offload_end > 0 && _off_map_stencil != list->stencil)
|
||||
_grow_stencil(list);
|
||||
@ -281,6 +286,10 @@ class IntelBuffers {
|
||||
int _ccache_stride;
|
||||
flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
|
||||
int *_ccachei, *_ccachej;
|
||||
#ifdef LMP_USE_AVXCD
|
||||
int _ccache_stride3;
|
||||
acc_t * _ccachef;
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _separate_buffers;
|
||||
@ -305,8 +314,8 @@ class IntelBuffers {
|
||||
void _grow_nmax();
|
||||
void _grow_local(NeighList *list, const int offload_end);
|
||||
void _grow_binhead();
|
||||
void _grow_nbor_list(NeighList *list, const int nlocal,
|
||||
const int offload_end);
|
||||
void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
|
||||
const int offload_end, const int pack_width);
|
||||
void _grow_stencil(NeighList *list);
|
||||
};
|
||||
|
||||
|
||||
@ -55,30 +55,37 @@ enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
|
||||
enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
|
||||
TIME_IMBALANCE};
|
||||
#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
|
||||
|
||||
#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
|
||||
#define INTEL_MIC_VECTOR_WIDTH 16
|
||||
#define INTEL_VECTOR_WIDTH 4
|
||||
|
||||
#ifdef __AVX__
|
||||
#undef INTEL_VECTOR_WIDTH
|
||||
#define INTEL_VECTOR_WIDTH 8
|
||||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
#undef INTEL_VECTOR_WIDTH
|
||||
#define INTEL_VECTOR_WIDTH 8
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512F__
|
||||
#undef INTEL_VECTOR_WIDTH
|
||||
#define INTEL_VECTOR_WIDTH 16
|
||||
#define INTEL_V512 1
|
||||
#define INTEL_VMASK 1
|
||||
#else
|
||||
|
||||
#ifdef __MIC__
|
||||
#define INTEL_V512 1
|
||||
#define INTEL_VMASK 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
#define LMP_USE_AVXCD
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define INTEL_DATA_ALIGN 64
|
||||
@ -134,6 +141,18 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
datasize); \
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   Compute the calling OpenMP thread's index range over inum items using
   chunks whose length is a multiple of vecsize.
   Assigned by the macro (must be modifiable lvalues): ifrom, ito, tid
   Read by the macro: inum, nthreads, vecsize
     tid    = omp_get_thread_num()
     idelta = ceil(inum / vecsize / nthreads) * vecsize
     ifrom  = tid * idelta
     ito    = ifrom + idelta, clamped to inum
   Only ito is clamped; ifrom is not, so for trailing threads ifrom can be
   larger than ito and a "for (i = ifrom; i < ito; i++)" loop is then empty.
   NOTE(review): inum is cast to float before the division, so very large
   inum values may not be represented exactly - confirm this is acceptable.
------------------------------------------------------------------------- */
#define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, \
|
||||
nthreads, vecsize) \
|
||||
{ \
|
||||
tid = omp_get_thread_num(); \
|
||||
int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
|
||||
/vecsize/nthreads)); \
|
||||
idelta *= vecsize; \
|
||||
ifrom = tid*idelta; \
|
||||
ito = ifrom + idelta; \
|
||||
if (ito > inum) ito = inum; \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
|
||||
@ -364,6 +383,43 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   Tally energy and virial contributions for one dihedral (atoms i1..i4).
   Weight ev_pre:
     newton != 0 : 1.0
     otherwise   : 0.25 for each of i1..i4 that is < nlocal
   If eflag: oedihedral += ev_pre * deng; if additionally eatom, deng/4 is
     added to f[i].w for every atom i of the four with (newton || i < nlocal).
   If vflag: six products built from the vb1/vb2/vb3 vectors and the
     f1/f3/f4 forces are accumulated, scaled by ev_pre, into sv0..sv5.
   Names that are NOT macro parameters and must exist at the expansion
   site: flt_t, f, sv0, sv1, sv2, sv3, sv4, sv5.
   The "force" parameter is accepted but never referenced in the body.
------------------------------------------------------------------------- */
#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, \
|
||||
f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, \
|
||||
vb3x, vb3y, vb3z,oedihedral, force, \
|
||||
newton, nlocal) \
|
||||
{ \
|
||||
flt_t ev_pre; \
|
||||
if (newton) ev_pre = (flt_t)1.0; \
|
||||
else { \
|
||||
ev_pre = (flt_t)0.0; \
|
||||
if (i1 < nlocal) ev_pre += (flt_t)0.25; \
|
||||
if (i2 < nlocal) ev_pre += (flt_t)0.25; \
|
||||
if (i3 < nlocal) ev_pre += (flt_t)0.25; \
|
||||
if (i4 < nlocal) ev_pre += (flt_t)0.25; \
|
||||
} \
|
||||
\
|
||||
if (eflag) { \
|
||||
oedihedral += ev_pre * deng; \
|
||||
if (eatom) { \
|
||||
flt_t qdeng = deng * (flt_t)0.25; \
|
||||
if (newton || i1 < nlocal) f[i1].w += qdeng; \
|
||||
if (newton || i2 < nlocal) f[i2].w += qdeng; \
|
||||
if (newton || i3 < nlocal) f[i3].w += qdeng; \
|
||||
if (newton || i4 < nlocal) f[i4].w += qdeng; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (vflag) { \
|
||||
sv0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \
|
||||
sv1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \
|
||||
sv2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \
|
||||
sv3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \
|
||||
sv4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \
|
||||
sv5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp) \
|
||||
{ \
|
||||
if (evflag) { \
|
||||
|
||||
@ -351,4 +351,128 @@
|
||||
ans##_0 = (aug_3 - t) / aug_0; \
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
normalize a quaternion
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
   ME_qnormalize(q): q is a name PREFIX, not an expression. The macro
   token-pastes it to reach four scalar variables q_w, q_i, q_j, q_k and
   multiplies each in place by 1 / sqrt(q_w^2 + q_i^2 + q_j^2 + q_k^2).
   The reciprocal is held in a block-local double named "norm".
   There is no guard for a zero-length quaternion: the division is
   performed unconditionally.
------------------------------------------------------------------------- */
#define ME_qnormalize(q) \
|
||||
{ \
|
||||
double norm = 1.0 / \
|
||||
sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \
|
||||
q##_w *= norm; \
|
||||
q##_i *= norm; \
|
||||
q##_j *= norm; \
|
||||
q##_k *= norm; \
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute omega from angular momentum
|
||||
w = omega = angular velocity in space frame
|
||||
wbody = angular velocity in body frame
|
||||
project space-frame angular momentum onto body axes
|
||||
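and scale by moments_0..2 (the macro multiplies by these values, so they are presumably the inverse principal moments - TODO confirm against callers)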
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
   ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w)
     m          name prefix of the input vector:  m_0, m_1, m_2
     quat       name prefix of the quaternion:    quat_w, quat_i, quat_j, quat_k
     moments_N  expressions; each wbody_N is MULTIPLIED by moments_N
     w          name prefix of the output vector: w_0, w_1, w_2 (assigned)
   Steps performed:
     1. build nine entries rot_0..rot_8 from the quaternion components
     2. wbody_N = (column N of rot) . m      e.g. rot_0, rot_3, rot_6 for N=0
     3. wbody_N *= moments_N
     4. w_N     = (row N of rot) . wbody     e.g. rot_0, rot_1, rot_2 for N=0
   "rot##_N" pastes the literal token rot (it is not a parameter), giving
   the block-local variables rot_0..rot_8.
   Block-local names declared here: wbody_0..2, rot_0..8, w2, i2, j2, k2,
   twoij, twoik, twojk, twoiw, twojw, twokw. A moments_N argument that
   mentions an identifier with one of these names is evaluated after these
   declarations and would therefore refer to the macro's local instead.
------------------------------------------------------------------------- */
#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \
|
||||
{ \
|
||||
double wbody_0, wbody_1, wbody_2; \
|
||||
double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \
|
||||
\
|
||||
double w2 = quat##_w * quat##_w; \
|
||||
double i2 = quat##_i * quat##_i; \
|
||||
double j2 = quat##_j * quat##_j; \
|
||||
double k2 = quat##_k * quat##_k; \
|
||||
double twoij = 2.0 * quat##_i * quat##_j; \
|
||||
double twoik = 2.0 * quat##_i * quat##_k; \
|
||||
double twojk = 2.0 * quat##_j * quat##_k; \
|
||||
double twoiw = 2.0 * quat##_i * quat##_w; \
|
||||
double twojw = 2.0 * quat##_j * quat##_w; \
|
||||
double twokw = 2.0 * quat##_k * quat##_w; \
|
||||
\
|
||||
rot##_0 = w2 + i2 - j2 - k2; \
|
||||
rot##_1 = twoij - twokw; \
|
||||
rot##_2 = twojw + twoik; \
|
||||
\
|
||||
rot##_3 = twoij + twokw; \
|
||||
rot##_4 = w2 - i2 + j2 - k2; \
|
||||
rot##_5 = twojk - twoiw; \
|
||||
\
|
||||
rot##_6 = twoik - twojw; \
|
||||
rot##_7 = twojk + twoiw; \
|
||||
rot##_8 = w2 - i2 - j2 + k2; \
|
||||
\
|
||||
wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2; \
|
||||
wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2; \
|
||||
wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2; \
|
||||
\
|
||||
wbody_0 *= moments_0; \
|
||||
wbody_1 *= moments_1; \
|
||||
wbody_2 *= moments_2; \
|
||||
\
|
||||
w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2; \
|
||||
w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2; \
|
||||
w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2; \
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   ME_omega_richardson(dtf, dtq, angmomin, quatin, torque, i0, i1, i2)
   Advance an angular momentum and a quaternion, both updated IN PLACE.
     dtf, dtq   scalar step factors
     angmomin   indexable with [0..2]; angmomin[k] += dtf * torque[k]
     quatin     indexable with [0..3], read as (w, i, j, k), overwritten
     torque     indexable with [0..2], read only
     i0,i1,i2   forwarded unchanged as the moments_0..2 arguments of
                ME_mq_to_omega
   Steps performed:
     1. update angmomin and copy it and quatin into block-local scalars
     2. omega from (angmom, quat) via ME_mq_to_omega; wq built from omega
        and quat
     3. qfull = normalize(quat + dtq * wq)
     4. qhalf = normalize(quat + 0.5*dtq * wq); recompute omega and wq at
        qhalf; qhalf = normalize(qhalf + 0.5*dtq * wq)
     5. quat = normalize(2*qhalf - qfull), written back to quatin[0..3]
        (the macro name suggests this combination is a Richardson
        extrapolation - not otherwise stated here)
   Requires ME_mq_to_omega and ME_qnormalize to be defined at the point
   of expansion.
------------------------------------------------------------------------- */
#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \
|
||||
{ \
|
||||
angmomin[0] += dtf * torque[0]; \
|
||||
double angmom_0 = angmomin[0]; \
|
||||
angmomin[1] += dtf * torque[1]; \
|
||||
double angmom_1 = angmomin[1]; \
|
||||
angmomin[2] += dtf * torque[2]; \
|
||||
double angmom_2 = angmomin[2]; \
|
||||
\
|
||||
double quat_w = quatin[0]; \
|
||||
double quat_i = quatin[1]; \
|
||||
double quat_j = quatin[2]; \
|
||||
double quat_k = quatin[3]; \
|
||||
\
|
||||
double omega_0, omega_1, omega_2; \
|
||||
ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \
|
||||
\
|
||||
double wq_0, wq_1, wq_2, wq_3; \
|
||||
wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \
|
||||
wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \
|
||||
wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \
|
||||
wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \
|
||||
\
|
||||
double qfull_w, qfull_i, qfull_j, qfull_k; \
|
||||
qfull_w = quat_w + dtq * wq_0; \
|
||||
qfull_i = quat_i + dtq * wq_1; \
|
||||
qfull_j = quat_j + dtq * wq_2; \
|
||||
qfull_k = quat_k + dtq * wq_3; \
|
||||
ME_qnormalize(qfull); \
|
||||
\
|
||||
double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \
|
||||
qhalf_w = quat_w + 0.5*dtq * wq_0; \
|
||||
qhalf_i = quat_i + 0.5*dtq * wq_1; \
|
||||
qhalf_j = quat_j + 0.5*dtq * wq_2; \
|
||||
qhalf_k = quat_k + 0.5*dtq * wq_3; \
|
||||
ME_qnormalize(qhalf); \
|
||||
\
|
||||
ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \
|
||||
wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \
|
||||
wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \
|
||||
wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \
|
||||
wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \
|
||||
\
|
||||
qhalf_w += 0.5*dtq * wq_0; \
|
||||
qhalf_i += 0.5*dtq * wq_1; \
|
||||
qhalf_j += 0.5*dtq * wq_2; \
|
||||
qhalf_k += 0.5*dtq * wq_3; \
|
||||
ME_qnormalize(qhalf); \
|
||||
\
|
||||
quat_w = 2.0*qhalf_w - qfull_w; \
|
||||
quat_i = 2.0*qhalf_i - qfull_i; \
|
||||
quat_j = 2.0*qhalf_j - qfull_j; \
|
||||
quat_k = 2.0*qhalf_k - qfull_k; \
|
||||
ME_qnormalize(quat); \
|
||||
\
|
||||
quatin[0] = quat_w; \
|
||||
quatin[1] = quat_i; \
|
||||
quatin[2] = quat_j; \
|
||||
quatin[3] = quat_k; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -15,6 +15,8 @@
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
//#define OUTER_CHUNK 1
|
||||
|
||||
#include "neighbor.h"
|
||||
#include "neigh_list.h"
|
||||
#include "atom.h"
|
||||
@ -26,6 +28,14 @@
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#ifdef LMP_USE_AVXCD
|
||||
#include "intel_simd.h"
|
||||
#endif
|
||||
|
||||
#ifdef OUTER_CHUNK
|
||||
#include "intel_simd.h"
|
||||
#endif
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
@ -42,17 +52,11 @@ using namespace LAMMPS_NS;
|
||||
for (int s = 0; s < n3; s++) { \
|
||||
if (sptr[s] == tag) { \
|
||||
if (s < n1) { \
|
||||
if (special_flag[1] == 0) which = -1; \
|
||||
else if (special_flag[1] == 1) which = 0; \
|
||||
else which = 1; \
|
||||
which = 1; \
|
||||
} else if (s < n2) { \
|
||||
if (special_flag[2] == 0) which = -1; \
|
||||
else if (special_flag[2] == 1) which = 0; \
|
||||
else which = 2; \
|
||||
which = 2; \
|
||||
} else { \
|
||||
if (special_flag[3] == 0) which = -1; \
|
||||
else if (special_flag[3] == 1) which = 0; \
|
||||
else which = 3; \
|
||||
which = 3; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
@ -199,7 +203,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
|
||||
if (offload) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
||||
buffers->grow_nbor(list, atom->nlocal, aend);
|
||||
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
|
||||
|
||||
ATOM_T biga;
|
||||
biga.x = INTEL_BIGP;
|
||||
@ -335,7 +339,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
|
||||
signal(tag)
|
||||
#endif
|
||||
{
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
@ -494,12 +498,12 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
|
||||
if (j >= nlocal) {
|
||||
if (j == nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which > 0)
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -520,7 +524,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
|
||||
}
|
||||
#endif
|
||||
} // end omp
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
@ -688,7 +692,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
if (offload) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
||||
buffers->grow_nbor(list, atom->nlocal, aend);
|
||||
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
|
||||
|
||||
ATOM_T biga;
|
||||
biga.x = INTEL_BIGP;
|
||||
@ -827,7 +831,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
signal(tag)
|
||||
#endif
|
||||
{
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
@ -848,33 +852,47 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
|
||||
const int num = aend - astart;
|
||||
int tid, ifrom, ito;
|
||||
|
||||
#ifdef OUTER_CHUNK
|
||||
const int swidth = ip_simd::SIMD_type<flt_t>::width();
|
||||
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth);
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
int e_ito = ito;
|
||||
if (ito == num) {
|
||||
int imod = ito % swidth;
|
||||
if (imod) e_ito += swidth - e_ito;
|
||||
}
|
||||
const int list_size = (e_ito + tid + 1) * maxnbors;
|
||||
#else
|
||||
const int swidth = 1;
|
||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
const int list_size = (ito + tid + 1) * maxnbors;
|
||||
#endif
|
||||
|
||||
int which;
|
||||
|
||||
const int list_size = (ito + tid + 1) * maxnbors;
|
||||
int pack_offset = maxnbors * swidth;
|
||||
int ct = (ifrom + tid) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
|
||||
int max_chunk = 0;
|
||||
int lane = 0;
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
int j, k, n, n2, itype, jtype, ibin;
|
||||
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
|
||||
|
||||
n = 0;
|
||||
n2 = maxnbors;
|
||||
|
||||
xtmp = x[i].x;
|
||||
ytmp = x[i].y;
|
||||
ztmp = x[i].z;
|
||||
itype = x[i].w;
|
||||
const flt_t xtmp = x[i].x;
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
const int itype = x[i].w;
|
||||
const int ioffset = ntypes * itype;
|
||||
|
||||
// loop over rest of atoms in i's bin, ghosts are at end of linked list
|
||||
// if j is owned atom, store it, since j is beyond i in linked list
|
||||
// if j is ghost, only store if j coords are "above/to the right" of i
|
||||
|
||||
for (j = bins[i]; j >= 0; j = bins[j]) {
|
||||
int raw_count = pack_offset;
|
||||
for (int j = bins[i]; j >= 0; j = bins[j]) {
|
||||
if (j >= nlocal) {
|
||||
if (offload_noghost && offload) continue;
|
||||
if (x[j].z < ztmp) continue;
|
||||
@ -884,116 +902,145 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
}
|
||||
} else if (offload_noghost && i < offload_end) continue;
|
||||
|
||||
jtype = x[j].w;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
if (exclude) {
|
||||
const int jtype = x[j].w;
|
||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
delx = xtmp - x[j].x;
|
||||
dely = ytmp - x[j].y;
|
||||
delz = ztmp - x[j].z;
|
||||
rsq = delx * delx + dely * dely + delz * delz;
|
||||
neighptr[raw_count++] = j;
|
||||
}
|
||||
|
||||
if (rsq <= cutneighsq[ioffset + jtype]) {
|
||||
if (j < nlocal) {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[n++] = -j - 1;
|
||||
else
|
||||
neighptr[n++] = j;
|
||||
} else
|
||||
neighptr[n++] = j;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < lmin) lmin = j;
|
||||
if (j > lmax) lmax = j;
|
||||
#endif
|
||||
} else {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[n2++] = -j - 1;
|
||||
else
|
||||
neighptr[n2++] = j;
|
||||
} else
|
||||
neighptr[n2++] = j;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < gmin) gmin = j;
|
||||
if (j > gmax) gmax = j;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
// loop over all atoms in other bins in stencil, store every pair
|
||||
|
||||
ibin = atombin[i];
|
||||
|
||||
for (k = 0; k < nstencil; k++) {
|
||||
for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
|
||||
const int ibin = atombin[i];
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
|
||||
if (offload_noghost) {
|
||||
if (j < nlocal) {
|
||||
if (i < offload_end) continue;
|
||||
} else if (offload) continue;
|
||||
}
|
||||
|
||||
jtype = x[j].w;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
if (exclude) {
|
||||
const int jtype = x[j].w;
|
||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
delx = xtmp - x[j].x;
|
||||
dely = ytmp - x[j].y;
|
||||
delz = ztmp - x[j].z;
|
||||
rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq <= cutneighsq[ioffset + jtype]) {
|
||||
if (j < nlocal) {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[n++] = -j - 1;
|
||||
else
|
||||
neighptr[n++] = j;
|
||||
} else
|
||||
neighptr[n++] = j;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < lmin) lmin = j;
|
||||
if (j > lmax) lmax = j;
|
||||
#endif
|
||||
} else {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[n2++] = -j - 1;
|
||||
else
|
||||
neighptr[n2++] = j;
|
||||
} else
|
||||
neighptr[n2++] = j;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < gmin) gmin = j;
|
||||
if (j > gmax) gmax = j;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ilist[i] = i;
|
||||
|
||||
cnumneigh[i] = ct;
|
||||
if (n > maxnbors) *overflow = 1;
|
||||
for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
|
||||
while( (n % pad_width) != 0 ) neighptr[n++] = e_nall;
|
||||
numneigh[i] = n;
|
||||
while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
|
||||
ct += n;
|
||||
neighptr += n;
|
||||
if (ct + n + maxnbors > list_size) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid) * maxnbors;
|
||||
neighptr[raw_count++] = j;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||
#else
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int u = pack_offset; u < raw_count; u++) {
|
||||
int j = neighptr[u];
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq > cutneighsq[ioffset + jtype])
|
||||
neighptr[u] = e_nall;
|
||||
else {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[u] = -j - 1;
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < nlocal) {
|
||||
if (j < vlmin) vlmin = j;
|
||||
if (j > vlmax) vlmax = j;
|
||||
} else {
|
||||
if (j < vgmin) vgmin = j;
|
||||
if (j > vgmax) vgmax = j;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
lmin = MIN(lmin,vlmin);
|
||||
gmin = MIN(gmin,vgmin);
|
||||
lmax = MAX(lmax,vlmax);
|
||||
gmax = MAX(gmax,vgmax);
|
||||
#endif
|
||||
|
||||
int n = lane, n2 = pack_offset;
|
||||
for (int u = pack_offset; u < raw_count; u++) {
|
||||
const int j = neighptr[u];
|
||||
int pj = j;
|
||||
if (pj < e_nall) {
|
||||
if (need_ic)
|
||||
if (pj < 0) pj = -pj - 1;
|
||||
|
||||
if (pj < nlocal) {
|
||||
neighptr[n] = j;
|
||||
n += swidth;
|
||||
} else
|
||||
neighptr[n2++] = j;
|
||||
}
|
||||
}
|
||||
int ns = (n - lane) / swidth;
|
||||
if (ns > maxnbors || n2 > list_size) *overflow = 1;
|
||||
for (int u = pack_offset; u < n2; u++) {
|
||||
neighptr[n] = neighptr[u];
|
||||
n += swidth;
|
||||
}
|
||||
|
||||
ilist[i] = i;
|
||||
cnumneigh[i] = ct + lane;
|
||||
ns += n2 - pack_offset;
|
||||
#ifndef OUTER_CHUNK
|
||||
while( (ns % pad_width) != 0 ) neighptr[ns++] = e_nall;
|
||||
#endif
|
||||
numneigh[i] = ns;
|
||||
|
||||
#ifdef OUTER_CHUNK
|
||||
if (ns > max_chunk) max_chunk = ns;
|
||||
lane++;
|
||||
pack_offset -= maxnbors;
|
||||
if (lane == swidth) {
|
||||
ct += max_chunk * swidth;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
const int edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
max_chunk = 0;
|
||||
pack_offset = maxnbors * swidth;
|
||||
lane = 0;
|
||||
if (ct + pack_offset + maxnbors > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid) * maxnbors;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
ct += ns;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
const int edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
if (ct + pack_offset + maxnbors > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid) * maxnbors;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
if (*overflow == 1)
|
||||
@ -1032,7 +1079,16 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
#ifndef OUTER_CHUNK
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
#else
|
||||
const int trip = jnum * swidth;
|
||||
for (int jj = 0; jj < trip; jj+= swidth) {
|
||||
#endif
|
||||
const int j = jlist[jj];
|
||||
if (need_ic && j < 0) {
|
||||
which = 0;
|
||||
@ -1044,12 +1100,12 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
if (j >= nlocal) {
|
||||
if (j == e_nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which > 0)
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1070,7 +1126,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
||||
}
|
||||
#endif
|
||||
} // end omp
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
@ -1238,7 +1294,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
|
||||
if (offload) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
||||
buffers->grow_nbor(list, atom->nlocal, aend);
|
||||
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
|
||||
|
||||
ATOM_T biga;
|
||||
biga.x = INTEL_BIGP;
|
||||
@ -1377,7 +1433,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
|
||||
signal(tag)
|
||||
#endif
|
||||
{
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
@ -1550,12 +1606,12 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
|
||||
if (j >= nlocal) {
|
||||
if (j == e_nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which > 0)
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1576,7 +1632,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
|
||||
}
|
||||
#endif
|
||||
} // end omp
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
@ -1741,10 +1797,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
const int nall = atom->nlocal + atom->nghost;
|
||||
int pad = 1;
|
||||
|
||||
const int pack_width = fix->nbor_pack_width();
|
||||
|
||||
if (offload) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
||||
buffers->grow_nbor(list, atom->nlocal, aend);
|
||||
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend, pack_width);
|
||||
|
||||
ATOM_T biga;
|
||||
biga.x = INTEL_BIGP;
|
||||
@ -1871,7 +1929,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
||||
in(special_flag:length(0) alloc_if(0) free_if(0)) \
|
||||
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
|
||||
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width) \
|
||||
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
|
||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
||||
@ -1879,7 +1937,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
signal(tag)
|
||||
#endif
|
||||
{
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
@ -1900,36 +1958,40 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
|
||||
const int num = aend - astart;
|
||||
int tid, ifrom, ito;
|
||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||
|
||||
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
int e_ito = ito;
|
||||
if (ito == num) {
|
||||
int imod = ito % pack_width;
|
||||
if (imod) e_ito += pack_width - e_ito;
|
||||
}
|
||||
const int list_size = (e_ito + tid + 1) * maxnbors;
|
||||
|
||||
int which;
|
||||
|
||||
const int list_size = (ito + tid + 1) * maxnbors;
|
||||
int pack_offset = maxnbors * pack_width;
|
||||
int ct = (ifrom + tid) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
|
||||
int max_chunk = 0;
|
||||
int lane = 0;
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
int j, k, n, n2, itype, jtype, ibin;
|
||||
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
|
||||
|
||||
n = 0;
|
||||
n2 = maxnbors;
|
||||
|
||||
xtmp = x[i].x;
|
||||
ytmp = x[i].y;
|
||||
ztmp = x[i].z;
|
||||
itype = x[i].w;
|
||||
const flt_t xtmp = x[i].x;
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
const int itype = x[i].w;
|
||||
const tagint itag = tag[i];
|
||||
const int ioffset = ntypes * itype;
|
||||
|
||||
const int ibin = atombin[i];
|
||||
int raw_count = pack_offset;
|
||||
|
||||
// loop over all atoms in surrounding bins in stencil including self
|
||||
// skip i = j
|
||||
|
||||
ibin = atombin[i];
|
||||
|
||||
for (k = 0; k < nstencil; k++) {
|
||||
for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
|
||||
if (i == j) continue;
|
||||
|
||||
if (offload_noghost) {
|
||||
@ -1938,76 +2000,121 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
} else if (offload) continue;
|
||||
}
|
||||
|
||||
jtype = x[j].w;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
#endif
|
||||
|
||||
delx = xtmp - x[j].x;
|
||||
dely = ytmp - x[j].y;
|
||||
delz = ztmp - x[j].z;
|
||||
rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq <= cutneighsq[ioffset + jtype]) {
|
||||
const int jtag = tag[j];
|
||||
int flist = 0;
|
||||
if (itag > jtag) {
|
||||
if ((itag+jtag) % 2 == 0) flist = 1;
|
||||
} else if (itag < jtag) {
|
||||
if ((itag+jtag) % 2 == 1) flist = 1;
|
||||
} else {
|
||||
if (x[j].z < ztmp) flist = 1;
|
||||
else if (x[j].z == ztmp && x[j].y < ytmp) flist = 1;
|
||||
else if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp)
|
||||
flist = 1;
|
||||
}
|
||||
if (flist) {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[n2++] = -j - 1;
|
||||
else
|
||||
neighptr[n2++] = j;
|
||||
} else
|
||||
neighptr[n2++] = j;
|
||||
} else {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[n++] = -j - 1;
|
||||
else
|
||||
neighptr[n++] = j;
|
||||
} else
|
||||
neighptr[n++] = j;
|
||||
}
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < nlocal) {
|
||||
if (j < lmin) lmin = j;
|
||||
if (j > lmax) lmax = j;
|
||||
} else {
|
||||
if (j < gmin) gmin = j;
|
||||
if (j > gmax) gmax = j;
|
||||
}
|
||||
#endif
|
||||
if (exclude) {
|
||||
const int jtype = x[j].w;
|
||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
neighptr[raw_count++] = j;
|
||||
}
|
||||
}
|
||||
ilist[i] = i;
|
||||
|
||||
cnumneigh[i] = ct;
|
||||
if (n > maxnbors) *overflow = 1;
|
||||
atombin[i] = n;
|
||||
for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
|
||||
numneigh[i] = n;
|
||||
while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
|
||||
ct += n;
|
||||
neighptr += n;
|
||||
if (ct + n + maxnbors > list_size) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid) * maxnbors;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||
#else
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int u = pack_offset; u < raw_count; u++) {
|
||||
int j = neighptr[u];
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq > cutneighsq[ioffset + jtype])
|
||||
neighptr[u] = e_nall;
|
||||
else {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[u] = -j - 1;
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < nlocal) {
|
||||
if (j < vlmin) vlmin = j;
|
||||
if (j > vlmax) vlmax = j;
|
||||
} else {
|
||||
if (j < vgmin) vgmin = j;
|
||||
if (j > vgmax) vgmax = j;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
lmin = MIN(lmin,vlmin);
|
||||
gmin = MIN(gmin,vgmin);
|
||||
lmax = MAX(lmax,vlmax);
|
||||
gmax = MAX(gmax,vgmax);
|
||||
#endif
|
||||
|
||||
int n = lane, n2 = pack_offset;
|
||||
for (int u = pack_offset; u < raw_count; u++) {
|
||||
const int j = neighptr[u];
|
||||
int pj = j;
|
||||
if (pj < e_nall) {
|
||||
if (need_ic)
|
||||
if (pj < 0) pj = -pj - 1;
|
||||
|
||||
const int jtag = tag[pj];
|
||||
int flist = 0;
|
||||
if (itag > jtag) {
|
||||
if ((itag+jtag) % 2 == 0) flist = 1;
|
||||
} else if (itag < jtag) {
|
||||
if ((itag+jtag) % 2 == 1) flist = 1;
|
||||
} else {
|
||||
if (x[pj].z < ztmp) flist = 1;
|
||||
else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1;
|
||||
else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp)
|
||||
flist = 1;
|
||||
}
|
||||
if (flist) {
|
||||
neighptr[n2++] = j;
|
||||
} else {
|
||||
neighptr[n] = j;
|
||||
n += pack_width;
|
||||
}
|
||||
}
|
||||
}
|
||||
int ns = (n - lane) / pack_width;
|
||||
if (ns > maxnbors || n2 > list_size) *overflow = 1;
|
||||
atombin[i] = ns;
|
||||
for (int u = pack_offset; u < n2; u++) {
|
||||
neighptr[n] = neighptr[u];
|
||||
n += pack_width;
|
||||
}
|
||||
|
||||
ilist[i] = i;
|
||||
cnumneigh[i] = ct + lane;
|
||||
ns += n2 - pack_offset;
|
||||
numneigh[i] = ns;
|
||||
|
||||
if (ns > max_chunk) max_chunk = ns;
|
||||
lane++;
|
||||
pack_offset -= maxnbors;
|
||||
if (lane == pack_width) {
|
||||
ct += max_chunk * pack_width;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
const int edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
max_chunk = 0;
|
||||
pack_offset = maxnbors * pack_width;
|
||||
lane = 0;
|
||||
if (ct + pack_offset + maxnbors > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid) * maxnbors;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (*overflow == 1)
|
||||
@ -2046,7 +2153,9 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
|
||||
const int trip = jnum * pack_width;
|
||||
for (int jj = 0; jj < trip; jj+=pack_width) {
|
||||
const int j = jlist[jj];
|
||||
if (need_ic && j < 0) {
|
||||
which = 0;
|
||||
@ -2058,12 +2167,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
if (j >= nlocal) {
|
||||
if (j == e_nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which > 0)
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2083,7 +2192,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
}
|
||||
#endif
|
||||
} // end omp
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
@ -2113,3 +2222,4 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -12,9 +12,17 @@
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <math.h>
|
||||
#include "pair_gayberne_intel.h"
|
||||
#include "math_extra_intel.h"
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
#pragma offload_attribute(push,target(mic))
|
||||
#endif
|
||||
#include <cmath>
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
#pragma offload_attribute(pop)
|
||||
#endif
|
||||
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "atom_vec_ellipsoid.h"
|
||||
@ -295,7 +303,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
signal(f_start)
|
||||
#endif
|
||||
{
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute=MIC_Wtime();
|
||||
#endif
|
||||
|
||||
@ -335,8 +343,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
|
||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
oevdwl = (acc_t)0.0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
@ -394,8 +402,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
bool multiple_forms = false;
|
||||
@ -485,14 +493,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
tempv_1 = kappa_1 * inv_r;
|
||||
tempv_2 = kappa_2 * inv_r;
|
||||
flt_t sigma12 = ME_dot3(r12hat, tempv);
|
||||
sigma12 = pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
|
||||
sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
|
||||
flt_t h12 = r - sigma12;
|
||||
|
||||
// energy
|
||||
// compute u_r
|
||||
|
||||
flt_t varrho = sigma / (h12 + gamma * sigma);
|
||||
flt_t varrho6 = pow(varrho, (flt_t)6.0);
|
||||
flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
|
||||
flt_t varrho12 = varrho6 * varrho6;
|
||||
flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
|
||||
|
||||
@ -500,7 +508,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
|
||||
flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
|
||||
flt_t det_g12 = ME_det3(g12);
|
||||
eta = pow(eta / det_g12, upsilon);
|
||||
eta = std::pow(eta / det_g12, upsilon);
|
||||
|
||||
// compute chi_12
|
||||
|
||||
@ -516,7 +524,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
tempv_1 = iota_1 * inv_r;
|
||||
tempv_2 = iota_2 * inv_r;
|
||||
flt_t chi = ME_dot3(r12hat, tempv);
|
||||
chi = pow(chi * (flt_t)2.0, mu);
|
||||
chi = std::pow(chi * (flt_t)2.0, mu);
|
||||
|
||||
// force
|
||||
// compute dUr/dr
|
||||
@ -524,7 +532,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
|
||||
sigma;
|
||||
temp1 = temp1 * (flt_t)24.0 * epsilon;
|
||||
flt_t u_slj = temp1 * pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
|
||||
flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
|
||||
flt_t dUr_0, dUr_1, dUr_2;
|
||||
temp2 = ME_dot3(kappa, r12hat);
|
||||
flt_t uslj_rsq = u_slj / rsq_form[jj];
|
||||
@ -536,8 +544,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
|
||||
flt_t dchi_0, dchi_1, dchi_2;
|
||||
temp1 = ME_dot3(iota, r12hat);
|
||||
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
|
||||
pow(chi, (mu - (flt_t)1.0) / mu);
|
||||
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
|
||||
std::pow(chi, (mu - (flt_t)1.0) / mu);
|
||||
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
|
||||
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
|
||||
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
|
||||
@ -714,7 +722,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
flt_t ev_pre = (flt_t)0.0;
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
@ -863,7 +871,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // offload
|
||||
|
||||
@ -217,7 +217,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
ITABLE_IN signal(f_start)
|
||||
#endif
|
||||
{
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
@ -459,7 +459,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
}
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end of offload region
|
||||
|
||||
@ -212,7 +212,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
ITABLE_IN signal(f_start)
|
||||
#endif
|
||||
{
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
@ -263,7 +263,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
}
|
||||
|
||||
#if defined(__INTEL_COMPILER)
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
@ -283,7 +283,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
|
||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||
|
||||
#ifdef __MIC__
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < c_forcei[jtype].cutsq) {
|
||||
#endif
|
||||
#ifdef INTEL_ALLOW_TABLE
|
||||
@ -335,11 +335,11 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef __MIC__
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __MIC__
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < c_forcei[jtype].cut_ljsq) {
|
||||
#endif
|
||||
flt_t r6inv = r2inv * r2inv * r2inv;
|
||||
@ -354,7 +354,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
forcelj *= factor_lj;
|
||||
if (EFLAG) evdwl *= factor_lj;
|
||||
}
|
||||
#ifdef __MIC__
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#else
|
||||
if (rsq > c_forcei[jtype].cutsq)
|
||||
@ -363,7 +363,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
{ forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||
#endif
|
||||
|
||||
#ifdef __MIC__
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < c_forcei[jtype].cutsq) {
|
||||
#endif
|
||||
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
||||
@ -395,7 +395,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
||||
}
|
||||
#ifdef __MIC__
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#endif
|
||||
} // for jj
|
||||
@ -426,7 +426,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
}
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end of offload region
|
||||
|
||||
@ -88,39 +88,73 @@ void PairLJCutIntel::compute(int eflag, int vflag,
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
if (_onetype) {
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||
class acc_t>
|
||||
void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
@ -159,7 +193,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
const int nthreads = tc;
|
||||
int *overflow = fix->get_off_overflow_flag();
|
||||
{
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
@ -187,12 +221,25 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
|
||||
if (ONETYPE) {
|
||||
cutsq = ljc12o[3].cutsq;
|
||||
lj1 = ljc12o[3].lj1;
|
||||
lj2 = ljc12o[3].lj2;
|
||||
lj3 = lj34[3].lj3;
|
||||
lj4 = lj34[3].lj4;
|
||||
offset = ljc12o[3].offset;
|
||||
}
|
||||
for (int i = iifrom; i < iito; ++i) {
|
||||
const int itype = x[i].w;
|
||||
|
||||
const int ptr_off = itype * ntypes;
|
||||
const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off;
|
||||
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
|
||||
int itype, ptr_off;
|
||||
const FC_PACKED1_T * _noalias ljc12oi;
|
||||
const FC_PACKED2_T * _noalias lj34i;
|
||||
if (!ONETYPE) {
|
||||
itype = x[i].w;
|
||||
ptr_off = itype * ntypes;
|
||||
ljc12oi = ljc12o + ptr_off;
|
||||
lj34i = lj34 + ptr_off;
|
||||
}
|
||||
|
||||
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
@ -218,25 +265,42 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
flt_t forcelj, evdwl;
|
||||
forcelj = evdwl = (flt_t)0.0;
|
||||
|
||||
const int sbindex = jlist[jj] >> SBBITS & 3;
|
||||
const int j = jlist[jj] & NEIGHMASK;
|
||||
int j, jtype, sbindex;
|
||||
if (!ONETYPE) {
|
||||
sbindex = jlist[jj] >> SBBITS & 3;
|
||||
j = jlist[jj] & NEIGHMASK;
|
||||
} else
|
||||
j = jlist[jj];
|
||||
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
if (!ONETYPE) {
|
||||
jtype = x[j].w;
|
||||
cutsq = ljc12oi[jtype].cutsq;
|
||||
}
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < ljc12oi[jtype].cutsq) {
|
||||
if (rsq < cutsq) {
|
||||
#endif
|
||||
flt_t factor_lj = special_lj[sbindex];
|
||||
flt_t factor_lj;
|
||||
if (!ONETYPE) factor_lj = special_lj[sbindex];
|
||||
flt_t r2inv = 1.0 / rsq;
|
||||
flt_t r6inv = r2inv * r2inv * r2inv;
|
||||
#ifndef INTEL_VMASK
|
||||
if (rsq > ljc12oi[jtype].cutsq) r6inv = (flt_t)0.0;
|
||||
if (rsq > cutsq) r6inv = (flt_t)0.0;
|
||||
#endif
|
||||
forcelj = r6inv * (ljc12oi[jtype].lj1 * r6inv - ljc12oi[jtype].lj2);
|
||||
flt_t fpair = factor_lj * forcelj * r2inv;
|
||||
if (!ONETYPE) {
|
||||
lj1 = ljc12oi[jtype].lj1;
|
||||
lj2 = ljc12oi[jtype].lj2;
|
||||
}
|
||||
forcelj = r6inv * (lj1 * r6inv - lj2);
|
||||
flt_t fpair;
|
||||
if (!ONETYPE)
|
||||
fpair = factor_lj * forcelj * r2inv;
|
||||
else
|
||||
fpair = forcelj * r2inv;
|
||||
|
||||
fxtmp += delx * fpair;
|
||||
fytmp += dely * fpair;
|
||||
@ -255,9 +319,13 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
ev_pre += (flt_t)0.5;
|
||||
|
||||
if (EFLAG) {
|
||||
evdwl = r6inv * (lj34i[jtype].lj3 * r6inv-lj34i[jtype].lj4) -
|
||||
ljc12oi[jtype].offset;
|
||||
evdwl *= factor_lj;
|
||||
if (!ONETYPE) {
|
||||
lj3 = lj34i[jtype].lj3;
|
||||
lj4 = lj34i[jtype].lj4;
|
||||
offset = ljc12oi[jtype].offset;
|
||||
}
|
||||
evdwl = r6inv * (lj3 * r6inv - lj4) - offset;
|
||||
if (!ONETYPE) evdwl *= factor_lj;
|
||||
sevdwl += ev_pre*evdwl;
|
||||
if (eatom) {
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
@ -302,7 +370,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
}
|
||||
#ifdef __MIC__
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
@ -352,6 +420,9 @@ template <class flt_t, class acc_t>
|
||||
void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
_onetype = 0;
|
||||
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
|
||||
|
||||
int tp1 = atom->ntypes + 1;
|
||||
fc.set_ntypes(tp1,memory,_cop);
|
||||
buffers->set_ntypes(tp1);
|
||||
|
||||
@ -39,13 +39,14 @@ class PairLJCutIntel : public PairLJCut {
|
||||
|
||||
private:
|
||||
FixIntel *fix;
|
||||
int _cop;
|
||||
int _cop, _onetype;
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||
class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -11,6 +11,10 @@
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
|
||||
PairStyle(sw/intel,PairSWIntel)
|
||||
@ -42,7 +46,7 @@ class PairSWIntel : public PairSW {
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int SPQ, int EVFLAG, int EFLAG, class flt_t, class acc_t>
|
||||
template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
|
||||
const int astart, const int aend, const int pad_width);
|
||||
@ -51,7 +55,10 @@ class PairSWIntel : public PairSW {
|
||||
void pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t, acc_t> *buffers);
|
||||
|
||||
int _ccache_stride, _host_pad, _offload_pad, _spq;
|
||||
int _ccache_stride, _host_pad, _offload_pad, _spq, _onetype;
|
||||
#ifdef LMP_USE_AVXCD
|
||||
int _ccache_stride3;
|
||||
#endif
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
@ -62,8 +69,11 @@ class PairSWIntel : public PairSW {
|
||||
flt_t cutsq, cut, sigma_gamma, pad;
|
||||
} fc_packed0;
|
||||
typedef struct {
|
||||
flt_t powerp, powerq, cut, sigma, c1, c2, c3, c4;
|
||||
flt_t powerp, powerq, cut, sigma;
|
||||
} fc_packed1;
|
||||
typedef struct {
|
||||
flt_t c1, c2, c3, c4;
|
||||
} fc_packed1p2;
|
||||
typedef struct {
|
||||
flt_t c5, c6;
|
||||
} fc_packed2;
|
||||
@ -73,6 +83,7 @@ class PairSWIntel : public PairSW {
|
||||
|
||||
fc_packed0 **p2;
|
||||
fc_packed1 **p2f;
|
||||
fc_packed1p2 **p2f2;
|
||||
fc_packed2 **p2e;
|
||||
fc_packed3 ***p3;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user