git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14386 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -26,14 +26,12 @@ action () {
|
|||||||
# do not install child files if parent does not exist
|
# do not install child files if parent does not exist
|
||||||
|
|
||||||
for file in *_intel.cpp; do
|
for file in *_intel.cpp; do
|
||||||
test $file = thr_intel.cpp && continue
|
|
||||||
dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \
|
dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \
|
||||||
sed 's/_offload_intel//g' | sed 's/_intel//g'`
|
sed 's/_offload_intel//g' | sed 's/_intel//g'`
|
||||||
action $file $dep
|
action $file $dep
|
||||||
done
|
done
|
||||||
|
|
||||||
for file in *_intel.h; do
|
for file in *_intel.h; do
|
||||||
test $file = thr_intel.h && continue
|
|
||||||
dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'`
|
dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'`
|
||||||
action $file $dep
|
action $file $dep
|
||||||
done
|
done
|
||||||
@ -42,6 +40,8 @@ action intel_preprocess.h
|
|||||||
action intel_buffers.h
|
action intel_buffers.h
|
||||||
action intel_buffers.cpp
|
action intel_buffers.cpp
|
||||||
action math_extra_intel.h
|
action math_extra_intel.h
|
||||||
|
action intel_simd.h pair_sw_intel.cpp
|
||||||
|
action intel_intrinsics.h pair_tersoff_intel.cpp
|
||||||
|
|
||||||
# step 2: handle cases and tasks not handled in step 1.
|
# step 2: handle cases and tasks not handled in step 1.
|
||||||
|
|
||||||
|
|||||||
@ -3,9 +3,11 @@
|
|||||||
LAMMPS Intel(R) Package
|
LAMMPS Intel(R) Package
|
||||||
--------------------------------
|
--------------------------------
|
||||||
|
|
||||||
W. Michael Brown (Intel)
|
W. Michael Brown (Intel) michael.w.brown at intel.com
|
||||||
michael.w.brown at intel.com
|
Rodrigo Canales (RWTH Aachen University)
|
||||||
|
Markus Höhnerbach (RWTH Aachen University)
|
||||||
|
Ahmed E. Ismail (RWTH Aachen University)
|
||||||
|
Paolo Bientinesi (RWTH Aachen University)
|
||||||
Anupama Kurpad (Intel)
|
Anupama Kurpad (Intel)
|
||||||
Biswajit Mishra (Shell)
|
Biswajit Mishra (Shell)
|
||||||
|
|
||||||
@ -53,3 +55,12 @@ By default, when running with offload to Intel(R) coprocessors, affinity
|
|||||||
for host MPI tasks and OpenMP threads is set automatically within the code.
|
for host MPI tasks and OpenMP threads is set automatically within the code.
|
||||||
This currently requires the use of system calls. To disable at build time,
|
This currently requires the use of system calls. To disable at build time,
|
||||||
compile with -DINTEL_OFFLOAD_NOAFFINITY.
|
compile with -DINTEL_OFFLOAD_NOAFFINITY.
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
Vector intrinsics are temporarily being used for the Stillinger-Weber
|
||||||
|
potential to allow for advanced features in the AVX512 instruction set to
|
||||||
|
be exploited on early hardware. We hope to see compiler improvements for
|
||||||
|
AVX512 that will eliminate this requirement, so it is not recommended to
|
||||||
|
develop code based on the intrinsics implementation. Please e-mail the
|
||||||
|
authors for more details.
|
||||||
|
|||||||
@ -7,7 +7,7 @@
|
|||||||
package intel 1 mode mixed balance $b
|
package intel 1 mode mixed balance $b
|
||||||
package omp 0
|
package omp 0
|
||||||
suffix $s
|
suffix $s
|
||||||
processors * * * grid numa
|
# processors * * * grid numa
|
||||||
|
|
||||||
variable x index 4
|
variable x index 4
|
||||||
variable y index 2
|
variable y index 2
|
||||||
|
|||||||
@ -60,6 +60,8 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
|
|
||||||
int ncops = force->inumeric(FLERR,arg[3]);
|
int ncops = force->inumeric(FLERR,arg[3]);
|
||||||
|
|
||||||
|
_nbor_pack_width = 1;
|
||||||
|
|
||||||
_precision_mode = PREC_MODE_MIXED;
|
_precision_mode = PREC_MODE_MIXED;
|
||||||
_offload_balance = 1.0;
|
_offload_balance = 1.0;
|
||||||
_overflow_flag[LMP_OVERFLOW] = 0;
|
_overflow_flag[LMP_OVERFLOW] = 0;
|
||||||
@ -307,12 +309,14 @@ void FixIntel::setup(int vflag)
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
void FixIntel::pair_init_check()
|
void FixIntel::pair_init_check(const bool cdmessage)
|
||||||
{
|
{
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
atom->sortfreq = 1;
|
atom->sortfreq = 1;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
_nbor_pack_width = 1;
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_offload_balance != 0.0) atom->sortfreq = 1;
|
if (_offload_balance != 0.0) atom->sortfreq = 1;
|
||||||
|
|
||||||
@ -371,15 +375,12 @@ void FixIntel::pair_init_check()
|
|||||||
char kmode[80];
|
char kmode[80];
|
||||||
if (_precision_mode == PREC_MODE_SINGLE) {
|
if (_precision_mode == PREC_MODE_SINGLE) {
|
||||||
strcpy(kmode, "single");
|
strcpy(kmode, "single");
|
||||||
get_single_buffers()->free_all_nbor_buffers();
|
|
||||||
get_single_buffers()->need_tag(need_tag);
|
get_single_buffers()->need_tag(need_tag);
|
||||||
} else if (_precision_mode == PREC_MODE_MIXED) {
|
} else if (_precision_mode == PREC_MODE_MIXED) {
|
||||||
strcpy(kmode, "mixed");
|
strcpy(kmode, "mixed");
|
||||||
get_mixed_buffers()->free_all_nbor_buffers();
|
|
||||||
get_mixed_buffers()->need_tag(need_tag);
|
get_mixed_buffers()->need_tag(need_tag);
|
||||||
} else {
|
} else {
|
||||||
strcpy(kmode, "double");
|
strcpy(kmode, "double");
|
||||||
get_double_buffers()->free_all_nbor_buffers();
|
|
||||||
get_double_buffers()->need_tag(need_tag);
|
get_double_buffers()->need_tag(need_tag);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -399,6 +400,13 @@ void FixIntel::pair_init_check()
|
|||||||
fprintf(screen,"Using Intel Package without Coprocessor.\n");
|
fprintf(screen,"Using Intel Package without Coprocessor.\n");
|
||||||
}
|
}
|
||||||
fprintf(screen,"Precision: %s\n",kmode);
|
fprintf(screen,"Precision: %s\n",kmode);
|
||||||
|
if (cdmessage) {
|
||||||
|
#ifdef LMP_USE_AVXCD
|
||||||
|
fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
|
||||||
|
#else
|
||||||
|
fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
fprintf(screen,
|
fprintf(screen,
|
||||||
"----------------------------------------------------------\n");
|
"----------------------------------------------------------\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -11,6 +11,10 @@
|
|||||||
See the README file in the top-level LAMMPS directory.
|
See the README file in the top-level LAMMPS directory.
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
|
||||||
|
Contributing author: W. Michael Brown (Intel)
|
||||||
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#ifdef FIX_CLASS
|
#ifdef FIX_CLASS
|
||||||
|
|
||||||
FixStyle(INTEL,FixIntel)
|
FixStyle(INTEL,FixIntel)
|
||||||
@ -39,7 +43,7 @@ class FixIntel : public Fix {
|
|||||||
virtual int setmask();
|
virtual int setmask();
|
||||||
virtual void init();
|
virtual void init();
|
||||||
virtual void setup(int);
|
virtual void setup(int);
|
||||||
void pair_init_check();
|
void pair_init_check(const bool cdmessage=false);
|
||||||
|
|
||||||
// Get all forces, calculation results from coprocesser
|
// Get all forces, calculation results from coprocessor
|
||||||
void sync_coprocessor();
|
void sync_coprocessor();
|
||||||
@ -58,12 +62,15 @@ class FixIntel : public Fix {
|
|||||||
inline IntelBuffers<double,double> * get_double_buffers()
|
inline IntelBuffers<double,double> * get_double_buffers()
|
||||||
{ return _double_buffers; }
|
{ return _double_buffers; }
|
||||||
|
|
||||||
|
inline int nbor_pack_width() const { return _nbor_pack_width; }
|
||||||
|
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
IntelBuffers<float,float> *_single_buffers;
|
IntelBuffers<float,float> *_single_buffers;
|
||||||
IntelBuffers<float,double> *_mixed_buffers;
|
IntelBuffers<float,double> *_mixed_buffers;
|
||||||
IntelBuffers<double,double> *_double_buffers;
|
IntelBuffers<double,double> *_double_buffers;
|
||||||
|
|
||||||
int _precision_mode, _nthreads;
|
int _precision_mode, _nthreads, _nbor_pack_width;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
inline int* get_overflow_flag() { return _overflow_flag; }
|
inline int* get_overflow_flag() { return _overflow_flag; }
|
||||||
|
|||||||
@ -343,11 +343,15 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
|
|||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
|
void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
|
||||||
const int nlocal,
|
const int nlocal,
|
||||||
const int offload_end)
|
const int nthreads,
|
||||||
|
const int offload_end,
|
||||||
|
const int pack_width)
|
||||||
{
|
{
|
||||||
free_nbor_list();
|
free_nbor_list();
|
||||||
_list_alloc_atoms = 1.10 * nlocal;
|
_list_alloc_atoms = 1.10 * nlocal;
|
||||||
int list_alloc_size = (_list_alloc_atoms + _off_threads) * get_max_nbors();
|
int nt = MAX(nthreads, _off_threads);
|
||||||
|
int list_alloc_size = (_list_alloc_atoms + nt + pack_width - 1) *
|
||||||
|
get_max_nbors();
|
||||||
lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
|
lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (offload_end > 0) {
|
if (offload_end > 0) {
|
||||||
@ -393,6 +397,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
|
|||||||
flt_t *ccachew = _ccachew;
|
flt_t *ccachew = _ccachew;
|
||||||
int *ccachei = _ccachei;
|
int *ccachei = _ccachei;
|
||||||
int *ccachej = _ccachej;
|
int *ccachej = _ccachej;
|
||||||
|
#ifdef LMP_USE_AVXCD
|
||||||
|
acc_t *ccachef = _ccachef;
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_off_ccache) {
|
if (_off_ccache) {
|
||||||
@ -409,6 +416,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
|
|||||||
lmp->memory->destroy(ccachew);
|
lmp->memory->destroy(ccachew);
|
||||||
lmp->memory->destroy(ccachei);
|
lmp->memory->destroy(ccachei);
|
||||||
lmp->memory->destroy(ccachej);
|
lmp->memory->destroy(ccachej);
|
||||||
|
#ifdef LMP_USE_AVXCD
|
||||||
|
lmp->memory->destroy(ccachef);
|
||||||
|
#endif
|
||||||
|
|
||||||
_ccachex = 0;
|
_ccachex = 0;
|
||||||
}
|
}
|
||||||
@ -418,7 +428,8 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||||
const int nthreads)
|
const int nthreads,
|
||||||
|
const int width)
|
||||||
{
|
{
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_ccachex && off_flag && _off_ccache == 0)
|
if (_ccachex && off_flag && _off_ccache == 0)
|
||||||
@ -427,7 +438,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
|||||||
if (_ccachex)
|
if (_ccachex)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
const int nsize = get_max_nbors();
|
const int nsize = get_max_nbors() * width;
|
||||||
int esize = MIN(sizeof(int), sizeof(flt_t));
|
int esize = MIN(sizeof(int), sizeof(flt_t));
|
||||||
IP_PRE_get_stride(_ccache_stride, nsize, esize, 0);
|
IP_PRE_get_stride(_ccache_stride, nsize, esize, 0);
|
||||||
int nt = MAX(nthreads, _off_threads);
|
int nt = MAX(nthreads, _off_threads);
|
||||||
@ -439,6 +450,11 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
|||||||
lmp->memory->create(_ccachew, vsize, "_ccachew");
|
lmp->memory->create(_ccachew, vsize, "_ccachew");
|
||||||
lmp->memory->create(_ccachei, vsize, "_ccachei");
|
lmp->memory->create(_ccachei, vsize, "_ccachei");
|
||||||
lmp->memory->create(_ccachej, vsize, "_ccachej");
|
lmp->memory->create(_ccachej, vsize, "_ccachej");
|
||||||
|
#ifdef LMP_USE_AVXCD
|
||||||
|
IP_PRE_get_stride(_ccache_stride3, nsize * 3, esize, 0);
|
||||||
|
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
|
||||||
|
#endif
|
||||||
|
memset(_ccachej, 0, vsize * sizeof(int));
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (off_flag) {
|
if (off_flag) {
|
||||||
@ -454,7 +470,8 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
|||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
|
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
|
||||||
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
|
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
|
||||||
nocopy(ccachei,ccachej:length(vsize) alloc_if(1) free_if(0))
|
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
|
||||||
|
in(ccachej:length(vsize) alloc_if(1) free_if(0))
|
||||||
}
|
}
|
||||||
_off_ccache = 1;
|
_off_ccache = 1;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -75,14 +75,14 @@ class IntelBuffers {
|
|||||||
free_local();
|
free_local();
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void grow_nbor(NeighList *list, const int nlocal,
|
inline void grow_nbor(NeighList *list, const int nlocal, const int nthreads,
|
||||||
const int offload_end) {
|
const int offload_end, const int pack_width=1) {
|
||||||
grow_local(list, offload_end);
|
grow_local(list, offload_end);
|
||||||
if (offload_end) {
|
if (offload_end) {
|
||||||
grow_nmax();
|
grow_nmax();
|
||||||
grow_binhead();
|
grow_binhead();
|
||||||
}
|
}
|
||||||
grow_nbor_list(list, nlocal, offload_end);
|
grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
|
||||||
}
|
}
|
||||||
|
|
||||||
void free_nmax();
|
void free_nmax();
|
||||||
@ -111,7 +111,7 @@ class IntelBuffers {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void free_ccache();
|
void free_ccache();
|
||||||
void grow_ccache(const int off_flag, const int nthreads);
|
void grow_ccache(const int off_flag, const int nthreads, const int width=1);
|
||||||
inline int ccache_stride() { return _ccache_stride; }
|
inline int ccache_stride() { return _ccache_stride; }
|
||||||
inline flt_t * get_ccachex() { return _ccachex; }
|
inline flt_t * get_ccachex() { return _ccachex; }
|
||||||
inline flt_t * get_ccachey() { return _ccachey; }
|
inline flt_t * get_ccachey() { return _ccachey; }
|
||||||
@ -119,6 +119,10 @@ class IntelBuffers {
|
|||||||
inline flt_t * get_ccachew() { return _ccachew; }
|
inline flt_t * get_ccachew() { return _ccachew; }
|
||||||
inline int * get_ccachei() { return _ccachei; }
|
inline int * get_ccachei() { return _ccachei; }
|
||||||
inline int * get_ccachej() { return _ccachej; }
|
inline int * get_ccachej() { return _ccachej; }
|
||||||
|
#ifdef LMP_USE_AVXCD
|
||||||
|
inline int ccache_stride3() { return _ccache_stride3; }
|
||||||
|
inline acc_t * get_ccachef() { return _ccachef; }
|
||||||
|
#endif
|
||||||
|
|
||||||
inline int get_max_nbors() {
|
inline int get_max_nbors() {
|
||||||
int mn = lmp->neighbor->oneatom * sizeof(int) /
|
int mn = lmp->neighbor->oneatom * sizeof(int) /
|
||||||
@ -129,9 +133,10 @@ class IntelBuffers {
|
|||||||
void free_nbor_list();
|
void free_nbor_list();
|
||||||
|
|
||||||
inline void grow_nbor_list(NeighList *list, const int nlocal,
|
inline void grow_nbor_list(NeighList *list, const int nlocal,
|
||||||
const int offload_end) {
|
const int nthreads, const int offload_end,
|
||||||
|
const int pack_width) {
|
||||||
if (nlocal > _list_alloc_atoms)
|
if (nlocal > _list_alloc_atoms)
|
||||||
_grow_nbor_list(list, nlocal, offload_end);
|
_grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
else if (offload_end > 0 && _off_map_stencil != list->stencil)
|
else if (offload_end > 0 && _off_map_stencil != list->stencil)
|
||||||
_grow_stencil(list);
|
_grow_stencil(list);
|
||||||
@ -281,6 +286,10 @@ class IntelBuffers {
|
|||||||
int _ccache_stride;
|
int _ccache_stride;
|
||||||
flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
|
flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
|
||||||
int *_ccachei, *_ccachej;
|
int *_ccachei, *_ccachej;
|
||||||
|
#ifdef LMP_USE_AVXCD
|
||||||
|
int _ccache_stride3;
|
||||||
|
acc_t * _ccachef;
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
int _separate_buffers;
|
int _separate_buffers;
|
||||||
@ -305,8 +314,8 @@ class IntelBuffers {
|
|||||||
void _grow_nmax();
|
void _grow_nmax();
|
||||||
void _grow_local(NeighList *list, const int offload_end);
|
void _grow_local(NeighList *list, const int offload_end);
|
||||||
void _grow_binhead();
|
void _grow_binhead();
|
||||||
void _grow_nbor_list(NeighList *list, const int nlocal,
|
void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
|
||||||
const int offload_end);
|
const int offload_end, const int pack_width);
|
||||||
void _grow_stencil(NeighList *list);
|
void _grow_stencil(NeighList *list);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -55,30 +55,37 @@ enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
|
|||||||
enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||||
TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
|
TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
|
||||||
TIME_IMBALANCE};
|
TIME_IMBALANCE};
|
||||||
#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
|
|
||||||
|
|
||||||
|
#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
|
||||||
#define INTEL_MIC_VECTOR_WIDTH 16
|
#define INTEL_MIC_VECTOR_WIDTH 16
|
||||||
#define INTEL_VECTOR_WIDTH 4
|
#define INTEL_VECTOR_WIDTH 4
|
||||||
|
|
||||||
#ifdef __AVX__
|
#ifdef __AVX__
|
||||||
#undef INTEL_VECTOR_WIDTH
|
#undef INTEL_VECTOR_WIDTH
|
||||||
#define INTEL_VECTOR_WIDTH 8
|
#define INTEL_VECTOR_WIDTH 8
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
#undef INTEL_VECTOR_WIDTH
|
#undef INTEL_VECTOR_WIDTH
|
||||||
#define INTEL_VECTOR_WIDTH 8
|
#define INTEL_VECTOR_WIDTH 8
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __AVX512F__
|
#ifdef __AVX512F__
|
||||||
#undef INTEL_VECTOR_WIDTH
|
#undef INTEL_VECTOR_WIDTH
|
||||||
#define INTEL_VECTOR_WIDTH 16
|
#define INTEL_VECTOR_WIDTH 16
|
||||||
#define INTEL_V512 1
|
#define INTEL_V512 1
|
||||||
#define INTEL_VMASK 1
|
#define INTEL_VMASK 1
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef __MIC__
|
#ifdef __MIC__
|
||||||
#define INTEL_V512 1
|
#define INTEL_V512 1
|
||||||
#define INTEL_VMASK 1
|
#define INTEL_VMASK 1
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
|
#define LMP_USE_AVXCD
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define INTEL_DATA_ALIGN 64
|
#define INTEL_DATA_ALIGN 64
|
||||||
@ -134,6 +141,18 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
|||||||
datasize); \
|
datasize); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
   split inum items among nthreads OpenMP threads in chunks that are a
   whole multiple of vecsize
   sets tid from omp_get_thread_num() and [ifrom,ito) to the range of the
   calling thread; ito is clamped to inum, so trailing threads can receive
   a short or empty (ifrom >= ito) range
   the chunk size is ceil(inum/vecsize/nthreads)*vecsize, evaluated with
   integer arithmetic because a float quotient is inexact once inum
   exceeds 2^24 and could leave the last items unassigned
   assumes inum >= 0 and nthreads, vecsize > 0
------------------------------------------------------------------------- */

#define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,                  \
                                nthreads, vecsize)                      \
  {                                                                     \
    tid = omp_get_thread_num();                                         \
    /* number of vecsize-wide chunks needed to cover inum (rounded up) */ \
    int idelta = (inum) / (vecsize) + (((inum) % (vecsize)) != 0);      \
    /* chunks per thread (rounded up), then back to an item count */    \
    idelta = idelta / (nthreads) + ((idelta % (nthreads)) != 0);        \
    idelta *= (vecsize);                                                \
    ifrom = (tid) * idelta;                                             \
    ito = (ifrom) + idelta;                                             \
    if ((ito) > (inum)) ito = (inum);                                   \
  }
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
|
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
|
||||||
@ -364,6 +383,43 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
   tally energy and virial contributions of one dihedral (atoms i1..i4)
   weight ev_pre is 1.0 when newton is set, otherwise 0.25 for each of
   i1..i4 that is below nlocal
   if eflag: oedihedral += ev_pre*deng and, if eatom, a quarter of deng
     is added to f[i].w of each atom that is owned (or all with newton)
   if vflag: the six virial terms are accumulated into sv0..sv5
   NOTE: flt_t, f and sv0..sv5 are taken from the scope where the macro
   is expanded; the force argument is accepted but not referenced, so
   the per-atom energy always goes to the array named f
   all arguments are parenthesized so expression arguments (sums,
   negated values) are combined with the intended precedence
------------------------------------------------------------------------- */

#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,   \
                              f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,  \
                              vb3x, vb3y, vb3z,oedihedral, force,       \
                              newton, nlocal)                           \
{                                                                       \
  flt_t ev_pre;                                                         \
  if (newton) ev_pre = (flt_t)1.0;                                      \
  else {                                                                \
    ev_pre = (flt_t)0.0;                                                \
    if ((i1) < (nlocal)) ev_pre += (flt_t)0.25;                         \
    if ((i2) < (nlocal)) ev_pre += (flt_t)0.25;                         \
    if ((i3) < (nlocal)) ev_pre += (flt_t)0.25;                         \
    if ((i4) < (nlocal)) ev_pre += (flt_t)0.25;                         \
  }                                                                     \
                                                                        \
  if (eflag) {                                                          \
    oedihedral += ev_pre * (deng);                                      \
    if (eatom) {                                                        \
      flt_t qdeng = (deng) * (flt_t)0.25;                               \
      if ((newton) || (i1) < (nlocal)) f[i1].w += qdeng;                \
      if ((newton) || (i2) < (nlocal)) f[i2].w += qdeng;                \
      if ((newton) || (i3) < (nlocal)) f[i3].w += qdeng;                \
      if ((newton) || (i4) < (nlocal)) f[i4].w += qdeng;                \
    }                                                                   \
  }                                                                     \
                                                                        \
  if (vflag) {                                                          \
    sv0 += ev_pre * ((vb1x)*(f1x) + (vb2x)*(f3x) +                      \
                     ((vb3x)+(vb2x))*(f4x));                            \
    sv1 += ev_pre * ((vb1y)*(f1y) + (vb2y)*(f3y) +                      \
                     ((vb3y)+(vb2y))*(f4y));                            \
    sv2 += ev_pre * ((vb1z)*(f1z) + (vb2z)*(f3z) +                      \
                     ((vb3z)+(vb2z))*(f4z));                            \
    sv3 += ev_pre * ((vb1x)*(f1y) + (vb2x)*(f3y) +                      \
                     ((vb3x)+(vb2x))*(f4y));                            \
    sv4 += ev_pre * ((vb1x)*(f1z) + (vb2x)*(f3z) +                      \
                     ((vb3x)+(vb2x))*(f4z));                            \
    sv5 += ev_pre * ((vb1y)*(f1z) + (vb2y)*(f3z) +                      \
                     ((vb3y)+(vb2y))*(f4z));                            \
  }                                                                     \
}
|
||||||
|
|
||||||
#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp) \
|
#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp) \
|
||||||
{ \
|
{ \
|
||||||
if (evflag) { \
|
if (evflag) { \
|
||||||
|
|||||||
@ -351,4 +351,128 @@
|
|||||||
ans##_0 = (aug_3 - t) / aug_0; \
|
ans##_0 = (aug_3 - t) / aug_0; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
   rescale a quaternion to unit length
   q is a name prefix: the four components are the scalars
   q_w, q_i, q_j, q_k, which are overwritten in place
------------------------------------------------------------------------- */

#define ME_qnormalize(q)                                                \
{                                                                       \
  const double inv_len = 1.0 /                                          \
    sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k);        \
  q##_w *= inv_len;                                                     \
  q##_i *= inv_len;                                                     \
  q##_j *= inv_len;                                                     \
  q##_k *= inv_len;                                                     \
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
   compute omega from angular momentum
   w = omega = angular velocity in space frame
   wbody = angular velocity in body frame
   project space-frame angular momentum onto body axes,
   scale each body component by moments_0..2, and rotate back
   NOTE: the components are MULTIPLIED by moments_N, so the arguments
   presumably hold reciprocal principal moments - confirm at call sites
   m, quat and w are name prefixes: the macro reads m_0..m_2 and
   quat_w, quat_i, quat_j, quat_k and writes w_0..w_2
   the moments arguments are copied before any other local is declared
   so that an argument expression using a name such as i2 or w2 cannot
   be captured by the temporaries of this block
------------------------------------------------------------------------- */

#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w)     \
{                                                                       \
  const double me_mom0_ = (moments_0);                                  \
  const double me_mom1_ = (moments_1);                                  \
  const double me_mom2_ = (moments_2);                                  \
                                                                        \
  double wbody_0, wbody_1, wbody_2;                                     \
  double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \
                                                                        \
  double w2 = quat##_w * quat##_w;                                      \
  double i2 = quat##_i * quat##_i;                                      \
  double j2 = quat##_j * quat##_j;                                      \
  double k2 = quat##_k * quat##_k;                                      \
  double twoij = 2.0 * quat##_i * quat##_j;                             \
  double twoik = 2.0 * quat##_i * quat##_k;                             \
  double twojk = 2.0 * quat##_j * quat##_k;                             \
  double twoiw = 2.0 * quat##_i * quat##_w;                             \
  double twojw = 2.0 * quat##_j * quat##_w;                             \
  double twokw = 2.0 * quat##_k * quat##_w;                             \
                                                                        \
  /* row-major 3x3 rotation matrix built from the quaternion */         \
  rot_0 = w2 + i2 - j2 - k2;                                            \
  rot_1 = twoij - twokw;                                                \
  rot_2 = twojw + twoik;                                                \
                                                                        \
  rot_3 = twoij + twokw;                                                \
  rot_4 = w2 - i2 + j2 - k2;                                            \
  rot_5 = twojk - twoiw;                                                \
                                                                        \
  rot_6 = twoik - twojw;                                                \
  rot_7 = twojk + twoiw;                                                \
  rot_8 = w2 - i2 - j2 + k2;                                            \
                                                                        \
  /* transpose(rot) * m */                                              \
  wbody_0 = rot_0*m##_0 + rot_3*m##_1 + rot_6*m##_2;                    \
  wbody_1 = rot_1*m##_0 + rot_4*m##_1 + rot_7*m##_2;                    \
  wbody_2 = rot_2*m##_0 + rot_5*m##_1 + rot_8*m##_2;                    \
                                                                        \
  wbody_0 *= me_mom0_;                                                  \
  wbody_1 *= me_mom1_;                                                  \
  wbody_2 *= me_mom2_;                                                  \
                                                                        \
  /* rot * wbody */                                                     \
  w##_0 = rot_0*wbody_0 + rot_1*wbody_1 + rot_2*wbody_2;                \
  w##_1 = rot_3*wbody_0 + rot_4*wbody_1 + rot_5*wbody_2;                \
  w##_2 = rot_6*wbody_0 + rot_7*wbody_1 + rot_8*wbody_2;                \
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
   advance angular momentum and orientation of one body
   angmomin[0..2] += dtf * torque[0..2]            (updated in place)
   quatin[0..3] (w,i,j,k) is then advanced with step dtq by combining
   one full step and two half steps as 2*q_half - q_full, normalizing
   after every step, and is written back in place
   i0, i1, i2 are forwarded as the moments arguments of ME_mq_to_omega
   requires ME_mq_to_omega and ME_qnormalize
   scalar arguments are copied into locals first and array arguments are
   parenthesized, so expression arguments keep their intended precedence
   and cannot be captured by the temporaries declared below or by those
   of the nested macros
------------------------------------------------------------------------- */

#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2)    \
{                                                                       \
  const double me_rich_dtf_ = (dtf);                                    \
  const double me_rich_dtq_ = (dtq);                                    \
  const double me_rich_i0_ = (i0);                                      \
  const double me_rich_i1_ = (i1);                                      \
  const double me_rich_i2_ = (i2);                                      \
                                                                        \
  (angmomin)[0] += me_rich_dtf_ * (torque)[0];                          \
  double angmom_0 = (angmomin)[0];                                      \
  (angmomin)[1] += me_rich_dtf_ * (torque)[1];                          \
  double angmom_1 = (angmomin)[1];                                      \
  (angmomin)[2] += me_rich_dtf_ * (torque)[2];                          \
  double angmom_2 = (angmomin)[2];                                      \
                                                                        \
  double quat_w = (quatin)[0];                                          \
  double quat_i = (quatin)[1];                                          \
  double quat_j = (quatin)[2];                                          \
  double quat_k = (quatin)[3];                                          \
                                                                        \
  double omega_0, omega_1, omega_2;                                     \
  ME_mq_to_omega(angmom,quat,me_rich_i0_,me_rich_i1_,me_rich_i2_,omega); \
                                                                        \
  double wq_0, wq_1, wq_2, wq_3;                                        \
  wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k;             \
  wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j;              \
  wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k;              \
  wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i;              \
                                                                        \
  /* one full step */                                                   \
  double qfull_w, qfull_i, qfull_j, qfull_k;                            \
  qfull_w = quat_w + me_rich_dtq_ * wq_0;                               \
  qfull_i = quat_i + me_rich_dtq_ * wq_1;                               \
  qfull_j = quat_j + me_rich_dtq_ * wq_2;                               \
  qfull_k = quat_k + me_rich_dtq_ * wq_3;                               \
  ME_qnormalize(qfull);                                                 \
                                                                        \
  /* first half step */                                                 \
  double qhalf_w, qhalf_i, qhalf_j, qhalf_k;                            \
  qhalf_w = quat_w + 0.5*me_rich_dtq_ * wq_0;                           \
  qhalf_i = quat_i + 0.5*me_rich_dtq_ * wq_1;                           \
  qhalf_j = quat_j + 0.5*me_rich_dtq_ * wq_2;                           \
  qhalf_k = quat_k + 0.5*me_rich_dtq_ * wq_3;                           \
  ME_qnormalize(qhalf);                                                 \
                                                                        \
  /* second half step, using omega re-evaluated at the half-step quat */ \
  ME_mq_to_omega(angmom,qhalf,me_rich_i0_,me_rich_i1_,me_rich_i2_,omega); \
  wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k;          \
  wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j;           \
  wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k;           \
  wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i;           \
                                                                        \
  qhalf_w += 0.5*me_rich_dtq_ * wq_0;                                   \
  qhalf_i += 0.5*me_rich_dtq_ * wq_1;                                   \
  qhalf_j += 0.5*me_rich_dtq_ * wq_2;                                   \
  qhalf_k += 0.5*me_rich_dtq_ * wq_3;                                   \
  ME_qnormalize(qhalf);                                                 \
                                                                        \
  /* combine the two estimates */                                       \
  quat_w = 2.0*qhalf_w - qfull_w;                                       \
  quat_i = 2.0*qhalf_i - qfull_i;                                       \
  quat_j = 2.0*qhalf_j - qfull_j;                                       \
  quat_k = 2.0*qhalf_k - qfull_k;                                       \
  ME_qnormalize(quat);                                                  \
                                                                        \
  (quatin)[0] = quat_w;                                                 \
  (quatin)[1] = quat_i;                                                 \
  (quatin)[2] = quat_j;                                                 \
  (quatin)[3] = quat_k;                                                 \
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -15,6 +15,8 @@
|
|||||||
Contributing author: W. Michael Brown (Intel)
|
Contributing author: W. Michael Brown (Intel)
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
//#define OUTER_CHUNK 1
|
||||||
|
|
||||||
#include "neighbor.h"
|
#include "neighbor.h"
|
||||||
#include "neigh_list.h"
|
#include "neigh_list.h"
|
||||||
#include "atom.h"
|
#include "atom.h"
|
||||||
@ -26,6 +28,14 @@
|
|||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef LMP_USE_AVXCD
|
||||||
|
#include "intel_simd.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef OUTER_CHUNK
|
||||||
|
#include "intel_simd.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
using namespace LAMMPS_NS;
|
using namespace LAMMPS_NS;
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -42,17 +52,11 @@ using namespace LAMMPS_NS;
|
|||||||
for (int s = 0; s < n3; s++) { \
|
for (int s = 0; s < n3; s++) { \
|
||||||
if (sptr[s] == tag) { \
|
if (sptr[s] == tag) { \
|
||||||
if (s < n1) { \
|
if (s < n1) { \
|
||||||
if (special_flag[1] == 0) which = -1; \
|
which = 1; \
|
||||||
else if (special_flag[1] == 1) which = 0; \
|
|
||||||
else which = 1; \
|
|
||||||
} else if (s < n2) { \
|
} else if (s < n2) { \
|
||||||
if (special_flag[2] == 0) which = -1; \
|
which = 2; \
|
||||||
else if (special_flag[2] == 1) which = 0; \
|
|
||||||
else which = 2; \
|
|
||||||
} else { \
|
} else { \
|
||||||
if (special_flag[3] == 0) which = -1; \
|
which = 3; \
|
||||||
else if (special_flag[3] == 1) which = 0; \
|
|
||||||
else which = 3; \
|
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
@ -199,7 +203,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
if (offload) {
|
if (offload) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
||||||
buffers->grow_nbor(list, atom->nlocal, aend);
|
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
|
||||||
|
|
||||||
ATOM_T biga;
|
ATOM_T biga;
|
||||||
biga.x = INTEL_BIGP;
|
biga.x = INTEL_BIGP;
|
||||||
@ -335,7 +339,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
signal(tag)
|
signal(tag)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -494,12 +498,12 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
if (j >= nlocal) {
|
if (j >= nlocal) {
|
||||||
if (j == nall)
|
if (j == nall)
|
||||||
jlist[jj] = nall_offset;
|
jlist[jj] = nall_offset;
|
||||||
else if (which > 0)
|
else if (which)
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||||
else jlist[jj]-=ghost_offset;
|
else jlist[jj]-=ghost_offset;
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
|
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -520,7 +524,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // end omp
|
} // end omp
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
#endif
|
#endif
|
||||||
} // end offload
|
} // end offload
|
||||||
@ -688,7 +692,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
if (offload) {
|
if (offload) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
||||||
buffers->grow_nbor(list, atom->nlocal, aend);
|
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
|
||||||
|
|
||||||
ATOM_T biga;
|
ATOM_T biga;
|
||||||
biga.x = INTEL_BIGP;
|
biga.x = INTEL_BIGP;
|
||||||
@ -827,7 +831,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
signal(tag)
|
signal(tag)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -848,33 +852,47 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
|
|
||||||
const int num = aend - astart;
|
const int num = aend - astart;
|
||||||
int tid, ifrom, ito;
|
int tid, ifrom, ito;
|
||||||
|
|
||||||
|
#ifdef OUTER_CHUNK
|
||||||
|
const int swidth = ip_simd::SIMD_type<flt_t>::width();
|
||||||
|
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth);
|
||||||
|
ifrom += astart;
|
||||||
|
ito += astart;
|
||||||
|
int e_ito = ito;
|
||||||
|
if (ito == num) {
|
||||||
|
int imod = ito % swidth;
|
||||||
|
if (imod) e_ito += swidth - e_ito;
|
||||||
|
}
|
||||||
|
const int list_size = (e_ito + tid + 1) * maxnbors;
|
||||||
|
#else
|
||||||
|
const int swidth = 1;
|
||||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||||
ifrom += astart;
|
ifrom += astart;
|
||||||
ito += astart;
|
ito += astart;
|
||||||
|
const int list_size = (ito + tid + 1) * maxnbors;
|
||||||
|
#endif
|
||||||
|
|
||||||
int which;
|
int which;
|
||||||
|
|
||||||
const int list_size = (ito + tid + 1) * maxnbors;
|
int pack_offset = maxnbors * swidth;
|
||||||
int ct = (ifrom + tid) * maxnbors;
|
int ct = (ifrom + tid) * maxnbors;
|
||||||
int *neighptr = firstneigh + ct;
|
int *neighptr = firstneigh + ct;
|
||||||
|
|
||||||
|
int max_chunk = 0;
|
||||||
|
int lane = 0;
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
int j, k, n, n2, itype, jtype, ibin;
|
const flt_t xtmp = x[i].x;
|
||||||
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
|
const flt_t ytmp = x[i].y;
|
||||||
|
const flt_t ztmp = x[i].z;
|
||||||
n = 0;
|
const int itype = x[i].w;
|
||||||
n2 = maxnbors;
|
|
||||||
|
|
||||||
xtmp = x[i].x;
|
|
||||||
ytmp = x[i].y;
|
|
||||||
ztmp = x[i].z;
|
|
||||||
itype = x[i].w;
|
|
||||||
const int ioffset = ntypes * itype;
|
const int ioffset = ntypes * itype;
|
||||||
|
|
||||||
// loop over rest of atoms in i's bin, ghosts are at end of linked list
|
// loop over rest of atoms in i's bin, ghosts are at end of linked list
|
||||||
// if j is owned atom, store it, since j is beyond i in linked list
|
// if j is owned atom, store it, since j is beyond i in linked list
|
||||||
// if j is ghost, only store if j coords are "above/to the right" of i
|
// if j is ghost, only store if j coords are "above/to the right" of i
|
||||||
|
|
||||||
for (j = bins[i]; j >= 0; j = bins[j]) {
|
int raw_count = pack_offset;
|
||||||
|
for (int j = bins[i]; j >= 0; j = bins[j]) {
|
||||||
if (j >= nlocal) {
|
if (j >= nlocal) {
|
||||||
if (offload_noghost && offload) continue;
|
if (offload_noghost && offload) continue;
|
||||||
if (x[j].z < ztmp) continue;
|
if (x[j].z < ztmp) continue;
|
||||||
@ -884,116 +902,145 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
}
|
}
|
||||||
} else if (offload_noghost && i < offload_end) continue;
|
} else if (offload_noghost && i < offload_end) continue;
|
||||||
|
|
||||||
jtype = x[j].w;
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
if (exclude) {
|
||||||
|
const int jtype = x[j].w;
|
||||||
|
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
delx = xtmp - x[j].x;
|
neighptr[raw_count++] = j;
|
||||||
dely = ytmp - x[j].y;
|
}
|
||||||
delz = ztmp - x[j].z;
|
|
||||||
rsq = delx * delx + dely * dely + delz * delz;
|
|
||||||
|
|
||||||
if (rsq <= cutneighsq[ioffset + jtype]) {
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[n++] = -j - 1;
|
|
||||||
else
|
|
||||||
neighptr[n++] = j;
|
|
||||||
} else
|
|
||||||
neighptr[n++] = j;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < lmin) lmin = j;
|
|
||||||
if (j > lmax) lmax = j;
|
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[n2++] = -j - 1;
|
|
||||||
else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
} else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < gmin) gmin = j;
|
|
||||||
if (j > gmax) gmax = j;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// loop over all atoms in other bins in stencil, store every pair
|
// loop over all atoms in other bins in stencil, store every pair
|
||||||
|
|
||||||
ibin = atombin[i];
|
const int ibin = atombin[i];
|
||||||
|
for (int k = 0; k < nstencil; k++) {
|
||||||
for (k = 0; k < nstencil; k++) {
|
for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
|
||||||
for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
|
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
if (j < nlocal) {
|
if (j < nlocal) {
|
||||||
if (i < offload_end) continue;
|
if (i < offload_end) continue;
|
||||||
} else if (offload) continue;
|
} else if (offload) continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
jtype = x[j].w;
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
if (exclude) {
|
||||||
|
const int jtype = x[j].w;
|
||||||
|
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
delx = xtmp - x[j].x;
|
neighptr[raw_count++] = j;
|
||||||
dely = ytmp - x[j].y;
|
}
|
||||||
delz = ztmp - x[j].z;
|
|
||||||
rsq = delx * delx + dely * dely + delz * delz;
|
|
||||||
if (rsq <= cutneighsq[ioffset + jtype]) {
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[n++] = -j - 1;
|
|
||||||
else
|
|
||||||
neighptr[n++] = j;
|
|
||||||
} else
|
|
||||||
neighptr[n++] = j;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < lmin) lmin = j;
|
|
||||||
if (j > lmax) lmax = j;
|
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[n2++] = -j - 1;
|
|
||||||
else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
} else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < gmin) gmin = j;
|
|
||||||
if (j > gmax) gmax = j;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ilist[i] = i;
|
|
||||||
|
|
||||||
cnumneigh[i] = ct;
|
|
||||||
if (n > maxnbors) *overflow = 1;
|
|
||||||
for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
|
|
||||||
while( (n % pad_width) != 0 ) neighptr[n++] = e_nall;
|
|
||||||
numneigh[i] = n;
|
|
||||||
while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
|
|
||||||
ct += n;
|
|
||||||
neighptr += n;
|
|
||||||
if (ct + n + maxnbors > list_size) {
|
|
||||||
*overflow = 1;
|
|
||||||
ct = (ifrom + tid) * maxnbors;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||||
|
#else
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
for (int u = pack_offset; u < raw_count; u++) {
|
||||||
|
int j = neighptr[u];
|
||||||
|
const flt_t delx = xtmp - x[j].x;
|
||||||
|
const flt_t dely = ytmp - x[j].y;
|
||||||
|
const flt_t delz = ztmp - x[j].z;
|
||||||
|
const int jtype = x[j].w;
|
||||||
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
if (rsq > cutneighsq[ioffset + jtype])
|
||||||
|
neighptr[u] = e_nall;
|
||||||
|
else {
|
||||||
|
if (need_ic) {
|
||||||
|
int no_special;
|
||||||
|
ominimum_image_check(no_special, delx, dely, delz);
|
||||||
|
if (no_special)
|
||||||
|
neighptr[u] = -j - 1;
|
||||||
|
}
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (j < nlocal) {
|
||||||
|
if (j < vlmin) vlmin = j;
|
||||||
|
if (j > vlmax) vlmax = j;
|
||||||
|
} else {
|
||||||
|
if (j < vgmin) vgmin = j;
|
||||||
|
if (j > vgmax) vgmax = j;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
lmin = MIN(lmin,vlmin);
|
||||||
|
gmin = MIN(gmin,vgmin);
|
||||||
|
lmax = MAX(lmax,vlmax);
|
||||||
|
gmax = MAX(gmax,vgmax);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int n = lane, n2 = pack_offset;
|
||||||
|
for (int u = pack_offset; u < raw_count; u++) {
|
||||||
|
const int j = neighptr[u];
|
||||||
|
int pj = j;
|
||||||
|
if (pj < e_nall) {
|
||||||
|
if (need_ic)
|
||||||
|
if (pj < 0) pj = -pj - 1;
|
||||||
|
|
||||||
|
if (pj < nlocal) {
|
||||||
|
neighptr[n] = j;
|
||||||
|
n += swidth;
|
||||||
|
} else
|
||||||
|
neighptr[n2++] = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int ns = (n - lane) / swidth;
|
||||||
|
if (ns > maxnbors || n2 > list_size) *overflow = 1;
|
||||||
|
for (int u = pack_offset; u < n2; u++) {
|
||||||
|
neighptr[n] = neighptr[u];
|
||||||
|
n += swidth;
|
||||||
|
}
|
||||||
|
|
||||||
|
ilist[i] = i;
|
||||||
|
cnumneigh[i] = ct + lane;
|
||||||
|
ns += n2 - pack_offset;
|
||||||
|
#ifndef OUTER_CHUNK
|
||||||
|
while( (ns % pad_width) != 0 ) neighptr[ns++] = e_nall;
|
||||||
|
#endif
|
||||||
|
numneigh[i] = ns;
|
||||||
|
|
||||||
|
#ifdef OUTER_CHUNK
|
||||||
|
if (ns > max_chunk) max_chunk = ns;
|
||||||
|
lane++;
|
||||||
|
pack_offset -= maxnbors;
|
||||||
|
if (lane == swidth) {
|
||||||
|
ct += max_chunk * swidth;
|
||||||
|
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||||
|
const int edge = (ct % alignb);
|
||||||
|
if (edge) ct += alignb - edge;
|
||||||
|
neighptr = firstneigh + ct;
|
||||||
|
max_chunk = 0;
|
||||||
|
pack_offset = maxnbors * swidth;
|
||||||
|
lane = 0;
|
||||||
|
if (ct + pack_offset + maxnbors > list_size) {
|
||||||
|
if (i < ito - 1) {
|
||||||
|
*overflow = 1;
|
||||||
|
ct = (ifrom + tid) * maxnbors;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
ct += ns;
|
||||||
|
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||||
|
const int edge = (ct % alignb);
|
||||||
|
if (edge) ct += alignb - edge;
|
||||||
|
neighptr = firstneigh + ct;
|
||||||
|
if (ct + pack_offset + maxnbors > list_size) {
|
||||||
|
if (i < ito - 1) {
|
||||||
|
*overflow = 1;
|
||||||
|
ct = (ifrom + tid) * maxnbors;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if (*overflow == 1)
|
if (*overflow == 1)
|
||||||
@ -1032,7 +1079,16 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
for (int i = ifrom; i < ito; ++i) {
|
for (int i = ifrom; i < ito; ++i) {
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
|
#ifndef OUTER_CHUNK
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd
|
||||||
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
|
#else
|
||||||
|
const int trip = jnum * swidth;
|
||||||
|
for (int jj = 0; jj < trip; jj+= swidth) {
|
||||||
|
#endif
|
||||||
const int j = jlist[jj];
|
const int j = jlist[jj];
|
||||||
if (need_ic && j < 0) {
|
if (need_ic && j < 0) {
|
||||||
which = 0;
|
which = 0;
|
||||||
@ -1044,12 +1100,12 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
if (j >= nlocal) {
|
if (j >= nlocal) {
|
||||||
if (j == e_nall)
|
if (j == e_nall)
|
||||||
jlist[jj] = nall_offset;
|
jlist[jj] = nall_offset;
|
||||||
else if (which > 0)
|
else if (which)
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||||
else jlist[jj]-=ghost_offset;
|
else jlist[jj]-=ghost_offset;
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
|
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1070,7 +1126,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // end omp
|
} // end omp
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
#endif
|
#endif
|
||||||
} // end offload
|
} // end offload
|
||||||
@ -1238,7 +1294,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
|
|||||||
if (offload) {
|
if (offload) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
||||||
buffers->grow_nbor(list, atom->nlocal, aend);
|
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
|
||||||
|
|
||||||
ATOM_T biga;
|
ATOM_T biga;
|
||||||
biga.x = INTEL_BIGP;
|
biga.x = INTEL_BIGP;
|
||||||
@ -1377,7 +1433,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
|
|||||||
signal(tag)
|
signal(tag)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -1550,12 +1606,12 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
|
|||||||
if (j >= nlocal) {
|
if (j >= nlocal) {
|
||||||
if (j == e_nall)
|
if (j == e_nall)
|
||||||
jlist[jj] = nall_offset;
|
jlist[jj] = nall_offset;
|
||||||
else if (which > 0)
|
else if (which)
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||||
else jlist[jj]-=ghost_offset;
|
else jlist[jj]-=ghost_offset;
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
|
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1576,7 +1632,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // end omp
|
} // end omp
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
#endif
|
#endif
|
||||||
} // end offload
|
} // end offload
|
||||||
@ -1741,10 +1797,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
|||||||
const int nall = atom->nlocal + atom->nghost;
|
const int nall = atom->nlocal + atom->nghost;
|
||||||
int pad = 1;
|
int pad = 1;
|
||||||
|
|
||||||
|
const int pack_width = fix->nbor_pack_width();
|
||||||
|
|
||||||
if (offload) {
|
if (offload) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
|
||||||
buffers->grow_nbor(list, atom->nlocal, aend);
|
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend, pack_width);
|
||||||
|
|
||||||
ATOM_T biga;
|
ATOM_T biga;
|
||||||
biga.x = INTEL_BIGP;
|
biga.x = INTEL_BIGP;
|
||||||
@ -1871,7 +1929,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
|||||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
||||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
||||||
in(special_flag:length(0) alloc_if(0) free_if(0)) \
|
in(special_flag:length(0) alloc_if(0) free_if(0)) \
|
||||||
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
|
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width) \
|
||||||
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
|
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
|
||||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
||||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
||||||
@ -1879,7 +1937,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
|||||||
signal(tag)
|
signal(tag)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -1900,36 +1958,40 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
|||||||
|
|
||||||
const int num = aend - astart;
|
const int num = aend - astart;
|
||||||
int tid, ifrom, ito;
|
int tid, ifrom, ito;
|
||||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
|
||||||
|
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
|
||||||
ifrom += astart;
|
ifrom += astart;
|
||||||
ito += astart;
|
ito += astart;
|
||||||
|
int e_ito = ito;
|
||||||
|
if (ito == num) {
|
||||||
|
int imod = ito % pack_width;
|
||||||
|
if (imod) e_ito += pack_width - e_ito;
|
||||||
|
}
|
||||||
|
const int list_size = (e_ito + tid + 1) * maxnbors;
|
||||||
|
|
||||||
int which;
|
int which;
|
||||||
|
|
||||||
const int list_size = (ito + tid + 1) * maxnbors;
|
int pack_offset = maxnbors * pack_width;
|
||||||
int ct = (ifrom + tid) * maxnbors;
|
int ct = (ifrom + tid) * maxnbors;
|
||||||
int *neighptr = firstneigh + ct;
|
int *neighptr = firstneigh + ct;
|
||||||
|
|
||||||
|
int max_chunk = 0;
|
||||||
|
int lane = 0;
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
int j, k, n, n2, itype, jtype, ibin;
|
const flt_t xtmp = x[i].x;
|
||||||
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
|
const flt_t ytmp = x[i].y;
|
||||||
|
const flt_t ztmp = x[i].z;
|
||||||
n = 0;
|
const int itype = x[i].w;
|
||||||
n2 = maxnbors;
|
|
||||||
|
|
||||||
xtmp = x[i].x;
|
|
||||||
ytmp = x[i].y;
|
|
||||||
ztmp = x[i].z;
|
|
||||||
itype = x[i].w;
|
|
||||||
const tagint itag = tag[i];
|
const tagint itag = tag[i];
|
||||||
const int ioffset = ntypes * itype;
|
const int ioffset = ntypes * itype;
|
||||||
|
|
||||||
|
const int ibin = atombin[i];
|
||||||
|
int raw_count = pack_offset;
|
||||||
|
|
||||||
// loop over all atoms in surrounding bins in stencil including self
|
// loop over all atoms in surrounding bins in stencil including self
|
||||||
// skip i = j
|
// skip i = j
|
||||||
|
for (int k = 0; k < nstencil; k++) {
|
||||||
ibin = atombin[i];
|
for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
|
||||||
|
|
||||||
for (k = 0; k < nstencil; k++) {
|
|
||||||
for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
|
|
||||||
if (i == j) continue;
|
if (i == j) continue;
|
||||||
|
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
@ -1938,76 +2000,121 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
|||||||
} else if (offload) continue;
|
} else if (offload) continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
jtype = x[j].w;
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
if (exclude) {
|
||||||
#endif
|
const int jtype = x[j].w;
|
||||||
|
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||||
delx = xtmp - x[j].x;
|
|
||||||
dely = ytmp - x[j].y;
|
|
||||||
delz = ztmp - x[j].z;
|
|
||||||
rsq = delx * delx + dely * dely + delz * delz;
|
|
||||||
if (rsq <= cutneighsq[ioffset + jtype]) {
|
|
||||||
const int jtag = tag[j];
|
|
||||||
int flist = 0;
|
|
||||||
if (itag > jtag) {
|
|
||||||
if ((itag+jtag) % 2 == 0) flist = 1;
|
|
||||||
} else if (itag < jtag) {
|
|
||||||
if ((itag+jtag) % 2 == 1) flist = 1;
|
|
||||||
} else {
|
|
||||||
if (x[j].z < ztmp) flist = 1;
|
|
||||||
else if (x[j].z == ztmp && x[j].y < ytmp) flist = 1;
|
|
||||||
else if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp)
|
|
||||||
flist = 1;
|
|
||||||
}
|
|
||||||
if (flist) {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[n2++] = -j - 1;
|
|
||||||
else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
} else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
} else {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[n++] = -j - 1;
|
|
||||||
else
|
|
||||||
neighptr[n++] = j;
|
|
||||||
} else
|
|
||||||
neighptr[n++] = j;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (j < lmin) lmin = j;
|
|
||||||
if (j > lmax) lmax = j;
|
|
||||||
} else {
|
|
||||||
if (j < gmin) gmin = j;
|
|
||||||
if (j > gmax) gmax = j;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
neighptr[raw_count++] = j;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ilist[i] = i;
|
|
||||||
|
|
||||||
cnumneigh[i] = ct;
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
if (n > maxnbors) *overflow = 1;
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
atombin[i] = n;
|
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||||
for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
|
#pragma vector aligned
|
||||||
numneigh[i] = n;
|
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||||
while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
|
#else
|
||||||
ct += n;
|
#pragma vector aligned
|
||||||
neighptr += n;
|
#pragma simd
|
||||||
if (ct + n + maxnbors > list_size) {
|
#endif
|
||||||
*overflow = 1;
|
#endif
|
||||||
ct = (ifrom + tid) * maxnbors;
|
for (int u = pack_offset; u < raw_count; u++) {
|
||||||
|
int j = neighptr[u];
|
||||||
|
const flt_t delx = xtmp - x[j].x;
|
||||||
|
const flt_t dely = ytmp - x[j].y;
|
||||||
|
const flt_t delz = ztmp - x[j].z;
|
||||||
|
const int jtype = x[j].w;
|
||||||
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
if (rsq > cutneighsq[ioffset + jtype])
|
||||||
|
neighptr[u] = e_nall;
|
||||||
|
else {
|
||||||
|
if (need_ic) {
|
||||||
|
int no_special;
|
||||||
|
ominimum_image_check(no_special, delx, dely, delz);
|
||||||
|
if (no_special)
|
||||||
|
neighptr[u] = -j - 1;
|
||||||
|
}
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (j < nlocal) {
|
||||||
|
if (j < vlmin) vlmin = j;
|
||||||
|
if (j > vlmax) vlmax = j;
|
||||||
|
} else {
|
||||||
|
if (j < vgmin) vgmin = j;
|
||||||
|
if (j > vgmax) vgmax = j;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
lmin = MIN(lmin,vlmin);
|
||||||
|
gmin = MIN(gmin,vgmin);
|
||||||
|
lmax = MAX(lmax,vlmax);
|
||||||
|
gmax = MAX(gmax,vgmax);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int n = lane, n2 = pack_offset;
|
||||||
|
for (int u = pack_offset; u < raw_count; u++) {
|
||||||
|
const int j = neighptr[u];
|
||||||
|
int pj = j;
|
||||||
|
if (pj < e_nall) {
|
||||||
|
if (need_ic)
|
||||||
|
if (pj < 0) pj = -pj - 1;
|
||||||
|
|
||||||
|
const int jtag = tag[pj];
|
||||||
|
int flist = 0;
|
||||||
|
if (itag > jtag) {
|
||||||
|
if ((itag+jtag) % 2 == 0) flist = 1;
|
||||||
|
} else if (itag < jtag) {
|
||||||
|
if ((itag+jtag) % 2 == 1) flist = 1;
|
||||||
|
} else {
|
||||||
|
if (x[pj].z < ztmp) flist = 1;
|
||||||
|
else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1;
|
||||||
|
else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp)
|
||||||
|
flist = 1;
|
||||||
|
}
|
||||||
|
if (flist) {
|
||||||
|
neighptr[n2++] = j;
|
||||||
|
} else {
|
||||||
|
neighptr[n] = j;
|
||||||
|
n += pack_width;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
int ns = (n - lane) / pack_width;
|
||||||
|
if (ns > maxnbors || n2 > list_size) *overflow = 1;
|
||||||
|
atombin[i] = ns;
|
||||||
|
for (int u = pack_offset; u < n2; u++) {
|
||||||
|
neighptr[n] = neighptr[u];
|
||||||
|
n += pack_width;
|
||||||
|
}
|
||||||
|
|
||||||
|
ilist[i] = i;
|
||||||
|
cnumneigh[i] = ct + lane;
|
||||||
|
ns += n2 - pack_offset;
|
||||||
|
numneigh[i] = ns;
|
||||||
|
|
||||||
|
if (ns > max_chunk) max_chunk = ns;
|
||||||
|
lane++;
|
||||||
|
pack_offset -= maxnbors;
|
||||||
|
if (lane == pack_width) {
|
||||||
|
ct += max_chunk * pack_width;
|
||||||
|
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||||
|
const int edge = (ct % alignb);
|
||||||
|
if (edge) ct += alignb - edge;
|
||||||
|
neighptr = firstneigh + ct;
|
||||||
|
max_chunk = 0;
|
||||||
|
pack_offset = maxnbors * pack_width;
|
||||||
|
lane = 0;
|
||||||
|
if (ct + pack_offset + maxnbors > list_size) {
|
||||||
|
if (i < ito - 1) {
|
||||||
|
*overflow = 1;
|
||||||
|
ct = (ifrom + tid) * maxnbors;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (*overflow == 1)
|
if (*overflow == 1)
|
||||||
@ -2046,7 +2153,9 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
|||||||
for (int i = ifrom; i < ito; ++i) {
|
for (int i = ifrom; i < ito; ++i) {
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
|
||||||
|
const int trip = jnum * pack_width;
|
||||||
|
for (int jj = 0; jj < trip; jj+=pack_width) {
|
||||||
const int j = jlist[jj];
|
const int j = jlist[jj];
|
||||||
if (need_ic && j < 0) {
|
if (need_ic && j < 0) {
|
||||||
which = 0;
|
which = 0;
|
||||||
@ -2058,12 +2167,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
|||||||
if (j >= nlocal) {
|
if (j >= nlocal) {
|
||||||
if (j == e_nall)
|
if (j == e_nall)
|
||||||
jlist[jj] = nall_offset;
|
jlist[jj] = nall_offset;
|
||||||
else if (which > 0)
|
else if (which)
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||||
else jlist[jj]-=ghost_offset;
|
else jlist[jj]-=ghost_offset;
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
if (which > 0) jlist[jj] = j ^ (which << SBBITS);
|
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2083,7 +2192,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // end omp
|
} // end omp
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
#endif
|
#endif
|
||||||
} // end offload
|
} // end offload
|
||||||
@ -2113,3 +2222,4 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -12,9 +12,17 @@
|
|||||||
Contributing author: W. Michael Brown (Intel)
|
Contributing author: W. Michael Brown (Intel)
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#include <math.h>
|
|
||||||
#include "pair_gayberne_intel.h"
|
#include "pair_gayberne_intel.h"
|
||||||
#include "math_extra_intel.h"
|
#include "math_extra_intel.h"
|
||||||
|
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
#pragma offload_attribute(push,target(mic))
|
||||||
|
#endif
|
||||||
|
#include <cmath>
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
#pragma offload_attribute(pop)
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "atom.h"
|
#include "atom.h"
|
||||||
#include "comm.h"
|
#include "comm.h"
|
||||||
#include "atom_vec_ellipsoid.h"
|
#include "atom_vec_ellipsoid.h"
|
||||||
@ -295,7 +303,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
signal(f_start)
|
signal(f_start)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute=MIC_Wtime();
|
*timer_compute=MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -335,8 +343,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EVFLAG) {
|
||||||
oevdwl = (acc_t)0;
|
oevdwl = (acc_t)0.0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// loop over neighbors of my atoms
|
// loop over neighbors of my atoms
|
||||||
@ -394,8 +402,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
|
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EVFLAG) {
|
||||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool multiple_forms = false;
|
bool multiple_forms = false;
|
||||||
@ -485,14 +493,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
tempv_1 = kappa_1 * inv_r;
|
tempv_1 = kappa_1 * inv_r;
|
||||||
tempv_2 = kappa_2 * inv_r;
|
tempv_2 = kappa_2 * inv_r;
|
||||||
flt_t sigma12 = ME_dot3(r12hat, tempv);
|
flt_t sigma12 = ME_dot3(r12hat, tempv);
|
||||||
sigma12 = pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
|
sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
|
||||||
flt_t h12 = r - sigma12;
|
flt_t h12 = r - sigma12;
|
||||||
|
|
||||||
// energy
|
// energy
|
||||||
// compute u_r
|
// compute u_r
|
||||||
|
|
||||||
flt_t varrho = sigma / (h12 + gamma * sigma);
|
flt_t varrho = sigma / (h12 + gamma * sigma);
|
||||||
flt_t varrho6 = pow(varrho, (flt_t)6.0);
|
flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
|
||||||
flt_t varrho12 = varrho6 * varrho6;
|
flt_t varrho12 = varrho6 * varrho6;
|
||||||
flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
|
flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
|
||||||
|
|
||||||
@ -500,7 +508,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
|
flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
|
||||||
flt_t det_g12 = ME_det3(g12);
|
flt_t det_g12 = ME_det3(g12);
|
||||||
eta = pow(eta / det_g12, upsilon);
|
eta = std::pow(eta / det_g12, upsilon);
|
||||||
|
|
||||||
// compute chi_12
|
// compute chi_12
|
||||||
|
|
||||||
@ -516,7 +524,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
tempv_1 = iota_1 * inv_r;
|
tempv_1 = iota_1 * inv_r;
|
||||||
tempv_2 = iota_2 * inv_r;
|
tempv_2 = iota_2 * inv_r;
|
||||||
flt_t chi = ME_dot3(r12hat, tempv);
|
flt_t chi = ME_dot3(r12hat, tempv);
|
||||||
chi = pow(chi * (flt_t)2.0, mu);
|
chi = std::pow(chi * (flt_t)2.0, mu);
|
||||||
|
|
||||||
// force
|
// force
|
||||||
// compute dUr/dr
|
// compute dUr/dr
|
||||||
@ -524,7 +532,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
|
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
|
||||||
sigma;
|
sigma;
|
||||||
temp1 = temp1 * (flt_t)24.0 * epsilon;
|
temp1 = temp1 * (flt_t)24.0 * epsilon;
|
||||||
flt_t u_slj = temp1 * pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
|
flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
|
||||||
flt_t dUr_0, dUr_1, dUr_2;
|
flt_t dUr_0, dUr_1, dUr_2;
|
||||||
temp2 = ME_dot3(kappa, r12hat);
|
temp2 = ME_dot3(kappa, r12hat);
|
||||||
flt_t uslj_rsq = u_slj / rsq_form[jj];
|
flt_t uslj_rsq = u_slj / rsq_form[jj];
|
||||||
@ -536,8 +544,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
flt_t dchi_0, dchi_1, dchi_2;
|
flt_t dchi_0, dchi_1, dchi_2;
|
||||||
temp1 = ME_dot3(iota, r12hat);
|
temp1 = ME_dot3(iota, r12hat);
|
||||||
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
|
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
|
||||||
pow(chi, (mu - (flt_t)1.0) / mu);
|
std::pow(chi, (mu - (flt_t)1.0) / mu);
|
||||||
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
|
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
|
||||||
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
|
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
|
||||||
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
|
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
|
||||||
@ -714,7 +722,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EVFLAG) {
|
||||||
flt_t ev_pre = (flt_t)0;
|
flt_t ev_pre = (flt_t)0.0;
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
if (NEWTON_PAIR || i < nlocal)
|
||||||
ev_pre += (flt_t)0.5;
|
ev_pre += (flt_t)0.5;
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
if (NEWTON_PAIR || j < nlocal)
|
||||||
@ -863,7 +871,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
#endif
|
#endif
|
||||||
} // offload
|
} // offload
|
||||||
|
|||||||
@ -217,7 +217,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
ITABLE_IN signal(f_start)
|
ITABLE_IN signal(f_start)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -459,7 +459,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
ev_global[7] = ov5;
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
#endif
|
#endif
|
||||||
} // end of offload region
|
} // end of offload region
|
||||||
|
|||||||
@ -212,7 +212,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
ITABLE_IN signal(f_start)
|
ITABLE_IN signal(f_start)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -263,7 +263,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__INTEL_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
@ -283,7 +283,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||||
|
|
||||||
#ifdef __MIC__
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < c_forcei[jtype].cutsq) {
|
if (rsq < c_forcei[jtype].cutsq) {
|
||||||
#endif
|
#endif
|
||||||
#ifdef INTEL_ALLOW_TABLE
|
#ifdef INTEL_ALLOW_TABLE
|
||||||
@ -335,11 +335,11 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#ifdef __MIC__
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __MIC__
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < c_forcei[jtype].cut_ljsq) {
|
if (rsq < c_forcei[jtype].cut_ljsq) {
|
||||||
#endif
|
#endif
|
||||||
flt_t r6inv = r2inv * r2inv * r2inv;
|
flt_t r6inv = r2inv * r2inv * r2inv;
|
||||||
@ -354,7 +354,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
forcelj *= factor_lj;
|
forcelj *= factor_lj;
|
||||||
if (EFLAG) evdwl *= factor_lj;
|
if (EFLAG) evdwl *= factor_lj;
|
||||||
}
|
}
|
||||||
#ifdef __MIC__
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (rsq > c_forcei[jtype].cutsq)
|
if (rsq > c_forcei[jtype].cutsq)
|
||||||
@ -363,7 +363,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
{ forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
{ forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __MIC__
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < c_forcei[jtype].cutsq) {
|
if (rsq < c_forcei[jtype].cutsq) {
|
||||||
#endif
|
#endif
|
||||||
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
||||||
@ -395,7 +395,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
||||||
}
|
}
|
||||||
#ifdef __MIC__
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // for jj
|
} // for jj
|
||||||
@ -426,7 +426,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
ev_global[7] = ov5;
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
#endif
|
#endif
|
||||||
} // end of offload region
|
} // end of offload region
|
||||||
|
|||||||
@ -88,39 +88,73 @@ void PairLJCutIntel::compute(int eflag, int vflag,
|
|||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (evflag || vflag_fdotr) {
|
if (_onetype) {
|
||||||
int ovflag = 0;
|
if (evflag || vflag_fdotr) {
|
||||||
if (vflag_fdotr) ovflag = 2;
|
int ovflag = 0;
|
||||||
else if (vflag) ovflag = 1;
|
if (vflag_fdotr) ovflag = 2;
|
||||||
if (eflag) {
|
else if (vflag) ovflag = 1;
|
||||||
if (force->newton_pair) {
|
if (eflag) {
|
||||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
if (force->newton_pair) {
|
||||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
|
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
|
} else {
|
||||||
|
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
|
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
if (force->newton_pair) {
|
||||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
|
eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
|
} else {
|
||||||
|
eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
|
eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (evflag || vflag_fdotr) {
|
||||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
int ovflag = 0;
|
||||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
if (vflag_fdotr) ovflag = 2;
|
||||||
|
else if (vflag) ovflag = 1;
|
||||||
|
if (eflag) {
|
||||||
|
if (force->newton_pair) {
|
||||||
|
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
|
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
|
} else {
|
||||||
|
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
|
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (force->newton_pair) {
|
||||||
|
eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
|
eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
|
} else {
|
||||||
|
eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
|
eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
if (force->newton_pair) {
|
||||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||||
|
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||||
|
} else {
|
||||||
|
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||||
|
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||||
|
class acc_t>
|
||||||
void PairLJCutIntel::eval(const int offload, const int vflag,
|
void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
@ -159,7 +193,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
const int nthreads = tc;
|
const int nthreads = tc;
|
||||||
int *overflow = fix->get_off_overflow_flag();
|
int *overflow = fix->get_off_overflow_flag();
|
||||||
{
|
{
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -187,12 +221,25 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
||||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
|
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
|
||||||
|
if (ONETYPE) {
|
||||||
|
cutsq = ljc12o[3].cutsq;
|
||||||
|
lj1 = ljc12o[3].lj1;
|
||||||
|
lj2 = ljc12o[3].lj2;
|
||||||
|
lj3 = lj34[3].lj3;
|
||||||
|
lj4 = lj34[3].lj4;
|
||||||
|
offset = ljc12o[3].offset;
|
||||||
|
}
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; ++i) {
|
||||||
const int itype = x[i].w;
|
int itype, ptr_off;
|
||||||
|
const FC_PACKED1_T * _noalias ljc12oi;
|
||||||
const int ptr_off = itype * ntypes;
|
const FC_PACKED2_T * _noalias lj34i;
|
||||||
const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off;
|
if (!ONETYPE) {
|
||||||
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
|
itype = x[i].w;
|
||||||
|
ptr_off = itype * ntypes;
|
||||||
|
ljc12oi = ljc12o + ptr_off;
|
||||||
|
lj34i = lj34 + ptr_off;
|
||||||
|
}
|
||||||
|
|
||||||
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
@ -218,25 +265,42 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
flt_t forcelj, evdwl;
|
flt_t forcelj, evdwl;
|
||||||
forcelj = evdwl = (flt_t)0.0;
|
forcelj = evdwl = (flt_t)0.0;
|
||||||
|
|
||||||
const int sbindex = jlist[jj] >> SBBITS & 3;
|
int j, jtype, sbindex;
|
||||||
const int j = jlist[jj] & NEIGHMASK;
|
if (!ONETYPE) {
|
||||||
|
sbindex = jlist[jj] >> SBBITS & 3;
|
||||||
|
j = jlist[jj] & NEIGHMASK;
|
||||||
|
} else
|
||||||
|
j = jlist[jj];
|
||||||
|
|
||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t delx = xtmp - x[j].x;
|
||||||
const flt_t dely = ytmp - x[j].y;
|
const flt_t dely = ytmp - x[j].y;
|
||||||
const flt_t delz = ztmp - x[j].z;
|
const flt_t delz = ztmp - x[j].z;
|
||||||
const int jtype = x[j].w;
|
if (!ONETYPE) {
|
||||||
|
jtype = x[j].w;
|
||||||
|
cutsq = ljc12oi[jtype].cutsq;
|
||||||
|
}
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < ljc12oi[jtype].cutsq) {
|
if (rsq < cutsq) {
|
||||||
#endif
|
#endif
|
||||||
flt_t factor_lj = special_lj[sbindex];
|
flt_t factor_lj;
|
||||||
|
if (!ONETYPE) factor_lj = special_lj[sbindex];
|
||||||
flt_t r2inv = 1.0 / rsq;
|
flt_t r2inv = 1.0 / rsq;
|
||||||
flt_t r6inv = r2inv * r2inv * r2inv;
|
flt_t r6inv = r2inv * r2inv * r2inv;
|
||||||
#ifndef INTEL_VMASK
|
#ifndef INTEL_VMASK
|
||||||
if (rsq > ljc12oi[jtype].cutsq) r6inv = (flt_t)0.0;
|
if (rsq > cutsq) r6inv = (flt_t)0.0;
|
||||||
#endif
|
#endif
|
||||||
forcelj = r6inv * (ljc12oi[jtype].lj1 * r6inv - ljc12oi[jtype].lj2);
|
if (!ONETYPE) {
|
||||||
flt_t fpair = factor_lj * forcelj * r2inv;
|
lj1 = ljc12oi[jtype].lj1;
|
||||||
|
lj2 = ljc12oi[jtype].lj2;
|
||||||
|
}
|
||||||
|
forcelj = r6inv * (lj1 * r6inv - lj2);
|
||||||
|
flt_t fpair;
|
||||||
|
if (!ONETYPE)
|
||||||
|
fpair = factor_lj * forcelj * r2inv;
|
||||||
|
else
|
||||||
|
fpair = forcelj * r2inv;
|
||||||
|
|
||||||
fxtmp += delx * fpair;
|
fxtmp += delx * fpair;
|
||||||
fytmp += dely * fpair;
|
fytmp += dely * fpair;
|
||||||
@ -255,9 +319,13 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
ev_pre += (flt_t)0.5;
|
ev_pre += (flt_t)0.5;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
evdwl = r6inv * (lj34i[jtype].lj3 * r6inv-lj34i[jtype].lj4) -
|
if (!ONETYPE) {
|
||||||
ljc12oi[jtype].offset;
|
lj3 = lj34i[jtype].lj3;
|
||||||
evdwl *= factor_lj;
|
lj4 = lj34i[jtype].lj4;
|
||||||
|
offset = ljc12oi[jtype].offset;
|
||||||
|
}
|
||||||
|
evdwl = r6inv * (lj3 * r6inv - lj4) - offset;
|
||||||
|
if (!ONETYPE) evdwl *= factor_lj;
|
||||||
sevdwl += ev_pre*evdwl;
|
sevdwl += ev_pre*evdwl;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
if (NEWTON_PAIR || i < nlocal)
|
||||||
@ -302,7 +370,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
ev_global[7] = ov5;
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef __MIC__
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
#endif
|
#endif
|
||||||
} // end offload
|
} // end offload
|
||||||
@ -352,6 +420,9 @@ template <class flt_t, class acc_t>
|
|||||||
void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
|
void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers)
|
IntelBuffers<flt_t,acc_t> *buffers)
|
||||||
{
|
{
|
||||||
|
_onetype = 0;
|
||||||
|
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
|
||||||
|
|
||||||
int tp1 = atom->ntypes + 1;
|
int tp1 = atom->ntypes + 1;
|
||||||
fc.set_ntypes(tp1,memory,_cop);
|
fc.set_ntypes(tp1,memory,_cop);
|
||||||
buffers->set_ntypes(tp1);
|
buffers->set_ntypes(tp1);
|
||||||
|
|||||||
@ -39,13 +39,14 @@ class PairLJCutIntel : public PairLJCut {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
FixIntel *fix;
|
FixIntel *fix;
|
||||||
int _cop;
|
int _cop, _onetype;
|
||||||
|
|
||||||
template <class flt_t> class ForceConst;
|
template <class flt_t> class ForceConst;
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||||
|
class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers,
|
IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -11,6 +11,10 @@
|
|||||||
See the README file in the top-level LAMMPS directory.
|
See the README file in the top-level LAMMPS directory.
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------------
|
||||||
|
Contributing author: W. Michael Brown (Intel)
|
||||||
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#ifdef PAIR_CLASS
|
#ifdef PAIR_CLASS
|
||||||
|
|
||||||
PairStyle(sw/intel,PairSWIntel)
|
PairStyle(sw/intel,PairSWIntel)
|
||||||
@ -42,7 +46,7 @@ class PairSWIntel : public PairSW {
|
|||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int SPQ, int EVFLAG, int EFLAG, class flt_t, class acc_t>
|
template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
|
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend, const int pad_width);
|
const int astart, const int aend, const int pad_width);
|
||||||
@ -51,7 +55,10 @@ class PairSWIntel : public PairSW {
|
|||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
|
|
||||||
int _ccache_stride, _host_pad, _offload_pad, _spq;
|
int _ccache_stride, _host_pad, _offload_pad, _spq, _onetype;
|
||||||
|
#ifdef LMP_USE_AVXCD
|
||||||
|
int _ccache_stride3;
|
||||||
|
#endif
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
|
|
||||||
@ -62,8 +69,11 @@ class PairSWIntel : public PairSW {
|
|||||||
flt_t cutsq, cut, sigma_gamma, pad;
|
flt_t cutsq, cut, sigma_gamma, pad;
|
||||||
} fc_packed0;
|
} fc_packed0;
|
||||||
typedef struct {
|
typedef struct {
|
||||||
flt_t powerp, powerq, cut, sigma, c1, c2, c3, c4;
|
flt_t powerp, powerq, cut, sigma;
|
||||||
} fc_packed1;
|
} fc_packed1;
|
||||||
|
typedef struct {
|
||||||
|
flt_t c1, c2, c3, c4;
|
||||||
|
} fc_packed1p2;
|
||||||
typedef struct {
|
typedef struct {
|
||||||
flt_t c5, c6;
|
flt_t c5, c6;
|
||||||
} fc_packed2;
|
} fc_packed2;
|
||||||
@ -73,6 +83,7 @@ class PairSWIntel : public PairSW {
|
|||||||
|
|
||||||
fc_packed0 **p2;
|
fc_packed0 **p2;
|
||||||
fc_packed1 **p2f;
|
fc_packed1 **p2f;
|
||||||
|
fc_packed1p2 **p2f2;
|
||||||
fc_packed2 **p2e;
|
fc_packed2 **p2e;
|
||||||
fc_packed3 ***p3;
|
fc_packed3 ***p3;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user