Adding full hybrid support to the USER-INTEL package and fixing a bug with EAM parameter initialization.

Michael Brown
2018-10-11 05:12:00 -07:00
parent 7faa48b4c3
commit c37deebffa
42 changed files with 1442 additions and 566 deletions

View File

@ -499,7 +499,7 @@ MPI task.
When offloading to a coprocessor, "hybrid"_pair_hybrid.html styles
that require skip lists for neighbor builds cannot be offloaded.
Using "hybrid/overlay"_pair_hybrid.html is allowed. Only one intel
accelerated style may be used with hybrid styles.
accelerated style may be used with hybrid styles when offloading.
"Special_bonds"_special_bonds.html exclusion lists are not currently
supported with offload; however, the same effect can often be
accomplished by setting cutoffs for excluded atom types to 0. None of

View File

@ -314,7 +314,7 @@ void AngleHarmonicIntel::init_style()
template <class flt_t, class acc_t>
void AngleHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> * /*buffers*/)
IntelBuffers<flt_t,acc_t> * /*buffers*/)
{
const int bp1 = atom->nangletypes + 1;
fc.set_ntypes(bp1,memory);

View File

@ -291,7 +291,7 @@ void BondFENEIntel::init_style()
template <class flt_t, class acc_t>
void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> * /*buffers*/)
IntelBuffers<flt_t,acc_t> * /*buffers*/)
{
const int bp1 = atom->nbondtypes + 1;
fc.set_ntypes(bp1,memory);

View File

@ -416,7 +416,7 @@ void DihedralOPLSIntel::init_style()
template <class flt_t, class acc_t>
void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> * /*buffers*/)
IntelBuffers<flt_t,acc_t> * /*buffers*/)
{
const int bp1 = atom->ndihedraltypes + 1;
fc.set_ntypes(bp1,memory);

View File

@ -65,6 +65,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
_nbor_pack_width = 1;
_three_body_neighbor = 0;
_hybrid_nonpair = 0;
_precision_mode = PREC_MODE_MIXED;
_offload_balance = -1.0;
@ -266,8 +267,7 @@ FixIntel::~FixIntel()
double *time1 = off_watch_pair();
double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag();
if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) {
if (_offload_balance != 0.0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
}
@ -314,34 +314,63 @@ void FixIntel::init()
int nstyles = 0;
if (force->pair_match("hybrid", 1) != NULL) {
_pair_hybrid_flag = 1;
PairHybrid *hybrid = (PairHybrid *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
nstyles++;
if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute)
error->all(FLERR,
"Intel package requires fdotr virial with newton on.");
} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
_pair_hybrid_flag = 1;
PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
nstyles++;
else
force->pair->no_virial_fdotr_compute = 1;
if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute)
error->all(FLERR,
"Intel package requires fdotr virial with newton on.");
} else
_pair_hybrid_flag = 0;
if (nstyles > 1 && _pair_hybrid_flag) _pair_hybrid_flag = 2;
else if (force->newton_pair == 0) _pair_hybrid_flag = 0;
_pair_hybrid_zero = 0;
_zero_master = 0;
if (_pair_hybrid_flag && _hybrid_nonpair)
if (_pair_hybrid_flag > 1 || force->newton_pair == 0)
_pair_hybrid_zero = 1;
_hybrid_nonpair = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (offload_balance() != 0.0) {
_pair_hybrid_zero = 0;
if (force->newton_pair == 0) _pair_hybrid_flag = 0;
if (nstyles > 1)
error->all(FLERR,
"Currently, cannot offload more than one intel style with hybrid.");
}
if (nstyles > 1)
error->all(FLERR,
"Currently, cannot use more than one intel style with hybrid.");
#endif
check_neighbor_intel();
int off_mode = 0;
if (_offload_balance != 0.0) off_mode = 1;
if (_precision_mode == PREC_MODE_SINGLE) {
_single_buffers->zero_ev();
_single_buffers->grow_ncache(off_mode,_nthreads);
_single_buffers->free_list_ptrs();
} else if (_precision_mode == PREC_MODE_MIXED) {
_mixed_buffers->zero_ev();
_mixed_buffers->grow_ncache(off_mode,_nthreads);
_mixed_buffers->free_list_ptrs();
} else {
_double_buffers->zero_ev();
_double_buffers->grow_ncache(off_mode,_nthreads);
_double_buffers->free_list_ptrs();
}
_need_reduce = 0;
@ -349,7 +378,7 @@ void FixIntel::init()
/* ---------------------------------------------------------------------- */
void FixIntel::setup(int /*vflag*/)
void FixIntel::setup(int vflag)
{
if (neighbor->style != Neighbor::BIN)
error->all(FLERR,
@ -395,8 +424,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
double *time1 = off_watch_pair();
double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag();
if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) {
if (_offload_balance !=0.0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
in(overflow:length(5) alloc_if(1) free_if(0))
@ -419,6 +447,21 @@ void FixIntel::pair_init_check(const bool cdmessage)
#endif
}
#ifndef LMP_INTEL_NBOR_COMPAT
if (force->pair->manybody_flag && atom->molecular) {
int flag = 0;
if (atom->nbonds > 0 && force->special_lj[1] == 0.0 &&
force->special_coul[1] == 0.0) flag = 1;
if (atom->nangles > 0 && force->special_lj[2] == 0.0 &&
force->special_coul[2] == 0.0) flag = 1;
if (atom->ndihedrals > 0 && force->special_lj[3] == 0.0 &&
force->special_coul[3] == 0.0) flag = 1;
if (flag)
error->all(FLERR,"Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond"
"exclusions with Intel");
}
#endif
int need_tag = 0;
if (atom->molecular) need_tag = 1;
@ -477,11 +520,13 @@ void FixIntel::bond_init_check()
if (force->pair_match("/intel", 0) != NULL)
intel_pair = 1;
else if (force->pair_match("hybrid", 1) != NULL) {
_hybrid_nonpair = 1;
PairHybrid *hybrid = (PairHybrid *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
intel_pair = 1;
} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
_hybrid_nonpair = 1;
PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
@ -501,11 +546,13 @@ void FixIntel::kspace_init_check()
if (force->pair_match("/intel", 0) != NULL)
intel_pair = 1;
else if (force->pair_match("hybrid", 1) != NULL) {
_hybrid_nonpair = 1;
PairHybrid *hybrid = (PairHybrid *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
intel_pair = 1;
} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
_hybrid_nonpair = 1;
PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
@ -522,51 +569,60 @@ void FixIntel::check_neighbor_intel()
{
#ifdef _LMP_INTEL_OFFLOAD
_full_host_list = 0;
#endif
const int nrequest = neighbor->nrequest;
const int nrequest = neighbor->nrequest;
for (int i = 0; i < nrequest; ++i) {
#ifdef _LMP_INTEL_OFFLOAD
if (_offload_balance != 0.0 && neighbor->requests[i]->intel == 0) {
_full_host_list = 1;
_offload_noghost = 0;
}
#endif
if (neighbor->requests[i]->skip && _offload_balance != 0.0)
error->all(FLERR, "Cannot yet use hybrid styles with Intel offload.");
// avoid flagging a neighbor list as both USER-INTEL and USER-OMP
if (neighbor->requests[i]->intel)
neighbor->requests[i]->omp = 0;
if (neighbor->requests[i]->skip)
error->all(FLERR, "Hybrid styles with Intel package are unsupported.");
}
#else
// avoid flagging a neighbor list as both USER-INTEL and USER-OMP
const int nrequest = neighbor->nrequest;
for (int i = 0; i < nrequest; ++i)
if (neighbor->requests[i]->intel)
neighbor->requests[i]->omp = 0;
#endif
}
/* ---------------------------------------------------------------------- */
void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
void FixIntel::_sync_main_arrays(const int prereverse)
{
if (!prereverse) _zero_master = 1;
int done_this_step = prereverse;
if (_pair_hybrid_zero == 0) done_this_step = 1;
if (_force_array_m != 0) {
if (_need_reduce) {
reduce_results(&_force_array_m[0].x);
_need_reduce = 0;
}
add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0);
_force_array_m = 0;
if (done_this_step) _force_array_m = 0;
else _ev_array_d = 0;
} else if (_force_array_d != 0) {
if (_need_reduce) {
reduce_results(&_force_array_d[0].x);
_need_reduce = 0;
}
add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0);
_force_array_d = 0;
if (done_this_step) _force_array_d = 0;
else _ev_array_d = 0;
} else if (_force_array_s != 0) {
if (_need_reduce) {
reduce_results(&_force_array_s[0].x);
_need_reduce = 0;
}
add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0);
_force_array_s = 0;
if (done_this_step) _force_array_s = 0;
else _ev_array_s = 0;
}
#ifdef _LMP_INTEL_OFFLOAD
@ -576,6 +632,13 @@ void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
/* ---------------------------------------------------------------------- */
void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
{
_sync_main_arrays(1);
}
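For reference, a minimal standalone sketch (not LAMMPS code; the SyncState struct and its flags are illustrative stand-ins for the FixIntel members used above) of the deferred-sync bookkeeping: while hybrid sub-styles are still accumulating (prereverse == 0 and _pair_hybrid_zero set), the force array is kept and only the energy/virial pointer is dropped; the call from pre_reverse() finishes the step and releases the force array as well.

#include <cstdio>

// Illustrative stand-ins for FixIntel::_pair_hybrid_zero, _force_array_* and
// _ev_array_* (a sketch, not the actual class).
struct SyncState {
  bool hybrid_zero;   // ~ _pair_hybrid_zero
  bool have_forces;   // ~ _force_array_m != 0
  bool have_ev;       // ~ _ev_array_d != 0
};

void sync_main_arrays(SyncState &s, int prereverse) {
  int done_this_step = prereverse;
  if (!s.hybrid_zero) done_this_step = 1;
  if (s.have_forces) {
    // reduce_results(...) and add_results(...) would run here
    if (done_this_step) s.have_forces = false;  // step finished: release buffers
    else s.have_ev = false;                     // more sub-styles to come: keep forces
  }
}

int main() {
  SyncState s{true, true, true};
  sync_main_arrays(s, 0);  // from add_result_array() after one hybrid sub-style
  printf("after sub-style:   forces=%d ev=%d\n", s.have_forces, s.have_ev);
  sync_main_arrays(s, 1);  // from pre_reverse() at the end of the step
  printf("after pre_reverse: forces=%d ev=%d\n", s.have_forces, s.have_ev);
  return 0;
}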
/* ---------------------------------------------------------------------- */
template <class acc_t>
void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
{
@ -657,7 +720,7 @@ template <class ft, class acc_t>
void FixIntel::add_results(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global,
const int eatom, const int vatom,
const int /*offload*/) {
const int offload) {
start_watch(TIME_PACK);
int f_length;
#ifdef _LMP_INTEL_OFFLOAD

View File

@ -74,11 +74,12 @@ class FixIntel : public Fix {
inline int nbor_pack_width() const { return _nbor_pack_width; }
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
inline int three_body_neighbor() { return _three_body_neighbor; }
inline void three_body_neighbor(const int /*i*/) { _three_body_neighbor = 1; }
inline void three_body_neighbor(const int i) { _three_body_neighbor = i; }
inline int need_zero(const int tid) {
if (_need_reduce == 0 && tid > 0) return 1;
return 0;
else if (_zero_master && tid == 0) { _zero_master = 0; return 1; }
else return 0;
}
inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; }
inline int lrt() {
@ -100,7 +101,10 @@ class FixIntel : public Fix {
IntelBuffers<double,double> *_double_buffers;
int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor;
int _pair_hybrid_flag;
// These should be removed in subsequent update w/ simpler hybrid arch
int _pair_hybrid_zero, _hybrid_nonpair, _zero_master;
public:
inline int* get_overflow_flag() { return _overflow_flag; }
inline int* get_off_overflow_flag() { return _off_overflow_flag; }
@ -210,6 +214,8 @@ class FixIntel : public Fix {
_alignvar(double _stopwatch_offload_neighbor[1],64);
_alignvar(double _stopwatch_offload_pair[1],64);
void _sync_main_arrays(const int prereverse);
template <class ft>
void reduce_results(ft * _noalias const f_in);
@ -238,7 +244,7 @@ class FixIntel : public Fix {
/* ---------------------------------------------------------------------- */
void FixIntel::get_buffern(const int /*offload*/, int &nlocal, int &nall,
void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
int &minlocal) {
#ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers) {
@ -273,7 +279,7 @@ void FixIntel::get_buffern(const int /*offload*/, int &nlocal, int &nall,
/* ---------------------------------------------------------------------- */
void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
double *ev_in, const int /*offload*/,
double *ev_in, const int offload,
const int eatom, const int vatom,
const int rflag) {
#ifdef _LMP_INTEL_OFFLOAD
@ -282,6 +288,8 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
_off_results_vatom = vatom;
_off_force_array_d = f_in;
_off_ev_array_d = ev_in;
if (_pair_hybrid_flag && force->pair->fdotr_is_set())
_sync_main_arrays(1);
return;
}
#endif
@ -296,12 +304,15 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
if (_overflow_flag[LMP_OVERFLOW])
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
if (_pair_hybrid_flag > 1 ||
(_pair_hybrid_flag && force->pair->fdotr_is_set())) _sync_main_arrays(0);
}
/* ---------------------------------------------------------------------- */
void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
double *ev_in, const int /*offload*/,
double *ev_in, const int offload,
const int eatom, const int vatom,
const int rflag) {
#ifdef _LMP_INTEL_OFFLOAD
@ -310,6 +321,8 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
_off_results_vatom = vatom;
_off_force_array_m = f_in;
_off_ev_array_d = ev_in;
if (_pair_hybrid_flag && force->pair->fdotr_is_set())
_sync_main_arrays(1);
return;
}
#endif
@ -324,12 +337,16 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
if (_overflow_flag[LMP_OVERFLOW])
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
if (_pair_hybrid_flag > 1 ||
(_pair_hybrid_flag && force->pair->fdotr_is_set()))
_sync_main_arrays(0);
}
/* ---------------------------------------------------------------------- */
void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
float *ev_in, const int /*offload*/,
float *ev_in, const int offload,
const int eatom, const int vatom,
const int rflag) {
#ifdef _LMP_INTEL_OFFLOAD
@ -338,6 +355,8 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
_off_results_vatom = vatom;
_off_force_array_s = f_in;
_off_ev_array_s = ev_in;
if (_pair_hybrid_flag && force->pair->fdotr_is_set())
_sync_main_arrays(1);
return;
}
#endif
@ -352,6 +371,10 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
if (_overflow_flag[LMP_OVERFLOW])
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
if (_pair_hybrid_flag > 1 ||
(_pair_hybrid_flag && force->pair->fdotr_is_set()))
_sync_main_arrays(0);
}
/* ---------------------------------------------------------------------- */
@ -487,16 +510,16 @@ The compiler version used to build LAMMPS is not supported when using
offload to a coprocessor. There could be performance or correctness
issues. Please use 14.0.1.106 or 15.1.133 or later.
E: Currently, cannot use more than one intel style with hybrid.
E: Currently, cannot offload more than one intel style with hybrid.
Currently, hybrid pair styles can only use the intel suffix for one of the
pair styles.
Currently, when using offload, hybrid pair styles can only use the intel
suffix for one of the pair styles.
E: Cannot yet use hybrid styles with Intel package.
E: Cannot yet use hybrid styles with Intel offload.
The hybrid pair style configuration is not yet supported by the Intel
package. Support is limited to hybrid/overlay or a hybrid style that does
not require a skip list.
The hybrid pair style configuration is not yet supported when using offload
within the Intel package. Support is limited to hybrid/overlay or a hybrid
style that does not require a skip list.
W: Leaving a core/node free can improve performance for offload
@ -538,4 +561,16 @@ E: Too few atoms for load balancing offload.
When using offload to a coprocessor, each MPI task must have at least 2
atoms throughout the simulation.
E: Intel package requires fdotr virial with newton on.
This error can occur with a hybrid pair style that mixes styles that are
incompatible with the newton pair setting turned on. Try turning the
newton pair setting off.
E: Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond exclusions with Intel
When using a manybody pair style, bonds/angles/dihedrals, and special_bond
exclusions, LAMMPS should be built with the above compile flag for compatible
results.
*/

View File

@ -428,7 +428,7 @@ void ImproperCvffIntel::init_style()
template <class flt_t, class acc_t>
void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> * /*buffers*/)
IntelBuffers<flt_t,acc_t> * /*buffers*/)
{
const int bp1 = atom->nimpropertypes + 1;
fc.set_ntypes(bp1,memory);

View File

@ -24,7 +24,9 @@ using namespace LAMMPS_NS;
template <class flt_t, class acc_t>
IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
_buf_size(0), _buf_local_size(0) {
_buf_size(0), _buf_local_size(0), _n_list_ptrs(1), _max_list_ptrs(4) {
_neigh_list_ptrs = new IntelNeighListPtrs[_max_list_ptrs];
_neigh_list_ptrs[0].cnumneigh = 0;
_list_alloc_atoms = 0;
_ntypes = 0;
_off_map_listlocal = 0;
@ -55,6 +57,7 @@ IntelBuffers<flt_t, acc_t>::~IntelBuffers()
free_all_nbor_buffers();
free_ccache();
set_ntypes(0);
delete []_neigh_list_ptrs;
}
/* ---------------------------------------------------------------------- */
@ -109,7 +112,7 @@ void IntelBuffers<flt_t, acc_t>::free_buffers()
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
const int nthreads,
const int /*offload_end*/)
const int offload_end)
{
free_buffers();
_buf_size = static_cast<double>(nall) * 1.1 + 1;
@ -186,11 +189,9 @@ void IntelBuffers<flt_t, acc_t>::free_nmax()
const int * tag = _off_map_tag;
const int * special = _off_map_special;
const int * nspecial = _off_map_nspecial;
if (tag != 0 && special != 0 && nspecial !=0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(tag:alloc_if(0) free_if(1)) \
nocopy(special,nspecial:alloc_if(0) free_if(1))
}
#pragma offload_transfer target(mic:_cop) \
nocopy(tag:alloc_if(0) free_if(1)) \
nocopy(special,nspecial:alloc_if(0) free_if(1))
_off_map_nmax = 0;
_host_nmax = 0;
}
@ -200,7 +201,7 @@ void IntelBuffers<flt_t, acc_t>::free_nmax()
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow_nmax(const int /*offload_end*/)
void IntelBuffers<flt_t, acc_t>::_grow_nmax(const int offload_end)
{
#ifdef _LMP_INTEL_OFFLOAD
free_nmax();
@ -243,46 +244,117 @@ template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::free_list_local()
{
if (_off_map_listlocal > 0) {
int * cnumneigh = _cnumneigh;
if (_neigh_list_ptrs[0].cnumneigh) {
int * cnumneigh = _neigh_list_ptrs[0].cnumneigh;
_neigh_list_ptrs[0].cnumneigh = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (_off_map_ilist != NULL) {
#pragma offload_transfer target(mic:_cop) \
nocopy(cnumneigh:alloc_if(0) free_if(1))
}
#endif
lmp->memory->destroy(cnumneigh);
}
#ifdef _LMP_INTEL_OFFLOAD
if (_off_map_ilist != NULL) {
const int * ilist = _off_map_ilist;
const int * numneigh = _off_map_numneigh;
const int ** firstneigh = (const int **)_off_map_firstneigh;
_off_map_ilist = NULL;
if (numneigh != 0 && ilist != 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(ilist,numneigh,cnumneigh:alloc_if(0) free_if(1))
}
#pragma offload_transfer target(mic:_cop) \
nocopy(ilist,firstneigh,numneigh:alloc_if(0) free_if(1))
}
#endif
lmp->memory->destroy(cnumneigh);
_off_map_listlocal = 0;
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::free_list_ptrs()
{
for (int list_num = 1; list_num < _n_list_ptrs; list_num++) {
if (_neigh_list_ptrs[list_num].size) {
lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh);
lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf);
}
_neigh_list_ptrs[list_num].size = 0;
_neigh_list_ptrs[list_num].list_ptr = 0;
}
_n_list_ptrs = 1;
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::grow_data3(NeighList *list,
int *&numneighhalf,
int *&cnumneigh)
{
const int size = list->get_maxlocal();
int list_num;
for (list_num = 0; list_num < _n_list_ptrs; list_num++)
if (_neigh_list_ptrs[list_num].list_ptr == (void*)list) break;
if (list_num == _n_list_ptrs) {
if (_n_list_ptrs == _max_list_ptrs) {
_max_list_ptrs *= 2;
IntelNeighListPtrs *new_list = new IntelNeighListPtrs[_max_list_ptrs];
for (int i = 0; i < _n_list_ptrs; i++) new_list[i] = _neigh_list_ptrs[i];
delete []_neigh_list_ptrs;
_neigh_list_ptrs = new_list;
}
_neigh_list_ptrs[list_num].list_ptr = (void *)list;
_neigh_list_ptrs[list_num].size = 0;
_n_list_ptrs++;
}
if (size > _neigh_list_ptrs[list_num].size) {
if (_neigh_list_ptrs[list_num].size) {
lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh);
lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf);
}
lmp->memory->create(_neigh_list_ptrs[list_num].cnumneigh, size,
"_cnumneigh");
lmp->memory->create(_neigh_list_ptrs[list_num].numneighhalf, size,
"_numneighhalf");
_neigh_list_ptrs[list_num].size = size;
}
numneighhalf = _neigh_list_ptrs[list_num].numneighhalf;
cnumneigh = _neigh_list_ptrs[list_num].cnumneigh;
}
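A compact standalone sketch of the registry pattern used by grow_data3() above (illustrative only; Slot, Registry and find_or_add are hypothetical stand-ins, not LAMMPS types): one descriptor per NeighList pointer, reused on repeated calls, with the descriptor array doubled in capacity when full.

#include <cstdio>
#include <cstring>

struct Slot { const void *key; int size; };

struct Registry {
  Slot *slots; int n, cap;
  Registry() : n(0), cap(4) { slots = new Slot[cap](); }
  ~Registry() { delete[] slots; }
  Slot &find_or_add(const void *key) {
    for (int i = 0; i < n; i++)
      if (slots[i].key == key) return slots[i];   // reuse an existing slot
    if (n == cap) {                               // double capacity and copy
      cap *= 2;
      Slot *grown = new Slot[cap]();
      memcpy(grown, slots, n * sizeof(Slot));
      delete[] slots;
      slots = grown;
    }
    slots[n].key = key;
    slots[n].size = 0;
    return slots[n++];
  }
};

int main() {
  Registry r;
  int a, b;
  r.find_or_add(&a).size = 10;   // first call registers a new slot
  r.find_or_add(&b).size = 20;
  printf("%d %d %d\n", r.find_or_add(&a).size, r.find_or_add(&b).size, r.n);  // 10 20 2
  return 0;
}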
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
const int /*offload_end*/)
const int three_body,
const int offload_end)
{
free_list_local();
int size = list->get_maxlocal();
lmp->memory->create(_cnumneigh, size, "_cnumneigh");
_off_map_listlocal = size;
if (three_body)
lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, size, "_cnumneigh");
#ifdef _LMP_INTEL_OFFLOAD
if (offload_end > 0) {
int tb_size = size;
if (three_body == 0) {
lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, 16, "_cnumneigh");
tb_size = 16;
}
int ** firstneigh = list->firstneigh;
int * numneigh = list->numneigh;
int * ilist = list->ilist;
int * cnumneigh = _cnumneigh;
if (cnumneigh != 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
}
int * cnumneigh = _neigh_list_ptrs[0].cnumneigh;
#pragma offload_transfer target(mic:_cop) \
nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
nocopy(firstneigh:length(size) alloc_if(1) free_if(0)) \
nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
nocopy(cnumneigh:length(tb_size) alloc_if(1) free_if(0))
_off_map_ilist = ilist;
_off_map_firstneigh = firstneigh;
_off_map_numneigh = numneigh;
}
#endif
@ -313,7 +385,7 @@ template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList * /*list*/,
const int nlocal,
const int nthreads,
const int /*offload_end*/,
const int offload_end,
const int pack_width)
{
free_nbor_list();
@ -382,7 +454,7 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int /*off_flag*/,
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
const int nthreads,
const int width)
{
@ -481,7 +553,7 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::grow_ncache(const int /*off_flag*/,
void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
const int nthreads)
{
const int nsize = get_max_nbors() * 3;
@ -576,12 +648,12 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes,
if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD
flt_t * cutneighsqo = _cutneighsq[0];
if (_off_threads > 0 && cutneighsqo != 0) {
if (_off_threads > 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(cutneighsqo:alloc_if(0) free_if(1))
}
flt_t * cutneighghostsqo;
if (_cutneighghostsq && _off_threads > 0 && cutneighghostsqo != 0) {
if (_cutneighghostsq && _off_threads > 0) {
cutneighghostsqo = _cutneighghostsq[0];
#pragma offload_transfer target(mic:_cop) \
nocopy(cutneighghostsqo:alloc_if(0) free_if(1))
@ -637,6 +709,8 @@ double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
tmem += _ntypes * _ntypes * sizeof(int);
tmem += _buf_local_size + (_n_list_ptrs - 1) * _buf_local_size * 2;
return tmem;
}

View File

@ -33,12 +33,20 @@ namespace LAMMPS_NS {
#define QUAT_T typename IntelBuffers<flt_t,acc_t>::quat_t
#define FORCE_T typename IntelBuffers<flt_t,acc_t>::vec3_acc_t
struct IntelNeighListPtrs {
void * list_ptr;
int *cnumneigh;
int *numneighhalf;
int size;
};
// May not need a separate force array for mixed/double
template <class flt_t, class acc_t>
class IntelBuffers {
public:
typedef struct { flt_t x,y,z; int w; } atom_t;
typedef struct { flt_t w,i,j,k; } quat_t;
typedef struct { flt_t x,y; } vec2_t;
typedef struct { flt_t x,y,z,w; } vec3_t;
typedef struct { flt_t x,y,z,w; } vec4_t;
typedef struct { acc_t x,y,z,w; } vec3_acc_t;
@ -62,8 +70,12 @@ class IntelBuffers {
void free_buffers();
void free_nmax();
inline void set_bininfo(int *atombin, int *binpacked)
{ _atombin = atombin; _binpacked = binpacked; }
inline void set_bininfo(int *atombin, int *binpacked) {
_atombin = atombin;
_binpacked = binpacked;
_neigh_list_ptrs[0].numneighhalf = atombin;
}
inline void grow(const int nall, const int nlocal, const int nthreads,
const int offload_end) {
if (nall >= _buf_size || nlocal >= _buf_local_size)
@ -79,18 +91,22 @@ class IntelBuffers {
free_nmax();
free_list_local();
free_ncache();
free_list_ptrs();
}
inline void grow_list(NeighList *list, const int nlocal, const int nthreads,
const int offload_end, const int pack_width=1) {
grow_list_local(list, offload_end);
const int three_body, const int offload_end,
const int pack_width=1) {
grow_list_local(list, three_body, offload_end);
grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
}
void free_list_local();
inline void grow_list_local(NeighList *list, const int offload_end) {
inline void grow_list_local(NeighList *list, const int three_body,
const int offload_end) {
_neigh_list_ptrs[0].list_ptr = (void *)list;
if (list->get_maxlocal() > _off_map_listlocal)
_grow_list_local(list, offload_end);
_grow_list_local(list, three_body, offload_end);
}
void free_ccache();
@ -133,26 +149,36 @@ class IntelBuffers {
_grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
}
void set_ntypes(const int ntypes, const int use_ghost_cut = 0);
void set_ntypes(const int ntypes, const int use_ghost_cut = 1);
inline int * firstneigh(const NeighList * /*list*/) { return _list_alloc; }
inline int * cnumneigh(const NeighList * /*list*/) { return _cnumneigh; }
inline int * intel_list(const NeighList * /*list*/) { return _list_alloc; }
inline int * get_atombin() { return _atombin; }
inline int * get_binpacked() { return _binpacked; }
inline int * cnumneigh() { return _neigh_list_ptrs[0].cnumneigh; }
inline void get_list_data3(const NeighList *list, int *&numneighhalf,
int *&cnumneigh) {
for (int i = 0; i < _n_list_ptrs; i++)
if ((void *)list == _neigh_list_ptrs[i].list_ptr) {
numneighhalf = _neigh_list_ptrs[i].numneighhalf;
cnumneigh = _neigh_list_ptrs[i].cnumneigh;
}
}
void grow_data3(NeighList *list, int *&numneighhalf, int *&cnumneigh);
void free_list_ptrs();
inline atom_t * get_x(const int /*offload*/ = 1) {
inline atom_t * get_x(const int offload = 1) {
#ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers && offload == 0) return _host_x;
#endif
return _x;
}
inline flt_t * get_q(const int /*offload*/ = 1) {
inline flt_t * get_q(const int offload = 1) {
#ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers && offload == 0) return _host_q;
#endif
return _q;
}
inline quat_t * get_quat(const int /*offload*/ = 1) {
inline quat_t * get_quat(const int offload = 1) {
#ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers && offload == 0) return _host_quat;
#endif
@ -298,6 +324,9 @@ class IntelBuffers {
int _list_alloc_atoms;
int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked;
IntelNeighListPtrs *_neigh_list_ptrs;
int _n_list_ptrs, _max_list_ptrs;
flt_t **_cutneighsq, **_cutneighghostsq;
int _ntypes;
@ -325,7 +354,7 @@ class IntelBuffers {
int _off_map_nmax, _cop, _off_ccache, _off_ncache;
int *_off_map_ilist;
int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
int *_off_map_numneigh;
int **_off_map_firstneigh, *_off_map_numneigh;
bool _off_list_alloc;
#endif
@ -336,7 +365,8 @@ class IntelBuffers {
void _grow(const int nall, const int nlocal, const int nthreads,
const int offload_end);
void _grow_nmax(const int offload_end);
void _grow_list_local(NeighList *list, const int offload_end);
void _grow_list_local(NeighList *list, const int three_body,
const int offload_end);
void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
const int offload_end, const int pack_width);
};

View File

@ -20,6 +20,9 @@
#if (__INTEL_COMPILER_BUILD_DATE > 20160720)
#define LMP_INTEL_USE_SIMDOFF
#endif
#pragma warning (disable:3948)
#pragma warning (disable:3949)
#pragma warning (disable:13200)
#endif
#ifdef __INTEL_OFFLOAD
@ -606,7 +609,7 @@ inline double MIC_Wtime() {
if (newton) \
f_stride = buffers->get_stride(nall); \
else \
f_stride = buffers->get_stride(inum); \
f_stride = buffers->get_stride(nlocal); \
}
#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \

View File

@ -213,6 +213,12 @@ namespace ip_simd {
_MM_SCALE_8);
}
inline SIMD_int SIMD_gatherz(const SIMD_mask &m, const int *p,
const SIMD_int &i) {
return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p,
_MM_SCALE_4);
}
inline SIMD_float SIMD_gatherz(const SIMD_mask &m, const float *p,
const SIMD_int &i) {
return _mm512_mask_i32gather_ps( _mm512_set1_ps((float)0), m, i, p,
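For reference, a standalone illustration of the masked gather-with-zero-fill that the new integer SIMD_gatherz overload wraps (assumes an AVX-512 capable compiler and CPU; not part of the package): lanes whose mask bit is clear receive 0 instead of a loaded value.

#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(64) int data[16];
  for (int k = 0; k < 16; k++) data[k] = 100 + k;
  // gather data[0..15] in lane order, but only where the mask bit is set
  const __m512i idx = _mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
  const __mmask16 m = 0x00FF;                    // low 8 lanes active
  __m512i v = _mm512_mask_i32gather_epi32(_mm512_set1_epi32(0), m, idx, data, 4);
  alignas(64) int out[16];
  _mm512_store_epi32(out, v);
  for (int k = 0; k < 16; k++) printf("%d ", out[k]);   // 100..107, then eight zeros
  printf("\n");
  return 0;
}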

View File

@ -192,7 +192,6 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
// ---------- Bin Atoms -------------
_fix->start_watch(TIME_HOST_NEIGHBOR);
//const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const atombin = this->_atombin;
int * _noalias const binpacked = this->_binpacked;

View File

@ -86,7 +86,7 @@ void NPairFullBinGhostIntel::fbi(NeighList * list,
// only uses offload_end_neighbor to check whether we are doing offloading
// at all, no need to correct this later
buffers->grow_list(list, nall, comm->nthreads, off_end,
buffers->grow_list(list, nall, comm->nthreads, 0, off_end,
_fix->nbor_pack_width());
int need_ic = 0;
@ -106,7 +106,7 @@ void NPairFullBinGhostIntel::fbi(NeighList * list,
/* ---------------------------------------------------------------------- */
template<class flt_t, class acc_t, int need_ic>
void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
IntelBuffers<flt_t,acc_t> * buffers,
const int pstart, const int pend) {
if (pend-pstart == 0) return;
@ -116,7 +116,8 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
const int aend = nall;
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
int * _noalias const intel_list = buffers->intel_list(list);
int ** _noalias const firstneigh = list->firstneigh;
const int e_nall = nall_t;
const int molecular = atom->molecular;
@ -140,7 +141,6 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int nstencil = this->nstencil;
const int * _noalias const stencil = this->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
@ -228,7 +228,7 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(intel_list:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
in(atombin:length(aend) alloc_if(0) free_if(0)) \
@ -295,7 +295,7 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
int pack_offset = maxnbors;
int ct = (ifrom + tid * 2) * maxnbors;
int *neighptr = firstneigh + ct;
int *neighptr = intel_list + ct;
const int obound = pack_offset + maxnbors * 2;
const int toffs = tid * ncache_stride;
@ -528,12 +528,12 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
if (ns > maxnbors) *overflow = 1;
ilist[i] = i;
cnumneigh[i] = ct;
firstneigh[i] = intel_list + ct;
numneigh[i] = ns;
ct += ns;
IP_PRE_edge_align(ct, sizeof(int));
neighptr = firstneigh + ct;
neighptr = intel_list + ct;
if (ct + obound > list_size) {
if (i < ito - 1) {
*overflow = 1;
@ -564,13 +564,12 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
if (offload) {
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
_fix->start_watch(TIME_HOST_NEIGHBOR);
firstneigh[0] = intel_list;
for (int n = 0; n < aend; n++) {
ilist[n] = n;
numneigh[n] = 0;
}
} else {
for (int i = 0; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
if (separate_buffers) {
_fix->start_watch(TIME_PACK);
_fix->set_neighbor_host_sizes();
@ -581,10 +580,5 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
_fix->stop_watch(TIME_PACK);
}
}
#else
#pragma vector aligned
#pragma simd
for (int i = 0; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
#endif
}

View File

@ -70,7 +70,8 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
int offload_noghost = _fix->offload_noghost();
#endif
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
buffers->grow_list(list, atom->nlocal, comm->nthreads,
_fix->three_body_neighbor(), off_end,
_fix->nbor_pack_width());
int need_ic = 0;

View File

@ -71,7 +71,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
int offload_noghost = _fix->offload_noghost();
#endif
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end);
buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
int need_ic = 0;
if (atom->molecular)

View File

@ -71,7 +71,7 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
int offload_noghost = _fix->offload_noghost();
#endif
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end);
buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
int need_ic = 0;
if (atom->molecular)

View File

@ -0,0 +1,47 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
// Only used with hybrid to generate a list for a non-intel style; the
// standard (non-intel) build routines are used.
#ifdef NPAIR_CLASS
NPairStyle(halffull/newtoff/intel,
NPairHalffullNewtoff,
NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
NP_ORTHO | NP_TRI | NP_INTEL)
NPairStyle(halffull/newtoff/skip/intel,
NPairHalffullNewtoff,
NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL)
NPairStyle(halffull/newtoff/ghost/intel,
NPairHalffullNewtoff,
NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL)
NPairStyle(halffull/newtoff/skip/ghost/intel,
NPairHalffullNewtoff,
NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
NP_ORTHO | NP_TRI | NP_SKIP | NP_GHOST | NP_INTEL)
#endif
/* ERROR/WARNING messages:
*/

View File

@ -0,0 +1,233 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include "npair_halffull_newton_intel.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "modify.h"
#include "molecule.h"
#include "domain.h"
#include "my_page.h"
#include "error.h"
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
NPairHalffullNewtonIntel::NPairHalffullNewtonIntel(LAMMPS *lmp) : NPair(lmp) {
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
_fix = static_cast<FixIntel *>(modify->fix[ifix]);
}
/* ----------------------------------------------------------------------
build half list from full list
pair stored once if i,j are both owned and i < j
if j is ghost, only store if j coords are "above and to the right" of i
works if full list is a skip list
------------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void NPairHalffullNewtonIntel::build_t(NeighList *list,
IntelBuffers<flt_t,acc_t> *buffers)
{
const int inum_full = list->listfull->inum;
const int nlocal = atom->nlocal;
const int e_nall = nlocal + atom->nghost;
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const ilist = list->ilist;
int * _noalias const numneigh = list->numneigh;
int ** _noalias const firstneigh = list->firstneigh;
const int * _noalias const ilist_full = list->listfull->ilist;
const int * _noalias const numneigh_full = list->listfull->numneigh;
const int ** _noalias const firstneigh_full =
(const int ** const)list->listfull->firstneigh;
#if defined(_OPENMP)
#pragma omp parallel
#endif
{
int tid, ifrom, ito;
IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, comm->nthreads);
// each thread has its own page allocator
MyPage<int> &ipage = list->ipage[tid];
ipage.reset();
// loop over parent full list
for (int ii = ifrom; ii < ito; ii++) {
int n = 0;
int *neighptr = ipage.vget();
const int i = ilist_full[ii];
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
// loop over full neighbor list
const int * _noalias const jlist = firstneigh_full[i];
const int jnum = numneigh_full[i];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int jj = 0; jj < jnum; jj++) {
const int joriginal = jlist[jj];
const int j = joriginal & NEIGHMASK;
int addme = 1;
if (j < nlocal) {
if (i > j) addme = 0;
} else {
if (x[j].z < ztmp) addme = 0;
if (x[j].z == ztmp) {
if (x[j].y < ytmp) addme = 0;
if (x[j].y == ytmp && x[j].x < xtmp) addme = 0;
}
}
if (addme)
neighptr[n++] = joriginal;
}
ilist[ii] = i;
firstneigh[i] = neighptr;
numneigh[i] = n;
int pad_end = n;
IP_PRE_neighbor_pad(pad_end, 0);
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
avg=INTEL_COMPILE_WIDTH/2
#endif
for ( ; n < pad_end; n++)
neighptr[n] = e_nall;
ipage.vgot(n);
if (ipage.status())
error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
}
}
list->inum = inum_full;
}
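A minimal standalone sketch of the pair-selection rule applied in the loop above (illustrative, not LAMMPS code; keep_pair and the coordinates are hypothetical): an i-j pair from the full list is stored once, either because j is an owned atom with a larger index, or because j is a ghost lying "above and to the right" of i when comparing z, then y, then x.

#include <cstdio>

struct Pos { double x, y, z; };

// Returns true when the i-j pair should be stored in the half list
// (mirrors the addme logic in build_t(); indices and nlocal are illustrative).
bool keep_pair(int i, int j, int nlocal, const Pos *x) {
  if (j < nlocal) return i < j;                   // both owned: store once, i < j
  if (x[j].z != x[i].z) return x[j].z > x[i].z;   // ghost: compare z, then y, then x
  if (x[j].y != x[i].y) return x[j].y > x[i].y;
  return x[j].x >= x[i].x;
}

int main() {
  const Pos x[3] = {{0,0,0}, {1,0,0}, {0,0,-1}};
  // atom 2 is treated as a ghost (index >= nlocal) sitting below atom 0
  printf("%d %d\n", keep_pair(0, 1, 2, x), keep_pair(0, 2, 2, x));  // prints: 1 0
  return 0;
}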
/* ----------------------------------------------------------------------
build half list from full 3-body list
half list is already stored as first part of 3-body list
------------------------------------------------------------------------- */
template <class flt_t>
void NPairHalffullNewtonIntel::build_t3(NeighList *list, int *numhalf)
{
const int inum_full = list->listfull->inum;
const int e_nall = atom->nlocal + atom->nghost;
int * _noalias const ilist = list->ilist;
int * _noalias const numneigh = list->numneigh;
int ** _noalias const firstneigh = list->firstneigh;
const int * _noalias const ilist_full = list->listfull->ilist;
const int * _noalias const numneigh_full = numhalf;
const int ** _noalias const firstneigh_full =
(const int ** const)list->listfull->firstneigh;
int packthreads = 1;
if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int tid, ifrom, ito;
IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, packthreads);
// each thread has its own page allocator
MyPage<int> &ipage = list->ipage[tid];
ipage.reset();
// loop over parent full list
for (int ii = ifrom; ii < ito; ii++) {
int n = 0;
int *neighptr = ipage.vget();
const int i = ilist_full[ii];
// loop over full neighbor list
const int * _noalias const jlist = firstneigh_full[i];
const int jnum = numneigh_full[ii];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int jj = 0; jj < jnum; jj++) {
const int joriginal = jlist[jj];
neighptr[n++] = joriginal;
}
ilist[ii] = i;
firstneigh[i] = neighptr;
numneigh[i] = n;
int pad_end = n;
IP_PRE_neighbor_pad(pad_end, 0);
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
avg=INTEL_COMPILE_WIDTH/2
#endif
for ( ; n < pad_end; n++)
neighptr[n] = e_nall;
ipage.vgot(n);
if (ipage.status())
error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
}
}
list->inum = inum_full;
}
/* ---------------------------------------------------------------------- */
void NPairHalffullNewtonIntel::build(NeighList *list)
{
if (_fix->three_body_neighbor() == 0) {
if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
build_t(list, _fix->get_mixed_buffers());
else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
build_t(list, _fix->get_double_buffers());
else
build_t(list, _fix->get_single_buffers());
} else {
int *nhalf, *cnum;
if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
_fix->get_mixed_buffers()->get_list_data3(list->listfull, nhalf, cnum);
build_t3<float>(list, nhalf);
} else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
_fix->get_double_buffers()->get_list_data3(list->listfull, nhalf, cnum);
build_t3<double>(list, nhalf);
} else {
_fix->get_single_buffers()->get_list_data3(list->listfull, nhalf, cnum);
build_t3<float>(list, nhalf);
}
}
}

View File

@ -0,0 +1,72 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#ifdef NPAIR_CLASS
NPairStyle(halffull/newton/intel,
NPairHalffullNewtonIntel,
NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
NP_ORTHO | NP_TRI| NP_INTEL)
NPairStyle(halffull/newton/skip/intel,
NPairHalffullNewtonIntel,
NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL)
#else
#ifndef LMP_NPAIR_HALFFULL_NEWTON_INTEL_H
#define LMP_NPAIR_HALFFULL_NEWTON_INTEL_H
#include "npair.h"
#include "fix_intel.h"
#if defined(_OPENMP)
#include <omp.h>
#endif
namespace LAMMPS_NS {
class NPairHalffullNewtonIntel : public NPair {
public:
NPairHalffullNewtonIntel(class LAMMPS *);
~NPairHalffullNewtonIntel() {}
void build(class NeighList *);
protected:
FixIntel *_fix;
template<class flt_t, class acc_t>
void build_t(NeighList *, IntelBuffers<flt_t,acc_t> *);
template<class flt_t>
void build_t3(NeighList *, int *);
};
}
#endif
#endif
/* ERROR/WARNING messages:
E: The 'package intel' command is required for /intel styles
Self explanatory.
*/

View File

@ -52,12 +52,61 @@ NPairIntel::~NPairIntel() {
/* ---------------------------------------------------------------------- */
void NPairIntel::copy_neighbor_info()
{
NPair::copy_neighbor_info();
if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
copy_cutsq_info(_fix->get_mixed_buffers());
else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
copy_cutsq_info(_fix->get_double_buffers());
else
copy_cutsq_info(_fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void NPairIntel::copy_cutsq_info(IntelBuffers<flt_t,acc_t> *buffers) {
int tp1 = atom->ntypes + 1;
int use_ghost_cut = 0;
if (cutneighghostsq)
use_ghost_cut = 1;
buffers->set_ntypes(tp1, use_ghost_cut);
flt_t **cutneighsqb = buffers->get_cutneighsq();
for (int i = 1; i <= atom->ntypes; i++)
for (int j = 1; j <= atom->ntypes; j++)
cutneighsqb[i][j] = cutneighsq[i][j];
flt_t **cutneighghostsqb;
if (use_ghost_cut) {
cutneighghostsqb = buffers->get_cutneighghostsq();
for (int i = 1; i <= atom->ntypes; i++)
for (int j = 1; j <= atom->ntypes; j++)
cutneighghostsqb[i][j] = cutneighghostsq[i][j];
}
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
int tp1sq = tp1 * tp1;
flt_t * ocutneighsq = cutneighsqb[0];
#pragma offload_transfer target(mic:_cop) in(ocutneighsq: length(tp1sq))
if (use_ghost_cut) {
flt_t * ocutneighghostsq = cutneighghostsqb[0];
#pragma offload_transfer target(mic:_cop) \
in(ocutneighghostsq: length(tp1sq))
}
#endif
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t, int offload_noghost, int need_ic,
int FULL, int TRI, int THREE>
void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
void NPairIntel::bin_newton(const int offload, NeighList *list,
IntelBuffers<flt_t,acc_t> *buffers,
const int astart, const int aend,
const int /*offload_end*/) {
const int offload_end) {
if (aend-astart == 0) return;
@ -71,7 +120,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
const int pack_width = _fix->nbor_pack_width();
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
int * _noalias const intel_list = buffers->intel_list(list);
const int e_nall = nall_t;
const int molecular = atom->molecular;
@ -94,8 +143,10 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
const tagint * _noalias const tag = atom->tag;
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
int ** _noalias const firstneigh = list->firstneigh;
int * _noalias const numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh();
const int nstencil = this->nstencil;
const int * _noalias const stencil = this->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
@ -168,6 +219,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(intel_list:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
out(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
@ -178,7 +230,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
in(offload_end,separate_buffers,astart,aend,nlocal,molecular) \
in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
in(pack_width,special_bound) \
in(pack_width,special_bound) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(tag)
@ -212,7 +264,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(numneigh, overflow, nstencilp, binstart, binend)
shared(overflow, nstencilp, binstart, binend)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
@ -246,7 +298,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
const int obound = maxnbors * 3;
#endif
int ct = (ifrom + tid * 2) * maxnbors;
int *neighptr = firstneigh + ct;
int *neighptr = intel_list + ct;
int *neighptr2;
if (THREE) neighptr2 = neighptr;
@ -605,12 +657,24 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
if (n > maxnbors) *overflow = 1;
ilist[i] = i;
cnumneigh[i] = ct;
firstneigh[i] = intel_list + ct;
if (THREE) {
numneigh[i] = ns;
cnumneigh[i] = ct;
#ifdef LMP_INTEL_3BODY_FAST
cnumneigh[i] += lane;
#else
// Pad anyway, just in case we have hybrid with 2-body and newton off
int pad_end = ns;
IP_PRE_neighbor_pad(pad_end, offload);
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
avg=INTEL_COMPILE_WIDTH/2
#endif
for ( ; ns < pad_end; ns++)
neighptr[n++] = e_nall;
#endif
numneigh[i] = ns;
} else {
numneigh[i] = n;
int pad_end = n;
@ -631,7 +695,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
if (lane == pack_width) {
ct += max_chunk * pack_width;
IP_PRE_edge_align(ct, sizeof(int));
neighptr = firstneigh + ct;
neighptr = intel_list + ct;
neighptr2 = neighptr;
max_chunk = 0;
lane = 0;
@ -647,7 +711,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
{
ct += n;
//IP_PRE_edge_align(ct, sizeof(int));
neighptr = firstneigh + ct;
neighptr = intel_list + ct;
if (THREE) neighptr2 = neighptr;
if (ct + obound > list_size) {
if (i < ito - 1) {
@ -667,7 +731,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
int ghost_offset = 0, nall_offset = e_nall;
if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh[i];
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
#if __INTEL_COMPILER+0 > 1499
@ -712,7 +776,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
}
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh[i];
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
int jj = 0;
@ -739,13 +803,12 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
if (offload) {
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
_fix->start_watch(TIME_HOST_NEIGHBOR);
firstneigh[0] = intel_list;
for (int n = 0; n < aend; n++) {
ilist[n] = n;
numneigh[n] = 0;
}
} else {
for (int i = astart; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
if (separate_buffers) {
_fix->start_watch(TIME_PACK);
_fix->set_neighbor_host_sizes();
@ -756,11 +819,6 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
_fix->stop_watch(TIME_PACK);
}
}
#else
#pragma vector aligned
#pragma simd
for (int i = astart; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
#endif
}

View File

@ -75,7 +75,8 @@ class NPairIntel : public NPair {
public:
NPairIntel(class LAMMPS *);
~NPairIntel();
virtual void copy_neighbor_info();
#ifdef _LMP_INTEL_OFFLOAD
void grow_stencil();
#endif
@ -83,6 +84,9 @@ class NPairIntel : public NPair {
protected:
FixIntel *_fix;
template <class flt_t, class acc_t>
void copy_cutsq_info(IntelBuffers<flt_t,acc_t> *);
template <class flt_t, class acc_t, int, int, int, int, int>
void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *,
const int, const int, const int offload_end = 0);

View File

@ -0,0 +1,230 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include "npair_skip_intel.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "modify.h"
#include "molecule.h"
#include "neigh_request.h"
#include "domain.h"
#include "my_page.h"
#include "error.h"
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
NPairSkipIntel::NPairSkipIntel(LAMMPS *lmp) : NPair(lmp) {
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
_fix = static_cast<FixIntel *>(modify->fix[ifix]);
_inum_starts = new int[comm->nthreads];
_inum_counts = new int[comm->nthreads];
_full_props = 0;
}
/* ---------------------------------------------------------------------- */
NPairSkipIntel::~NPairSkipIntel() {
delete []_inum_starts;
delete []_inum_counts;
if (_full_props) delete []_full_props;
}
/* ---------------------------------------------------------------------- */
void NPairSkipIntel::copy_neighbor_info()
{
NPair::copy_neighbor_info();
if (_full_props) delete []_full_props;
_full_props = new int[neighbor->nlist];
for (int i = 0; i < neighbor->nlist; i++)
_full_props[i] = neighbor->requests[i]->full;
}
/* ----------------------------------------------------------------------
build skip list for subset of types from parent list
works for half and full lists
works for owned (non-ghost) list, also for ghost list
iskip and ijskip flag which atom types and type pairs to skip
if ghost, also store neighbors of ghost atoms & set inum,gnum correctly
------------------------------------------------------------------------- */
template<class flt_t, int THREE>
void NPairSkipIntel::build_t(NeighList *list, int *numhalf, int *cnumneigh,
int *numhalf_skip)
{
const int nlocal = atom->nlocal;
const int e_nall = nlocal + atom->nghost;
const int * _noalias const type = atom->type;
int * _noalias const ilist = list->ilist;
int * _noalias const numneigh = list->numneigh;
int ** _noalias const firstneigh = (int ** const)list->firstneigh;
const int * _noalias const ilist_skip = list->listskip->ilist;
const int * _noalias const numneigh_skip = list->listskip->numneigh;
const int ** _noalias const firstneigh_skip =
(const int ** const)list->listskip->firstneigh;
const int * _noalias const iskip = list->iskip;
const int ** _noalias const ijskip = (const int ** const)list->ijskip;
int num_skip = list->listskip->inum;
if (list->ghost) num_skip += list->listskip->gnum;
int packthreads;
if (comm->nthreads > INTEL_HTHREADS && THREE==0)
packthreads = comm->nthreads;
else
packthreads = 1;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int tid, ifrom, ito;
IP_PRE_omp_range_id(ifrom, ito, tid, num_skip, packthreads);
// each thread has its own page allocator
MyPage<int> &ipage = list->ipage[tid];
ipage.reset();
int my_inum = ifrom;
_inum_starts[tid] = ifrom;
// loop over parent full list
for (int ii = ifrom; ii < ito; ii++) {
const int i = ilist_skip[ii];
const int itype = type[i];
if (iskip[itype]) continue;
int n = 0;
int *neighptr = ipage.vget();
// loop over parent non-skip list
const int * _noalias const jlist = firstneigh_skip[i];
const int jnum = numneigh_skip[i];
if (THREE) {
const int jnumhalf = numhalf_skip[ii];
for (int jj = 0; jj < jnumhalf; jj++) {
const int joriginal = jlist[jj];
const int j = joriginal & NEIGHMASK;
if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal;
}
numhalf[my_inum] = n;
for (int jj = jnumhalf; jj < jnum; jj++) {
const int joriginal = jlist[jj];
const int j = joriginal & NEIGHMASK;
if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal;
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int jj = 0; jj < jnum; jj++) {
const int joriginal = jlist[jj];
const int j = joriginal & NEIGHMASK;
if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal;
}
}
ilist[my_inum++] = i;
firstneigh[i] = neighptr;
numneigh[i] = n;
int pad_end = n;
IP_PRE_neighbor_pad(pad_end, 0);
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
avg=INTEL_COMPILE_WIDTH/2
#endif
for ( ; n < pad_end; n++)
neighptr[n] = e_nall;
ipage.vgot(n);
if (ipage.status())
error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
}
_inum_counts[tid] = my_inum;
}
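// each thread filled a disjoint [start,count) segment of ilist (and of
// numhalf for three-body styles); compact the segments into one contiguous list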
int inum = _inum_counts[0];
for (int tid = 1; tid < packthreads; tid++) {
for (int i = _inum_starts[tid]; i < _inum_counts[tid]; i++) {
if (THREE) numhalf[inum] = numhalf[i];
ilist[inum++] = ilist[i];
}
}
list->inum = inum;
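// for three-body styles, record each atom's neighbor-list offset relative to
// the first atom's list so the intel buffers can index from a single base pointer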
if (THREE && num_skip > 0) {
int * const list_start = firstneigh[ilist[0]];
for (int ii = 0; ii < inum; ii++) {
int i = ilist[ii];
cnumneigh[ii] = static_cast<int>(firstneigh[i] - list_start);
}
}
if (list->ghost) {
int num = 0;
int my_inum = list->inum;
for (int i = 0; i < my_inum; i++)
if (ilist[i] < nlocal) num++;
else break;
list->inum = num;
list->gnum = my_inum - num;
}
}
/* ---------------------------------------------------------------------- */
void NPairSkipIntel::build(NeighList *list)
{
if (_fix->three_body_neighbor()==0 ||
_full_props[list->listskip->index] == 0) {
if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
build_t<float,0>(list, 0, 0, 0);
else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
build_t<double,0>(list, 0, 0, 0);
else
build_t<float,0>(list, 0, 0, 0);
} else {
int *nhalf, *cnumneigh, *nhalf_skip, *u;
if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
_fix->get_mixed_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
_fix->get_mixed_buffers()->grow_data3(list, nhalf, cnumneigh);
build_t<float,1>(list, nhalf, cnumneigh, nhalf_skip);
} else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
_fix->get_double_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
_fix->get_double_buffers()->grow_data3(list, nhalf, cnumneigh);
build_t<double,1>(list, nhalf, cnumneigh, nhalf_skip);
} else {
_fix->get_single_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
_fix->get_single_buffers()->grow_data3(list,nhalf,cnumneigh);
build_t<float,1>(list, nhalf, cnumneigh, nhalf_skip);
}
}
}

View File

@ -0,0 +1,69 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef NPAIR_CLASS
NPairStyle(skip/intel,
NPairSkipIntel,
NP_SKIP | NP_HALF | NP_FULL |
NP_NSQ | NP_BIN | NP_MULTI |
NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL)
NPairStyle(skip/ghost/intel,
NPairSkipIntel,
NP_SKIP | NP_HALF | NP_FULL |
NP_NSQ | NP_BIN | NP_MULTI |
NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL)
#else
#ifndef LMP_NPAIR_SKIP_INTEL_H
#define LMP_NPAIR_SKIP_INTEL_H
#include "npair.h"
#include "fix_intel.h"
#if defined(_OPENMP)
#include <omp.h>
#endif
namespace LAMMPS_NS {
class NPairSkipIntel : public NPair {
public:
NPairSkipIntel(class LAMMPS *);
~NPairSkipIntel();
virtual void copy_neighbor_info();
void build(class NeighList *);
protected:
FixIntel *_fix;
int *_inum_starts, *_inum_counts, *_full_props;
template<class flt_t, int THREE>
void build_t(NeighList *, int *numhalf, int *cnumneigh, int *numhalf_skip);
};
}
#endif
#endif
/* ERROR/WARNING messages:
E: The 'package intel' command is required for /intel styles
Self explanatory.
*/

View File

@ -97,6 +97,13 @@ struct LAMMPS_NS::PairAIREBOIntelParam {
namespace {
struct NeighListLMPAIREBO {
int * num; /* num_all */
int * num_half; /* num_all */
int * offset; /* num_all */
int ** entries; /* num_all * num_neighs_per_atom */
};
struct NeighListAIREBO {
int * num; /* num_all */
int * num_half; /* num_all */
@ -125,7 +132,7 @@ struct KernelArgsAIREBOT {
int neigh_from_atom, neigh_to_atom;
int rebuild_flag;
flt_t skin;
struct NeighListAIREBO neigh_lmp;
struct NeighListLMPAIREBO neigh_lmp;
struct NeighListAIREBO neigh_rebo;
PairAIREBOIntelParam<flt_t,acc_t> params;
struct AtomAIREBOT<flt_t> * x; /* num_all */
@ -176,6 +183,11 @@ void PairAIREBOIntel::init_style()
PairAIREBO::init_style();
neighbor->requests[neighbor->nrequest-1]->intel = 1;
const int nrequest = neighbor->nrequest;
for (int i = 0; i < nrequest; ++i)
if (neighbor->requests[i]->skip)
error->all(FLERR, "Cannot yet use airebo/intel with hybrid.");
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
@ -399,8 +411,7 @@ void PairAIREBOIntel::eval(
ATOM_T * _noalias const x = buffers->get_x(offload);
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
tagint * const tag = atom->tag;
const int ntypes = atom->ntypes + 1;
@ -441,7 +452,6 @@ void PairAIREBOIntel::eval(
#pragma offload target(mic:_cop) if(offload) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
@ -472,10 +482,8 @@ void PairAIREBOIntel::eval(
f_stride, x, 0/*q*/);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
}
if (EVFLAG)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -508,8 +516,7 @@ void PairAIREBOIntel::eval(
args.skin = skin;
args.neigh_lmp.num = const_cast<int*>(numneigh);
args.neigh_lmp.num_half = const_cast<int*>(numneighhalf);
args.neigh_lmp.offset = const_cast<int*>(cnumneigh);
args.neigh_lmp.entries = const_cast<int*>(firstneigh);
args.neigh_lmp.entries = const_cast<int**>(firstneigh);
args.neigh_rebo.num = REBO_numneigh;
args.neigh_rebo.num_half = REBO_num_skin;
args.neigh_rebo.offset = REBO_cnumneigh;
@ -543,18 +550,14 @@ void PairAIREBOIntel::eval(
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EVFLAG) {
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
@ -578,28 +581,20 @@ template<class flt_t, class acc_t>
void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) {
int tp1 = atom->ntypes + 1;
buffers->set_ntypes(tp1,1);
flt_t **cutneighsq = buffers->get_cutneighsq();
flt_t **cutneighghostsq = buffers->get_cutneighghostsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
cut = cutghost[i][j] + neighbor->skin;
cutneighghostsq[i][j] = cutneighghostsq[j][i] = cut*cut;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
flt_t * ocutneighsq = cutneighsq[0];
size_t VL = 512 / 8 / sizeof(flt_t);
int ntypes = tp1;
int tp1sq = tp1 * tp1;
@ -607,7 +602,6 @@ void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) {
// it might not be freed if this method is called more than once
int * map = this->map;
#pragma offload_transfer target(mic:_cop) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \
in(map: length(tp1) alloc_if(1) free_if(0))
#endif
@ -1580,7 +1574,7 @@ void ref_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
ka->nC[i] = 0;
ka->nH[i] = 0;
for (int j = 0; j < ka->neigh_lmp.num[i]; j++) {
int ji = ka->neigh_lmp.entries[ka->neigh_lmp.offset[i] + j];
int ji = ka->neigh_lmp.entries[i][j];
flt_t delx = ka->x[i].x - ka->x[ji].x;
flt_t dely = ka->x[i].y - ka->x[ji].y;
flt_t delz = ka->x[i].z - ka->x[ji].z;
@ -2168,7 +2162,7 @@ void ref_lennard_jones_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
int morseflag) {
AtomAIREBOT<flt_t> * x = ka->x;
int jj;
int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
int * neighs = ka->neigh_lmp.entries[i];
int jnum = ka->neigh_lmp.num_half[i];
for (jj = 0; jj < jnum; jj++) {
int j = neighs[jj];
@ -2332,7 +2326,7 @@ static void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
int jnum = ka->rebuild_flag ? ka->neigh_lmp.num[i] :
ka->neigh_rebo.num_half[i];
int * neighs = ka->rebuild_flag ?
&ka->neigh_lmp.entries[ka->neigh_lmp.offset[i]] :
ka->neigh_lmp.entries[i] :
&ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]+jnum];
int * skin_target = &ka->neigh_rebo.entries[offset+ka->num_neighs_per_atom];
int n = 0;
@ -4554,7 +4548,7 @@ static void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
fvec p_lj40 = fvec::set1(ka->params.lj4[itype][0]);
fvec p_lj41 = fvec::set1(ka->params.lj4[itype][1]);
int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
int * neighs = ka->neigh_lmp.entries[i];
int jnum = ka->neigh_lmp.num_half[i];
bool tap_success = aut_airebo_lj_test_all_paths(ka, i, &test_path_result);

View File

@ -107,4 +107,10 @@ E: Cannot open AIREBO potential file %s
The specified AIREBO potential file cannot be opened. Check that the
path and name are correct.
E: Cannot yet use airebo/intel with hybrid.
Pair style airebo/intel cannot currently be used as part of a hybrid
pair style (with the exception of hybrid/overlay).
*/

View File

@ -144,8 +144,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -182,7 +181,6 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
in(c_force, c_energy, c_cut:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -204,8 +202,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
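// with newton off, a hybrid sub-list may cover only part of the local atoms,
// so force-buffer entries the loop below never touches must be zeroed first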
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -231,8 +231,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const C_CUT_T * _noalias const c_cuti = c_cut + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -361,13 +361,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -375,6 +372,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -439,19 +438,16 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
fc.set_ntypes(tp1, ntable, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -483,12 +479,10 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
C_FORCE_T * c_force = fc.c_force[0];
C_ENERGY_T * c_energy = fc.c_energy[0];
C_CUT_T * c_cut = fc.c_cut[0];
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0))
#endif
}

View File

@ -144,8 +144,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -206,7 +205,6 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
in(rho_inv:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -230,8 +228,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -265,8 +265,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -437,13 +437,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -451,6 +448,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -523,19 +522,16 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
fc.set_ntypes(tp1, ntable, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -591,15 +587,13 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * detable = fc.detable;
flt_t * ctable = fc.ctable;
flt_t * dctable = fc.dctable;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
in(rho_inv: length(tp1sq) alloc_if(0) free_if(0)) \
in(table: length(ntable) alloc_if(0) free_if(0)) \
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0))
#endif
}

View File

@ -135,8 +135,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_lj = fc.special_lj;
const C_FORCE_T * _noalias const c_force = fc.c_force[0];
@ -167,7 +166,6 @@ void PairBuckIntel::eval(const int offload, const int vflag,
in(special_lj:length(0) alloc_if(0) free_if(0)) \
in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
@ -188,8 +186,10 @@ void PairBuckIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -214,8 +214,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
const int ptr_off = itype * ntypes;
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -324,13 +324,9 @@ void PairBuckIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -338,6 +334,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -396,19 +394,16 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -434,12 +429,10 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * special_lj = fc.special_lj;
C_FORCE_T * c_force = fc.c_force[0];
C_ENERGY_T * c_energy = fc.c_energy[0];
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj: length(4) alloc_if(0) free_if(0)) \
in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0))
#endif
}

View File

@ -173,8 +173,7 @@ void PairDPDIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const FC_PACKED1_T * _noalias const param = fc.param[0];
const flt_t * _noalias const special_lj = fc.special_lj;
int * _noalias const rngi_thread = fc.rngi;
@ -204,8 +203,10 @@ void PairDPDIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -233,10 +234,10 @@ void PairDPDIntel::eval(const int offload, const int vflag,
flt_t icut, a0, gamma, sigma;
if (ONETYPE) {
icut = param[3].icut;
a0 = param[3].a0;
gamma = param[3].gamma;
sigma = param[3].sigma;
icut = param[_onetype].icut;
a0 = param[_onetype].a0;
gamma = param[_onetype].gamma;
sigma = param[_onetype].sigma;
}
for (int ii = iifrom; ii < iito; ii += iip) {
const int i = ilist[ii];
@ -248,8 +249,8 @@ void PairDPDIntel::eval(const int offload, const int vflag,
parami = param + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp, fytmp, fztmp, fwtmp;
@ -379,13 +380,9 @@ void PairDPDIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -393,6 +390,8 @@ void PairDPDIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -503,32 +502,26 @@ void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
_onetype = 0;
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
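// count how many type pairs are explicitly set; if exactly one (and the
// system is not molecular), cache its flattened index for the ONETYPE kernel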
int mytypes = 0;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
} else {
cut = init_one(i,j);
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
mytypes++;
_onetype = i * tp1 + j;
}
double cut = init_one(i,j);
cutsq[i][j] = cutsq[j][i] = cut*cut;
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
}
}
if (mytypes > 1 || atom->molecular) _onetype = 0;
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];
fc.special_lj[0] = 1.0;

View File

@ -193,8 +193,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const FC_PACKED1_T * _noalias const rhor_spline_f = fc.rhor_spline_f;
const FC_PACKED1_T * _noalias const rhor_spline_e = fc.rhor_spline_e;
const FC_PACKED2_T * _noalias const z2r_spline_t = fc.z2r_spline_t;
@ -243,8 +242,10 @@ void PairEAMIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -293,8 +294,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
itype = x[i].w;
rhor_ioff = istride * itype;
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
const flt_t xtmp = x[i].x;
@ -450,8 +451,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
if (tid == 0)
comm->forward_comm_pair(this);
if (NEWTON_PAIR)
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
#if defined(_OPENMP)
#pragma omp barrier
@ -469,8 +469,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
rhor_ioff = istride * itype;
scale_fi = scale_f + itype*ntypes;
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp, fytmp, fztmp, fwtmp;
@ -597,11 +597,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
@ -610,6 +606,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -682,19 +680,16 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,nr,nrho,memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -733,13 +728,13 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4];
fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5];
fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6];
fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0];
fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1];
fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2];
fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3];
fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4];
fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5];
fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6];
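// z2r splines are indexed through type2z2r, not type2rhor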
fc.z2r_spline_t[joff + k].a=z2r_spline[type2z2r[j][i]][k][0];
fc.z2r_spline_t[joff + k].b=z2r_spline[type2z2r[j][i]][k][1];
fc.z2r_spline_t[joff + k].c=z2r_spline[type2z2r[j][i]][k][2];
fc.z2r_spline_t[joff + k].d=z2r_spline[type2z2r[j][i]][k][3];
fc.z2r_spline_t[joff + k].e=z2r_spline[type2z2r[j][i]][k][4];
fc.z2r_spline_t[joff + k].f=z2r_spline[type2z2r[j][i]][k][5];
fc.z2r_spline_t[joff + k].g=z2r_spline[type2z2r[j][i]][k][6];
}
}
}
@ -754,7 +749,7 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int nr, const int nrho,
Memory *memory,
const int cop) {
if (ntypes != _ntypes || nr > _nr || nrho > _nrho) {
if (ntypes != _ntypes || nr + 1 > _nr || nrho + 1 > _nrho) {
if (_ntypes > 0) {
_memory->destroy(rhor_spline_f);
_memory->destroy(rhor_spline_e);

View File

@ -228,8 +228,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_lj = fc.special_lj;
const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
@ -283,7 +282,6 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(quat:length(nall+1) alloc_if(0) free_if(0)) \
@ -337,9 +335,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
#endif
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0.0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
if (NEWTON_PAIR == 0) f_start[1].w = 0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -372,8 +372,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
const flt_t xtmp = x[i].x;
@ -833,13 +833,9 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
f_start[1].w = ierror;
} // omp
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
ov0 *= (acc_t)-0.5;
ov1 *= (acc_t)-0.5;
ov2 *= (acc_t)-0.5;
@ -847,6 +843,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)-0.5;
ov5 *= (acc_t)-0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -914,19 +912,16 @@ void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
if (mthreads < buffers->get_off_threads())
mthreads = buffers->get_off_threads();
fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -963,14 +958,12 @@ void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
FC_PACKED1_T *oijc = fc.ijc[0];
FC_PACKED2_T *olj34 = fc.lj34[0];
FC_PACKED3_T *oic = fc.ic;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
if (oijc != NULL && oic != NULL) {
#pragma offload_transfer target(mic:_cop) \
in(special_lj: length(4) alloc_if(0) free_if(0)) \
in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
in(oic: length(tp1) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq))
in(oic: length(tp1) alloc_if(0) free_if(0))
}
#endif
}

View File

@ -138,8 +138,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -186,7 +185,6 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -212,8 +210,10 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -248,8 +248,8 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
const flt_t * _noalias const cutsqi = cutsq + ptr_off;
const LJ_T * _noalias const lji = lj + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -403,16 +403,10 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -420,6 +414,8 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -487,22 +483,19 @@ void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
if (cut_lj > cut_coul)
error->all(FLERR,
"Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -540,12 +533,10 @@ void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * special_coul = fc.special_coul;
flt_t * cutsq = fc.cutsq[0];
LJ_T * lj = fc.lj[0];
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0))
#endif
}

View File

@ -28,7 +28,7 @@
#include "suffix.h"
using namespace LAMMPS_NS;
#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec2_t
#define TABLE_T typename ForceConst<flt_t>::table_t
/* ---------------------------------------------------------------------- */
@ -142,8 +142,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -207,7 +206,6 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -232,8 +230,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -268,8 +268,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
const flt_t * _noalias const cutsqi = cutsq + ptr_off;
const LJ_T * _noalias const lji = lj + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -376,8 +376,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
if (rsq < cut_ljsq) {
#endif
flt_t r6inv = r2inv * r2inv * r2inv;
forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
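// rebuild the standard 12-6 coefficients from the packed pair
// (4*epsilon, sigma^6): lj4 = 4*eps*sig^6, lj3 = lj4*sig^6 = 4*eps*sig^12,
// lj2 = 6*lj4 and lj1 = 12*lj3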
flt_t eps4 = lji[jtype].x;
flt_t sigp6 = lji[jtype].y;
flt_t lj4 = eps4 * sigp6;
flt_t lj3 = lj4 * sigp6;
flt_t lj2 = (flt_t)6.0 * lj4;
flt_t lj1 = (flt_t)12.0 * lj3;
forcelj = r6inv * (lj1 * r6inv - lj2);
if (EFLAG) evdwl = r6inv*(lj3 * r6inv - lj4);
#ifdef INTEL_VMASK
if (rsq > cut_lj_innersq) {
@ -397,8 +403,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
}
#endif
} else {
const flt_t philj = r6inv * (lji[jtype].z*r6inv -
lji[jtype].w);
const flt_t philj = r6inv * (lj3 * r6inv - lj4);
#ifndef INTEL_VMASK
if (rsq > cut_lj_innersq)
#endif
@ -463,16 +468,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -480,6 +479,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -552,22 +553,19 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
fc.set_ntypes(tp1, ntable, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
if (cut_lj > cut_coul)
error->all(FLERR,
"Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -591,10 +589,13 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 1; i < tp1; i++) {
for (int j = 1; j < tp1; j++) {
fc.lj[i][j].x = lj1[i][j];
fc.lj[i][j].y = lj2[i][j];
fc.lj[i][j].z = lj3[i][j];
fc.lj[i][j].w = lj4[i][j];
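// pack only 4*epsilon and sigma^6 per pair (vec2_t); the kernel rebuilds
// lj1..lj4 on the fly, reading the i <= j triangle so mixed values stay symmetric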
if (i <= j) {
fc.lj[i][j].x = epsilon[i][j] * 4.0;
fc.lj[i][j].y = pow(sigma[i][j],6.0);
} else {
fc.lj[i][j].x = epsilon[j][i] * 4.0;
fc.lj[i][j].y = pow(sigma[j][i],6.0);
}
fc.cutsq[i][j] = cutsq[i][j];
}
}
@ -623,14 +624,12 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * detable = fc.detable;
flt_t * ctable = fc.ctable;
flt_t * dctable = fc.dctable;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
in(table: length(ntable) alloc_if(0) free_if(0)) \
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0))
#endif
}
@ -647,7 +646,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
flt_t * ospecial_lj = special_lj;
flt_t * ospecial_coul = special_coul;
flt_t * ocutsq = cutsq[0];
typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
typename IntelBuffers<flt_t,flt_t>::vec2_t * olj = lj[0];
table_t * otable = table;
flt_t * oetable = etable;
flt_t * odetable = detable;
@ -687,7 +686,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
flt_t * ospecial_lj = special_lj;
flt_t * ospecial_coul = special_coul;
flt_t * ocutsq = cutsq[0];
typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
typename IntelBuffers<flt_t,flt_t>::vec2_t * olj = lj[0];
table_t * otable = table;
flt_t * oetable = etable;
flt_t * odetable = detable;

View File

@ -69,7 +69,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
flt_t cut_lj_innersq;
table_t *table;
flt_t *etable, *detable, *ctable, *dctable;
typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;
typename IntelBuffers<flt_t,flt_t>::vec2_t **lj;
ForceConst() : _ntypes(0), _ntable(0) {}
~ForceConst() { set_ntypes(0,0,NULL,_cop); }

View File

@ -141,8 +141,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -201,7 +200,6 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -225,8 +223,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -260,8 +260,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -433,16 +433,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -450,6 +444,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -522,19 +518,16 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
fc.set_ntypes(tp1, ntable, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -587,14 +580,12 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * detable = fc.detable;
flt_t * ctable = fc.ctable;
flt_t * dctable = fc.dctable;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
in(table: length(ntable) alloc_if(0) free_if(0)) \
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0))
#endif
}

View File

@ -150,8 +150,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_lj = fc.special_lj;
const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0];
const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
@ -180,8 +179,10 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -201,12 +202,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
if (ONETYPE) {
cutsq = ljc12o[3].cutsq;
lj1 = ljc12o[3].lj1;
lj2 = ljc12o[3].lj2;
lj3 = lj34[3].lj3;
lj4 = lj34[3].lj4;
offset = ljc12o[3].offset;
cutsq = ljc12o[_onetype].cutsq;
lj1 = ljc12o[_onetype].lj1;
lj2 = ljc12o[_onetype].lj2;
lj3 = lj34[_onetype].lj3;
lj4 = lj34[_onetype].lj4;
offset = ljc12o[_onetype].offset;
}
for (int ii = iifrom; ii < iito; ii += iip) {
const int i = ilist[ii];
@ -219,8 +220,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
ljc12oi = ljc12o + ptr_off;
lj34i = lj34 + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp, fytmp, fztmp, fwtmp;
@ -338,13 +339,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -352,6 +349,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -414,25 +413,26 @@ void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
_onetype = 0;
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
int mytypes = 0;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
mytypes++;
_onetype = i * tp1 + j;
} else {
cut = 0.0;
}
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
if (mytypes > 1 || atom->molecular) _onetype = 0;
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];

View File

@ -130,37 +130,37 @@ void PairSWIntel::compute(int eflag, int vflag,
if (_onetype) {
if (_spq) {
if (eflag) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (eflag) {
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (_spq) {
if (eflag) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (eflag) {
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
}
@ -173,7 +173,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
void PairSWIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const int astart,
const int aend, const int /*pad_width*/)
const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
@ -185,10 +185,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
ATOM_T * _noalias const x = buffers->get_x(offload);
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int * _noalias const firstneigh = list->firstneigh[ilist[0]];
int *nhalf, *cnum;
buffers->get_list_data3(list, nhalf, cnum);
const int * _noalias const numneighhalf = nhalf;
const int * _noalias const cnumneigh = cnum;
const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
@ -206,6 +209,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
const int onetype = _onetype;
const int onetype3 = _onetype3;
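// flattened (i,j) and (i,j,k) indices into the pair (p2*) and triplet (p3)
// parameter tables, used by the ONETYPE-specialized code path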
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
@ -235,8 +240,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
in(overflow:length(0) alloc_if(0) free_if(0)) \
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload,onetype3) \
in(astart,nlocal,f_stride,minlocal,separate_flag,onetype) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@ -251,8 +256,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
@ -278,24 +283,24 @@ void PairSWIntel::eval(const int offload, const int vflag,
flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3, c4, c5, c6;
flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
if (ONETYPE) {
cutsq = p2[3].cutsq;
cut = p2f[3].cut;
sigma = p2f[3].sigma;
c1 = p2f2[3].c1;
c2 = p2f2[3].c2;
c3 = p2f2[3].c3;
c4 = p2f2[3].c4;
sigma_gamma = p2[3].sigma_gamma;
costheta = p3[7].costheta;
lambda_epsilon = p3[7].lambda_epsilon;
lambda_epsilon2 = p3[7].lambda_epsilon2;
cutsq = p2[onetype].cutsq;
cut = p2f[onetype].cut;
sigma = p2f[onetype].sigma;
c1 = p2f2[onetype].c1;
c2 = p2f2[onetype].c2;
c3 = p2f2[onetype].c3;
c4 = p2f2[onetype].c4;
sigma_gamma = p2[onetype].sigma_gamma;
costheta = p3[onetype3].costheta;
lambda_epsilon = p3[onetype3].lambda_epsilon;
lambda_epsilon2 = p3[onetype3].lambda_epsilon2;
if (SPQ == 0) {
powerp = p2f[3].powerp;
powerq = p2f[3].powerq;
powerp = p2f[onetype].powerp;
powerq = p2f[onetype].powerq;
}
if (EFLAG) {
c5 = p2e[3].c5;
c6 = p2e[3].c6;
c5 = p2e[onetype].c5;
c6 = p2e[onetype].c6;
}
}
@ -312,7 +317,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
const int jnum = numneigh[ii];
const int jnum = numneigh[i];
const int jnumhalf = numneighhalf[ii];
acc_t fxtmp, fytmp, fztmp, fwtmp;
@ -541,11 +546,9 @@ void PairSWIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -585,7 +588,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
void PairSWIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const int astart,
const int aend, const int pad_width)
const int aend)
{
typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
@ -601,10 +604,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
ATOM_T * _noalias const x = buffers->get_x(offload);
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int * _noalias const firstneigh = list->firstneigh[ilist[0]];
int *nhalf, *cnum;
buffers->get_list_data3(list, nhalf, cnum);
const int * _noalias const numneighhalf = nhalf;
const int * _noalias const cnumneigh = cnum;
const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
@ -654,7 +660,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \
in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
in(astart,nlocal,f_stride,minlocal,separate_flag) \
in(ccache_stride3) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
@ -670,8 +676,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
@ -701,34 +707,38 @@ void PairSWIntel::eval(const int offload, const int vflag,
SIMD_flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3, c4, c5, c6;
SIMD_flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
if (ONETYPE) {
cutsq = SIMD_set(p2[3].cutsq);
cut = SIMD_set(p2f[3].cut);
sigma = SIMD_set(p2f[3].sigma);
c1 = SIMD_set(p2f2[3].c1);
c2 = SIMD_set(p2f2[3].c2);
c3 = SIMD_set(p2f2[3].c3);
c4 = SIMD_set(p2f2[3].c4);
sigma_gamma = SIMD_set(p2[3].sigma_gamma);
costheta = SIMD_set(p3[7].costheta);
lambda_epsilon = SIMD_set(p3[7].lambda_epsilon);
lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2);
cutsq = SIMD_set(p2[_onetype].cutsq);
cut = SIMD_set(p2f[_onetype].cut);
sigma = SIMD_set(p2f[_onetype].sigma);
c1 = SIMD_set(p2f2[_onetype].c1);
c2 = SIMD_set(p2f2[_onetype].c2);
c3 = SIMD_set(p2f2[_onetype].c3);
c4 = SIMD_set(p2f2[_onetype].c4);
sigma_gamma = SIMD_set(p2[_onetype].sigma_gamma);
costheta = SIMD_set(p3[_onetype3].costheta);
lambda_epsilon = SIMD_set(p3[_onetype3].lambda_epsilon);
lambda_epsilon2 = SIMD_set(p3[_onetype3].lambda_epsilon2);
if (SPQ == 0) {
powerp = SIMD_set(p2f[3].powerp);
powerq = SIMD_set(p2f[3].powerq);
powerp = SIMD_set(p2f[_onetype].powerp);
powerq = SIMD_set(p2f[_onetype].powerq);
}
if (EFLAG) {
c5 = SIMD_set(p2e[3].c5);
c6 = SIMD_set(p2e[3].c6);
c5 = SIMD_set(p2e[_onetype].c5);
c6 = SIMD_set(p2e[_onetype].c6);
}
}
const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128,
144,160,176,192,208,224,240);
acc_t * const dforce = &(f[0].x);
for (int i = iifrom; i < iito; i += iip) {
SIMD_int ilistv = SIMD_load(ilist + i);
SIMD_int goffset = ilistv * 16;
SIMD_mask imask = ilistv < iito;
SIMD_mask imask;
if (swidth == 16)
imask = 0xFFFF;
else
imask = 0xFF;
const int rem = iito - i;
if (rem < swidth) imask = imask >> (swidth - rem);
SIMD_flt_t xtmp, ytmp, ztmp;
SIMD_int itype, itype_offset;
@ -741,11 +751,12 @@ void PairSWIntel::eval(const int offload, const int vflag,
#ifdef LMP_INTEL_3BODY_FAST
const int* ng = firstneigh + cnumneigh[i] - swidth;
const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
#else
SIMD_int ng = SIMD_load(cnumneigh + i);
const SIMD_int jnum = SIMD_gatherz(imask, numneigh, ilistv);
ng = ng - 1;
#endif
const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i);
const int jnum_max = SIMD_max(jnum);
@ -1051,11 +1062,9 @@ void PairSWIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -1162,26 +1171,22 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
_onetype = 0;
if (atom->ntypes == 1) _onetype = 1;
_spq = 1;
int mytypes = 0;
for (int ii = 0; ii < tp1; ii++) {
int i = map[ii];
for (int jj = 0; jj < tp1; jj++) {
@ -1231,6 +1236,9 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
fc.p3[ii][jj][kk].lambda_epsilon = 0;
fc.p3[ii][jj][kk].lambda_epsilon2 = 0;
} else {
mytypes++;
_onetype = ii * tp1 + jj;
_onetype3 = ii * tp1 * tp1 + jj * tp1 + kk;
int ijkparam = elem2param[i][j][k];
fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta;
fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon;
@ -1239,12 +1247,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
}
}
}
_host_pad = 1;
_offload_pad = 1;
if (INTEL_NBOR_PAD > 1)
_host_pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
if (mytypes > 1) _onetype = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
@ -1253,16 +1256,11 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
FC_PACKED1p2_T *op2f2 = fc.p2f2[0];
FC_PACKED2_T *op2e = fc.p2e[0];
FC_PACKED3_T *op3 = fc.p3[0][0];
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
int tp1cu = tp1sq * tp1;
if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL &&
op3 != NULL && ocutneighsq != NULL) {
#pragma offload_transfer target(mic:_cop) \
in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \
in(op3: length(tp1cu) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq))
}
#pragma offload_transfer target(mic:_cop) \
in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \
in(op3: length(tp1cu) alloc_if(0) free_if(0))
#endif
}
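
The hard-coded offsets p2[3] and p3[7] removed above only hold for a system with a single atom type; the new _onetype and _onetype3 members record whichever single (i,j) and (i,j,k) parameter slots are actually set, and mytypes > 1 falls back to the generic path. A minimal standalone sketch of that index math (illustration only, not part of the commit), using the flattened tp1*tp1 and tp1*tp1*tp1 layout from pack_force_const:

#include <cstdio>

int main() {
  const int ntypes = 1;           // single-type system
  const int tp1 = ntypes + 1;     // type indices run 1..ntypes, slot 0 unused
  const int i = 1, j = 1, k = 1;  // the only parameterized combination

  // Row-major offsets into the flattened pair and triplet parameter tables.
  const int onetype  = i * tp1 + j;
  const int onetype3 = i * tp1 * tp1 + j * tp1 + k;

  // For ntypes == 1 these reproduce the old hard-coded values 3 and 7.
  std::printf("_onetype = %d  _onetype3 = %d\n", onetype, onetype3);
  return 0;
}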

View File

@ -49,13 +49,13 @@ class PairSWIntel : public PairSW {
template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
const int astart, const int aend, const int pad_width);
const int astart, const int aend);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
int _ccache_stride, _host_pad, _offload_pad, _spq, _onetype;
int _ccache_stride, _spq, _onetype, _onetype3;
#ifdef LMP_USE_AVXCD
int _ccache_stride3;
#endif
@ -75,7 +75,7 @@ class PairSWIntel : public PairSW {
flt_t c1, c2, c3, c4;
} fc_packed1p2;
typedef struct {
flt_t c5, c6;
flt_t c5, c6, d1, d2;
} fc_packed2;
typedef struct {
flt_t costheta, lambda_epsilon, lambda_epsilon2, pad;

View File

@ -220,7 +220,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
static void kernel_step(
int eatom, int vflag,
const int * _noalias const numneigh,
const int * _noalias const cnumneigh,
iarr vcnumneigh,
const int * _noalias const firstneigh,
int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
@ -241,7 +241,8 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
avec *vsevdwl, int compress_idx, int ii, int i, iarr js,
bvec vmask_repulsive
);
};
@ -274,9 +275,12 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int * _noalias const firstneigh = list->firstneigh[ilist[0]];
int *nhalf, *cnum;
buffers->get_list_data3(list, nhalf, cnum);
const int * _noalias const numneighhalf = nhalf;
const int * _noalias const cnumneigh = cnum;
typedef typename ForceConst<flt_t>::c_inner_t c_inner_t;
typedef typename ForceConst<flt_t>::c_outer_t c_outer_t;
@ -332,13 +336,13 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
f_stride, x, 0);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iito, tid;
@ -379,11 +383,9 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
ev_global[0] = oevdwl;
ev_global[1] = 0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -461,19 +463,16 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -547,7 +546,6 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
typename ForceConst<flt_t>::c_inner_loop_t * c_inner_loop = fc.c_inner_loop[0][0];
typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_inner = fc.c_cutoff_inner[0][0];
typename ForceConst<flt_t>::c_inner_t * c_inner = fc.c_inner[0][0];
flt_t * ocutneighsq = cutneighsq[0];
size_t VL = 512 / 8 / sizeof(flt_t);
int ntypes = tp1;
int ntypes_pad = ntypes + VL - ntypes % VL;
@ -557,8 +555,7 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
#pragma offload_transfer target(mic:_cop) \
in(c_first_loop, c_second_loop, c_cutoff_outer, c_outer : length(tp1sq) alloc_if(0) free_if(0)) \
in(c_inner : length(tp1cb) alloc_if(0) free_if(0)) \
in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0))
#endif
}
@ -647,7 +644,7 @@ template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EFLAG>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const numneigh, iarr vcnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
@ -681,6 +678,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
// TODO: We could extract this from the driver routine
ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
ivec vcnumneigh_i = v::int_load_vl(vcnumneigh);
bvec vmask = v::mask_enable_lower(compress_idx);
fvec vx_i = v::zero(), vy_i = v::zero(), vz_i = v::zero();
ivec vw_i = v_i0;
@ -692,7 +690,6 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
fvec vrij = sqrt(vrijsq);
ivec vis_orig = v::int_load_vl(is);
ivec vcnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, cnumneigh);
ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i));
@ -930,6 +927,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl,
int compress_idx,
int ii,
int i,
iarr js,
bvec vmask_repulsive
@ -970,7 +968,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
fvec vrij = sqrt(vrijsq);
int cnumneigh_i = cnumneigh[i];
int cnumneigh_i = cnumneigh[ii];
int numneigh_i = numneigh[i];
ivec vc_idx_j = v::int_mullo(v_i4floats, vw_j);
ivec vc_idx_j_ntypes = v::int_mullo(v_i_ntypes, vc_idx_j);
@ -1147,7 +1145,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
) {
int compress_idx = 0;
int ii, jj;
iarr is, js;
iarr is, js, vc;
avec vsevdwl = v::acc_zero();
ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
ivec vj, v_NEIGHMASK(NEIGHMASK);
@ -1166,7 +1164,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
flt_t y_i = x[i].y;
flt_t z_i = x[i].z;
int jlist_off_i = cnumneigh[ii];
int jnum = numneigh[ii];
int jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
int j = firstneigh[jlist_off_i + jj] & NEIGHMASK;
int w_j = x[j].w;
@ -1178,6 +1176,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
if (rsq < cutsq) {
is[compress_idx] = ii;
js[compress_idx] = j;
vc[compress_idx] = jlist_off_i;
if (jj < numneighhalf[ii])
repulsive_flag[compress_idx] = 1;
compress_idx += 1;
@ -1187,7 +1186,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
kernel_step<EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
numneigh, vc, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, compress_idx,
is, js, vmask_repulsive
@ -1203,7 +1202,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, compress_idx,
i, js, vmask_repulsive
ii, i, js, vmask_repulsive
);
compress_idx = 0;
v::int_clear_arr(repulsive_flag);
@ -1215,7 +1214,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
IntelKernelTersoff::kernel_step<EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
numneigh, vc, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, compress_idx,
is, js, vmask_repulsive
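
The ii/i substitutions in this file are easy to misread: with numneigh now taken from list->numneigh and the packed offsets from get_list_data3, neighbor counts are indexed by the atom id i = ilist[ii], while the offsets stay indexed by the list position ii (hence the extra ii argument to kernel_step_const_i and the per-lane vc offsets handed to kernel_step). A small sketch of that convention, with hypothetical names (NeighborView, visit_neighbors) invented for illustration:

#include <vector>

struct NeighborView {
  const int *ilist;       // list position -> atom id
  const int *numneigh;    // per atom id: neighbor count
  const int *cnumneigh;   // per list position: offset into firstneigh
  const int *firstneigh;  // packed neighbor ids
};

// Collect the neighbors of the atom occupying list slot ii.
inline void visit_neighbors(const NeighborView &nl, int ii,
                            std::vector<int> &out) {
  const int i = nl.ilist[ii];         // atom id for this slot
  const int jnum = nl.numneigh[i];    // count indexed by atom id
  const int joff = nl.cnumneigh[ii];  // offset indexed by list position
  for (int jj = 0; jj < jnum; jj++)
    out.push_back(nl.firstneigh[joff + jj]);
}

int main() {
  const int ilist[] = {2, 0};          // two local atoms, ids 2 and 0
  const int numneigh[] = {1, 0, 2};    // indexed by atom id
  const int cnumneigh[] = {0, 2};      // indexed by list position
  const int firstneigh[] = {0, 1, 2};  // neighbors of id 2, then of id 0
  NeighborView nl{ilist, numneigh, cnumneigh, firstneigh};
  std::vector<int> out;
  visit_neighbors(nl, 0, out);         // neighbors of atom id 2 -> {0, 1}
  return static_cast<int>(out.size());
}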

View File

@ -581,6 +581,12 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
else
nthr = comm->nthreads;
if (fix->need_zero(0)) {
int zl = nlocal;
if (force->newton_pair) zl += atom->nghost;
memset(f, 0, zl * sizeof(FORCE_T));
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(nlocal, nthr) if(!_use_lrt)
@ -726,6 +732,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
if (fix->need_zero(0)) {
int zl = nlocal;
if (force->newton_pair) zl += atom->nghost;
memset(f, 0, zl * sizeof(FORCE_T));
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(nlocal, nthr) if(!_use_lrt)

View File

@ -79,7 +79,9 @@ void *Memory::srealloc(void *ptr, bigint nbytes, const char *name)
#if defined(LMP_USE_TBB_ALLOCATOR)
ptr = scalable_aligned_realloc(ptr, nbytes, LAMMPS_MEMALIGN);
#elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN)
#elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN) && \
defined(__INTEL_COMPILER)
ptr = realloc(ptr, nbytes);
uintptr_t offset = ((uintptr_t)(const void *)(ptr)) % LAMMPS_MEMALIGN;
if (offset) {

View File

@ -207,6 +207,10 @@ class Pair : protected Pointers {
typedef union {int i; float f;} union_int_float_t;
// Accessor for the user-intel package to determine virial calc for hybrid
inline int fdotr_is_set() const { return vflag_fdotr; }
protected:
int vflag_fdotr;
int maxeatom,maxvatom;
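
The accessor added at the end of pair.h gives code outside the Pair class (the USER-INTEL fix and styles, per the comment) a way to check whether a style accumulates its virial through the fdotr path without reaching into the member directly. A hedged sketch of a caller (hypothetical helper, only meaningful when compiled inside the LAMMPS source tree):

#include "pair.h"   // LAMMPS_NS::Pair with the new fdotr_is_set() accessor

// Illustration only: query a (possibly hybrid sub-)style for fdotr virial use.
static bool virial_via_fdotr(const LAMMPS_NS::Pair *style) {
  return style->fdotr_is_set() != 0;
}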