git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12851 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -6,6 +6,8 @@
|
|||||||
W. Michael Brown (Intel)
|
W. Michael Brown (Intel)
|
||||||
michael.w.brown at intel.com
|
michael.w.brown at intel.com
|
||||||
|
|
||||||
|
Anupama Kurpad (Intel)
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
|
|
||||||
This package is based on the USER-OMP package and provides LAMMPS styles that:
|
This package is based on the USER-OMP package and provides LAMMPS styles that:
|
||||||
@ -41,3 +43,10 @@ Intel compilers.
|
|||||||
For portability reasons, vectorization directives are currently only enabled
|
For portability reasons, vectorization directives are currently only enabled
|
||||||
for Intel compilers. Using other compilers may result in significantly
|
for Intel compilers. Using other compilers may result in significantly
|
||||||
lower performance.
|
lower performance.
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
By default, when running with offload to Intel(R) coprocessors, affinity
|
||||||
|
for host MPI tasks and OpenMP threads is set automatically within the code.
|
||||||
|
This currently requires the use of system calls. To disable at build time,
|
||||||
|
compile with -DINTEL_OFFLOAD_NOAFFINITY.
|
||||||
|
|||||||
@ -13,6 +13,7 @@
|
|||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
Contributing author: W. Michael Brown (Intel)
|
Contributing author: W. Michael Brown (Intel)
|
||||||
|
Anupama Kurpad (Intel) - Host Affinitization
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#include "comm.h"
|
#include "comm.h"
|
||||||
@ -59,7 +60,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
_off_overflow_flag[LMP_OVERFLOW] = 0;
|
_off_overflow_flag[LMP_OVERFLOW] = 0;
|
||||||
|
|
||||||
_offload_affinity_balanced = 0;
|
_offload_affinity_balanced = 0;
|
||||||
_offload_threads = 1;
|
_offload_threads = 0;
|
||||||
_offload_tpc = 4;
|
_offload_tpc = 4;
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -71,25 +72,12 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
_off_ev_array_s = 0;
|
_off_ev_array_s = 0;
|
||||||
_off_ev_array_d = 0;
|
_off_ev_array_d = 0;
|
||||||
_balance_fixed = 0.0;
|
_balance_fixed = 0.0;
|
||||||
|
|
||||||
_cop = 0;
|
_cop = 0;
|
||||||
|
|
||||||
int max_offload_threads, offload_cores;
|
|
||||||
#pragma offload target(mic:_cop) mandatory \
|
|
||||||
out(max_offload_threads,offload_cores)
|
|
||||||
{
|
|
||||||
offload_cores = omp_get_num_procs();
|
|
||||||
omp_set_num_threads(offload_cores);
|
|
||||||
max_offload_threads = omp_get_max_threads();
|
|
||||||
}
|
|
||||||
_max_offload_threads = max_offload_threads;
|
|
||||||
_offload_cores = offload_cores;
|
|
||||||
_offload_threads = offload_cores;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// optional keywords
|
// optional keywords
|
||||||
|
|
||||||
int nomp = 0;
|
int nomp = 0, no_affinity = 0;
|
||||||
_allow_separate_buffers = 1;
|
_allow_separate_buffers = 1;
|
||||||
_offload_ghost = -1;
|
_offload_ghost = -1;
|
||||||
|
|
||||||
@ -127,6 +115,9 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
|
if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
|
||||||
_offload_threads = atoi(arg[iarg+1]);
|
_offload_threads = atoi(arg[iarg+1]);
|
||||||
iarg += 2;
|
iarg += 2;
|
||||||
|
} else if (strcmp(arg[iarg],"no_affinity") == 0) {
|
||||||
|
no_affinity = 1;
|
||||||
|
iarg++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// undocumented options
|
// undocumented options
|
||||||
@ -143,10 +134,34 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
|
|
||||||
// error check
|
// error check
|
||||||
|
|
||||||
if (_offload_balance > 1.0 || _offload_threads <= 0 ||
|
if (_offload_balance > 1.0 || _offload_threads < 0 ||
|
||||||
_offload_tpc <= 0 || _offload_tpc > 4)
|
_offload_tpc <= 0 || _offload_tpc > 4 || nomp < 0)
|
||||||
error->all(FLERR,"Illegal package intel command");
|
error->all(FLERR,"Illegal package intel command");
|
||||||
|
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
_ncops = ncops;
|
||||||
|
if (_offload_balance != 0.0) {
|
||||||
|
_real_space_comm = MPI_COMM_WORLD;
|
||||||
|
if (no_affinity == 0)
|
||||||
|
if (set_host_affinity(nomp) != 0)
|
||||||
|
error->all(FLERR,"Could not set host affinity for offload tasks");
|
||||||
|
}
|
||||||
|
|
||||||
|
int max_offload_threads = 0, offload_cores = 0;
|
||||||
|
if (_offload_balance != 0.0) {
|
||||||
|
#pragma offload target(mic:_cop) mandatory \
|
||||||
|
out(max_offload_threads,offload_cores)
|
||||||
|
{
|
||||||
|
offload_cores = omp_get_num_procs();
|
||||||
|
omp_set_num_threads(offload_cores);
|
||||||
|
max_offload_threads = omp_get_max_threads();
|
||||||
|
}
|
||||||
|
_max_offload_threads = max_offload_threads;
|
||||||
|
_offload_cores = offload_cores;
|
||||||
|
if (_offload_threads == 0) _offload_threads = offload_cores;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// set OpenMP threads
|
// set OpenMP threads
|
||||||
// nomp is user setting, default = 0
|
// nomp is user setting, default = 0
|
||||||
|
|
||||||
@ -154,13 +169,17 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
if (nomp != 0) {
|
if (nomp != 0) {
|
||||||
omp_set_num_threads(nomp);
|
omp_set_num_threads(nomp);
|
||||||
comm->nthreads = nomp;
|
comm->nthreads = nomp;
|
||||||
|
} else {
|
||||||
|
int nthreads;
|
||||||
|
#pragma omp parallel default(none) shared(nthreads)
|
||||||
|
nthreads = omp_get_num_threads();
|
||||||
|
comm->nthreads = nthreads;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// set offload params
|
// set offload params
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
_ncops = ncops;
|
|
||||||
if (_offload_balance < 0.0) {
|
if (_offload_balance < 0.0) {
|
||||||
_balance_neighbor = 0.9;
|
_balance_neighbor = 0.9;
|
||||||
_balance_pair = 0.9;
|
_balance_pair = 0.9;
|
||||||
@ -173,6 +192,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
zero_timers();
|
zero_timers();
|
||||||
_setup_time_cleared = false;
|
_setup_time_cleared = false;
|
||||||
_timers_allocated = false;
|
_timers_allocated = false;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
_offload_balance = 0.0;
|
_offload_balance = 0.0;
|
||||||
#endif
|
#endif
|
||||||
@ -197,7 +217,8 @@ FixIntel::~FixIntel()
|
|||||||
double *time1 = off_watch_pair();
|
double *time1 = off_watch_pair();
|
||||||
double *time2 = off_watch_neighbor();
|
double *time2 = off_watch_neighbor();
|
||||||
int *overflow = get_off_overflow_flag();
|
int *overflow = get_off_overflow_flag();
|
||||||
if (time1 != NULL && time2 != NULL && overflow != NULL) {
|
if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
|
||||||
|
overflow != NULL) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
|
nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
@ -239,7 +260,8 @@ void FixIntel::init()
|
|||||||
double *time1 = off_watch_pair();
|
double *time1 = off_watch_pair();
|
||||||
double *time2 = off_watch_neighbor();
|
double *time2 = off_watch_neighbor();
|
||||||
int *overflow = get_off_overflow_flag();
|
int *overflow = get_off_overflow_flag();
|
||||||
if (time1 != NULL && time2 != NULL && overflow != NULL) {
|
if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
|
||||||
|
overflow != NULL) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
|
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
|
||||||
in(overflow:length(5) alloc_if(1) free_if(0))
|
in(overflow:length(5) alloc_if(1) free_if(0))
|
||||||
@ -247,25 +269,6 @@ void FixIntel::init()
|
|||||||
_timers_allocated = true;
|
_timers_allocated = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
char kmode[80];
|
|
||||||
if (_precision_mode == PREC_MODE_SINGLE)
|
|
||||||
strcpy(kmode, "single");
|
|
||||||
else if (_precision_mode == PREC_MODE_MIXED)
|
|
||||||
strcpy(kmode, "mixed");
|
|
||||||
else
|
|
||||||
strcpy(kmode, "double");
|
|
||||||
|
|
||||||
// print summary of settings
|
|
||||||
if (comm->me == 0) {
|
|
||||||
if (screen) {
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (_offload_balance != 0.0) {
|
|
||||||
fprintf(screen,"using offload with %d threads per core, ",_offload_tpc);
|
|
||||||
fprintf(screen,"%d threads per task\n",_offload_threads);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (update->whichflag == 2 && _offload_balance != 0.0) {
|
if (update->whichflag == 2 && _offload_balance != 0.0) {
|
||||||
if (_offload_balance == 1.0 && _offload_noghost == 0)
|
if (_offload_balance == 1.0 && _offload_noghost == 0)
|
||||||
_sync_at_pair = 1;
|
_sync_at_pair = 1;
|
||||||
@ -279,12 +282,6 @@ void FixIntel::init()
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (neighbor->style != BIN)
|
|
||||||
error->all(FLERR,
|
|
||||||
"Currently, neighbor style BIN must be used with Intel package.");
|
|
||||||
if (neighbor->exclude_setting() != 0)
|
|
||||||
error->all(FLERR,
|
|
||||||
"Currently, cannot use neigh_modify exclude with Intel package.");
|
|
||||||
int nstyles = 0;
|
int nstyles = 0;
|
||||||
if (force->pair_match("hybrid", 1) != NULL) {
|
if (force->pair_match("hybrid", 1) != NULL) {
|
||||||
PairHybrid *hybrid = (PairHybrid *) force->pair;
|
PairHybrid *hybrid = (PairHybrid *) force->pair;
|
||||||
@ -315,6 +312,71 @@ void FixIntel::init()
|
|||||||
_double_buffers->zero_ev();
|
_double_buffers->zero_ev();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
void FixIntel::setup(int vflag)
|
||||||
|
{
|
||||||
|
if (neighbor->style != BIN)
|
||||||
|
error->all(FLERR,
|
||||||
|
"Currently, neighbor style BIN must be used with Intel package.");
|
||||||
|
if (neighbor->exclude_setting() != 0)
|
||||||
|
error->all(FLERR,
|
||||||
|
"Currently, cannot use neigh_modify exclude with Intel package.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
void FixIntel::pair_init_check()
|
||||||
|
{
|
||||||
|
if (_offload_balance != 0.0 && comm->me == 0) {
|
||||||
|
#ifndef __INTEL_COMPILER_BUILD_DATE
|
||||||
|
error->warning(FLERR, "Unknown Intel Compiler Version\n");
|
||||||
|
#else
|
||||||
|
if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
|
||||||
|
__INTEL_COMPILER_BUILD_DATE < 20141023)
|
||||||
|
error->warning(FLERR, "Unsupported Intel Compiler.");
|
||||||
|
#endif
|
||||||
|
#if !defined(__INTEL_COMPILER)
|
||||||
|
error->warning(FLERR, "Unsupported Intel Compiler.");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clear buffers used for pair style
|
||||||
|
char kmode[80];
|
||||||
|
if (_precision_mode == PREC_MODE_SINGLE) {
|
||||||
|
strcpy(kmode, "single");
|
||||||
|
get_single_buffers()->free_all_nbor_buffers();
|
||||||
|
} else if (_precision_mode == PREC_MODE_MIXED) {
|
||||||
|
strcpy(kmode, "mixed");
|
||||||
|
get_mixed_buffers()->free_all_nbor_buffers();
|
||||||
|
} else {
|
||||||
|
strcpy(kmode, "double");
|
||||||
|
get_double_buffers()->free_all_nbor_buffers();
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
set_offload_affinity();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (comm->me == 0) {
|
||||||
|
if (screen) {
|
||||||
|
fprintf(screen,
|
||||||
|
"----------------------------------------------------------\n");
|
||||||
|
if (_offload_balance != 0.0) {
|
||||||
|
fprintf(screen,"Using Intel Coprocessor with %d threads per core, ",
|
||||||
|
_offload_tpc);
|
||||||
|
fprintf(screen,"%d threads per task\n",_offload_threads);
|
||||||
|
} else {
|
||||||
|
fprintf(screen,"Using Intel Package without Coprocessor.\n");
|
||||||
|
}
|
||||||
|
fprintf(screen,"Precision: %s\n",kmode);
|
||||||
|
fprintf(screen,
|
||||||
|
"----------------------------------------------------------\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
void FixIntel::check_neighbor_intel()
|
void FixIntel::check_neighbor_intel()
|
||||||
@ -440,6 +502,14 @@ void FixIntel::output_timing_data() {
|
|||||||
fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
|
fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
|
||||||
fprintf(_tscreen, "\n");
|
fprintf(_tscreen, "\n");
|
||||||
#endif
|
#endif
|
||||||
|
double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
|
||||||
|
timers[TIME_OFFLOAD_WAIT];
|
||||||
|
double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
|
||||||
|
timers[TIME_OFFLOAD_PAIR];
|
||||||
|
double tt = MAX(ht,ct);
|
||||||
|
if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
|
||||||
|
error->warning(FLERR,
|
||||||
|
"Leaving a core free can improve performance for offload");
|
||||||
}
|
}
|
||||||
fprintf(_tscreen, "------------------------------------------------\n");
|
fprintf(_tscreen, "------------------------------------------------\n");
|
||||||
}
|
}
|
||||||
@ -549,4 +619,157 @@ void FixIntel::set_offload_affinity()
|
|||||||
_double_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
|
_double_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
int FixIntel::set_host_affinity(const int nomp)
|
||||||
|
{
|
||||||
|
#ifndef INTEL_OFFLOAD_NOAFFINITY
|
||||||
|
_separate_coi = 1;
|
||||||
|
int rank = comm->me;
|
||||||
|
int node_rank;
|
||||||
|
int ppn = get_ppn(node_rank);
|
||||||
|
int cop = node_rank / (ppn / _ncops);
|
||||||
|
|
||||||
|
// Get a sorted list of logical cores
|
||||||
|
int proc_list[INTEL_MAX_HOST_CORE_COUNT];
|
||||||
|
int ncores;
|
||||||
|
FILE *p;
|
||||||
|
char cmd[512];
|
||||||
|
char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
|
||||||
|
sprintf(cmd, "lscpu -p=cpu,core,socket | grep -v '#' |"
|
||||||
|
"sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
|
||||||
|
p = popen(cmd, "r");
|
||||||
|
if (p == NULL) return -1;
|
||||||
|
ncores = 0;
|
||||||
|
while(fgets(readbuf, 512, p)) {
|
||||||
|
proc_list[ncores] = atoi(readbuf);
|
||||||
|
ncores++;
|
||||||
|
}
|
||||||
|
pclose(p);
|
||||||
|
|
||||||
|
// Sanity checks for core list
|
||||||
|
if (ncores < 2) return -1;
|
||||||
|
int nzero = 0;
|
||||||
|
for (int i = 0; i < ncores; i++) {
|
||||||
|
if (proc_list[i] == 0) nzero++;
|
||||||
|
if (proc_list[i] < 0 || proc_list[i] >= ncores) return -1;
|
||||||
|
}
|
||||||
|
if (nzero > 1) return -1;
|
||||||
|
|
||||||
|
// Determine the OpenMP/MPI configuration
|
||||||
|
char *estring;
|
||||||
|
int nthreads = nomp;
|
||||||
|
if (nthreads == 0) {
|
||||||
|
estring = getenv("OMP_NUM_THREADS");
|
||||||
|
if (estring != NULL) {
|
||||||
|
nthreads = atoi(estring);
|
||||||
|
if (nthreads < 2) nthreads = 1;
|
||||||
|
} else
|
||||||
|
nthreads = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine how many logical cores for COI and MPI tasks
|
||||||
|
int coi_cores = 0, mpi_cores;
|
||||||
|
int subscription = nthreads * ppn;
|
||||||
|
if (subscription > ncores) {
|
||||||
|
if (rank == 0)
|
||||||
|
error->warning(FLERR,
|
||||||
|
"More MPI tasks/OpenMP threads than available cores");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (subscription == ncores)
|
||||||
|
_separate_coi = 0;
|
||||||
|
|
||||||
|
if (subscription > ncores / 2) {
|
||||||
|
coi_cores = ncores - subscription;
|
||||||
|
if (coi_cores > INTEL_MAX_COI_CORES) coi_cores = INTEL_MAX_COI_CORES;
|
||||||
|
}
|
||||||
|
mpi_cores = (ncores - coi_cores) / ppn;
|
||||||
|
|
||||||
|
// Get ids of all LWPs that COI spawned and affinitize
|
||||||
|
int lwp = 0, plwp = 0, nlwp = 0, mlwp = 0, fail = 0;
|
||||||
|
cpu_set_t cpuset;
|
||||||
|
pid_t pid = getpid();
|
||||||
|
if (coi_cores) {
|
||||||
|
sprintf(cmd, "ps -Lp %d -o lwp | awk ' (NR > 2) {print}'", pid);
|
||||||
|
p = popen(cmd, "r");
|
||||||
|
if (p == NULL) return -1;
|
||||||
|
|
||||||
|
while(fgets(readbuf, 512, p)) {
|
||||||
|
lwp = atoi(readbuf);
|
||||||
|
int first = coi_cores + node_rank * mpi_cores;
|
||||||
|
CPU_ZERO(&cpuset);
|
||||||
|
for (int i = first; i < first + mpi_cores; i++)
|
||||||
|
CPU_SET(proc_list[i], &cpuset);
|
||||||
|
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
|
||||||
|
fail = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
plwp++;
|
||||||
|
}
|
||||||
|
pclose(p);
|
||||||
|
|
||||||
|
// Do async offload to create COI threads
|
||||||
|
int sig1, sig2;
|
||||||
|
float *buf1;
|
||||||
|
int pragma_size = 1024;
|
||||||
|
buf1 = (float*) malloc(sizeof(float)*pragma_size);
|
||||||
|
|
||||||
|
#pragma offload target (mic:0) \
|
||||||
|
in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \
|
||||||
|
signal(&sig1)
|
||||||
|
{}
|
||||||
|
#pragma offload_wait target(mic:0) wait(&sig1)
|
||||||
|
|
||||||
|
#pragma offload target (mic:0) \
|
||||||
|
out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \
|
||||||
|
signal(&sig2)
|
||||||
|
{}
|
||||||
|
#pragma offload_wait target(mic:0) wait(&sig2)
|
||||||
|
free(buf1);
|
||||||
|
|
||||||
|
p = popen(cmd, "r");
|
||||||
|
if (p == NULL) return -1;
|
||||||
|
|
||||||
|
while(fgets(readbuf, 512, p)) {
|
||||||
|
lwp = atoi(readbuf);
|
||||||
|
nlwp++;
|
||||||
|
if (nlwp <= plwp) continue;
|
||||||
|
|
||||||
|
CPU_ZERO(&cpuset);
|
||||||
|
for(int i=0; i<coi_cores; i++)
|
||||||
|
CPU_SET(proc_list[i], &cpuset);
|
||||||
|
|
||||||
|
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
|
||||||
|
fail = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pclose(p);
|
||||||
|
nlwp -= plwp;
|
||||||
|
|
||||||
|
// Get stats on the number of LWPs per process
|
||||||
|
MPI_Reduce(&nlwp, &mlwp, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (screen && rank == 0) {
|
||||||
|
if (coi_cores)
|
||||||
|
fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n",
|
||||||
|
mlwp, coi_cores);
|
||||||
|
fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores);
|
||||||
|
}
|
||||||
|
if (fail) return -1;
|
||||||
|
|
||||||
|
// Affinitize MPI Ranks
|
||||||
|
CPU_ZERO(&cpuset);
|
||||||
|
int first = coi_cores + node_rank * mpi_cores;
|
||||||
|
for (int i = first; i < first+mpi_cores; i++)
|
||||||
|
CPU_SET(proc_list[i], &cpuset);
|
||||||
|
if (sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset))
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -38,6 +38,8 @@ class FixIntel : public Fix {
|
|||||||
virtual ~FixIntel();
|
virtual ~FixIntel();
|
||||||
virtual int setmask();
|
virtual int setmask();
|
||||||
virtual void init();
|
virtual void init();
|
||||||
|
virtual void setup(int);
|
||||||
|
void pair_init_check();
|
||||||
|
|
||||||
// Get all forces, calculation results from coprocesser
|
// Get all forces, calculation results from coprocesser
|
||||||
void sync_coprocessor();
|
void sync_coprocessor();
|
||||||
@ -135,7 +137,7 @@ class FixIntel : public Fix {
|
|||||||
int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
|
int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
|
||||||
int _host_min_local, _host_min_ghost, _host_nall;
|
int _host_min_local, _host_min_ghost, _host_nall;
|
||||||
int _host_used_local, _host_used_ghost;
|
int _host_used_local, _host_used_ghost;
|
||||||
int _separate_buffers, _offload_noghost, _sync_at_pair;
|
int _separate_buffers, _offload_noghost, _sync_at_pair, _separate_coi;
|
||||||
bool _setup_time_cleared, _timers_allocated;
|
bool _setup_time_cleared, _timers_allocated;
|
||||||
void output_timing_data();
|
void output_timing_data();
|
||||||
FILE *_tscreen;
|
FILE *_tscreen;
|
||||||
@ -149,6 +151,7 @@ class FixIntel : public Fix {
|
|||||||
int _full_host_list, _cop, _ncops;
|
int _full_host_list, _cop, _ncops;
|
||||||
|
|
||||||
int get_ppn(int &);
|
int get_ppn(int &);
|
||||||
|
int set_host_affinity(const int);
|
||||||
#endif
|
#endif
|
||||||
void check_neighbor_intel();
|
void check_neighbor_intel();
|
||||||
|
|
||||||
@ -540,6 +543,14 @@ E: The 'package intel' command is required for /intel styles
|
|||||||
|
|
||||||
Self-explanatory.
|
Self-explanatory.
|
||||||
|
|
||||||
|
W: Could not set host affinity for offload tasks
|
||||||
|
|
||||||
|
When using offload to a coprocessor, the application will try to set affinity
|
||||||
|
for host MPI tasks and OpenMP threads and will generate a warning if unable
|
||||||
|
to do so successfully. In the unsuccessful case, you might wish to set
|
||||||
|
affinity outside of the application and performance might suffer if
|
||||||
|
hyperthreading is disable on the CPU.
|
||||||
|
|
||||||
E: Neighbor list overflow, boost neigh_modify one
|
E: Neighbor list overflow, boost neigh_modify one
|
||||||
|
|
||||||
Increase the value for neigh_modify one to allow for larger allocations for
|
Increase the value for neigh_modify one to allow for larger allocations for
|
||||||
@ -578,6 +589,17 @@ E: Currently, cannot use neigh_modify exclude with Intel package.
|
|||||||
|
|
||||||
This is a current restriction of the Intel package.
|
This is a current restriction of the Intel package.
|
||||||
|
|
||||||
|
W: Unknown Intel Compiler Version
|
||||||
|
|
||||||
|
The compiler version used to build LAMMPS has not been tested with
|
||||||
|
offload to a coprocessor.
|
||||||
|
|
||||||
|
W: Unsupported Intel Compiler
|
||||||
|
|
||||||
|
The compiler version used to build LAMMPS is not supported when using
|
||||||
|
offload to a coprocessor. There could be performance or correctness
|
||||||
|
issues. Please use 14.0.1.106 or 15.1.133 or later.
|
||||||
|
|
||||||
E: Currently, cannot use more than one intel style with hybrid.
|
E: Currently, cannot use more than one intel style with hybrid.
|
||||||
|
|
||||||
Currently, hybrid pair styles can only use the intel suffix for one of the
|
Currently, hybrid pair styles can only use the intel suffix for one of the
|
||||||
@ -589,9 +611,21 @@ The hybrid pair style configuration is not yet supported by the Intel
|
|||||||
package. Support is limited to hybrid/overlay or a hybrid style that does
|
package. Support is limited to hybrid/overlay or a hybrid style that does
|
||||||
not require a skip list.
|
not require a skip list.
|
||||||
|
|
||||||
|
W: Leaving a core/node free can improve performance for offload
|
||||||
|
|
||||||
|
When each CPU is fully subscribed with MPI tasks and OpenMP threads,
|
||||||
|
context switching with threads used for offload can sometimes decrease
|
||||||
|
performance. If you see this warning, try using fewer MPI tasks/OpenMP threads
|
||||||
|
per node to leave a physical CPU core free on each node.
|
||||||
|
|
||||||
E: MPI tasks per node must be multiple of offload_cards
|
E: MPI tasks per node must be multiple of offload_cards
|
||||||
|
|
||||||
For offload to multiple coprocessors on a single node, the Intel package
|
For offload to multiple coprocessors on a single node, the Intel package
|
||||||
requires that each coprocessor is used by the same number of MPI tasks.
|
requires that each coprocessor is used by the same number of MPI tasks.
|
||||||
|
|
||||||
|
W: More MPI tasks/OpenMP threads than available cores
|
||||||
|
|
||||||
|
Using more MPI tasks/OpenMP threads than available cores will typically
|
||||||
|
decrease performance.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -385,7 +385,7 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
|
|||||||
if (_ntypes > 0) {
|
if (_ntypes > 0) {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
flt_t * cutneighsqo = _cutneighsq[0];
|
flt_t * cutneighsqo = _cutneighsq[0];
|
||||||
if (cutneighsqo != 0) {
|
if (_off_threads > 0 && cutneighsqo != 0) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(cutneighsqo:alloc_if(0) free_if(1))
|
nocopy(cutneighsqo:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
@ -396,7 +396,7 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
|
|||||||
lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
|
lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
flt_t * cutneighsqo = _cutneighsq[0];
|
flt_t * cutneighsqo = _cutneighsq[0];
|
||||||
if (cutneighsqo != NULL) {
|
if (_off_threads > 0 && cutneighsqo != NULL) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
|
nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
|
||||||
}
|
}
|
||||||
|
|||||||
@ -61,6 +61,8 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
|||||||
#define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
|
#define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
|
||||||
#define INTEL_LB_MEAN_WEIGHT 0.1
|
#define INTEL_LB_MEAN_WEIGHT 0.1
|
||||||
#define INTEL_BIGP 1e15
|
#define INTEL_BIGP 1e15
|
||||||
|
#define INTEL_MAX_HOST_CORE_COUNT 512
|
||||||
|
#define INTEL_MAX_COI_CORES 2
|
||||||
|
|
||||||
#define IP_PRE_get_stride(stride, n, datasize, torque) \
|
#define IP_PRE_get_stride(stride, n, datasize, torque) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@ -893,20 +893,17 @@ void PairGayBerneIntel::init_style()
|
|||||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
fix->set_offload_affinity();
|
|
||||||
if (force->newton_pair) fix->set_offload_noghost(1);
|
if (force->newton_pair) fix->set_offload_noghost(1);
|
||||||
_cop = fix->coprocessor_number();
|
_cop = fix->coprocessor_number();
|
||||||
#endif
|
#endif
|
||||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
||||||
fix->get_mixed_buffers()->free_all_nbor_buffers();
|
fix->pair_init_check();
|
||||||
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||||
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||||
fix->get_double_buffers()->free_all_nbor_buffers();
|
|
||||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||||
} else {
|
else
|
||||||
fix->get_single_buffers()->free_all_nbor_buffers();
|
|
||||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|||||||
@ -489,19 +489,16 @@ void PairLJCharmmCoulLongIntel::init_style()
|
|||||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
fix->set_offload_affinity();
|
|
||||||
_cop = fix->coprocessor_number();
|
_cop = fix->coprocessor_number();
|
||||||
#endif
|
#endif
|
||||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
||||||
fix->get_mixed_buffers()->free_all_nbor_buffers();
|
fix->pair_init_check();
|
||||||
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||||
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||||
fix->get_double_buffers()->free_all_nbor_buffers();
|
|
||||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||||
} else {
|
else
|
||||||
fix->get_single_buffers()->free_all_nbor_buffers();
|
|
||||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
|
|||||||
@ -456,19 +456,16 @@ void PairLJCutCoulLongIntel::init_style()
|
|||||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
fix->set_offload_affinity();
|
|
||||||
_cop = fix->coprocessor_number();
|
_cop = fix->coprocessor_number();
|
||||||
#endif
|
#endif
|
||||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
||||||
fix->get_mixed_buffers()->free_all_nbor_buffers();
|
fix->pair_init_check();
|
||||||
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||||
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||||
fix->get_double_buffers()->free_all_nbor_buffers();
|
|
||||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||||
} else {
|
else
|
||||||
fix->get_single_buffers()->free_all_nbor_buffers();
|
|
||||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
|
|||||||
@ -336,16 +336,14 @@ void PairLJCutIntel::init_style()
|
|||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"Offload for lj/cut/intel is not yet available. Set balance to 0.");
|
"Offload for lj/cut/intel is not yet available. Set balance to 0.");
|
||||||
#endif
|
#endif
|
||||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
||||||
fix->get_mixed_buffers()->free_all_nbor_buffers();
|
fix->pair_init_check();
|
||||||
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||||
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||||
fix->get_double_buffers()->free_all_nbor_buffers();
|
|
||||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||||
} else {
|
else
|
||||||
fix->get_single_buffers()->free_all_nbor_buffers();
|
|
||||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|||||||
Reference in New Issue
Block a user