git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12851 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2014-12-22 22:12:30 +00:00
parent 6429e05b78
commit baccbaeda8
9 changed files with 336 additions and 79 deletions

View File

@ -6,6 +6,8 @@
W. Michael Brown (Intel)
michael.w.brown at intel.com
Anupama Kurpad (Intel)
-----------------------------------------------------------------------------
This package is based on the USER-OMP package and provides LAMMPS styles that:
@ -41,3 +43,10 @@ Intel compilers.
For portability reasons, vectorization directives are currently only enabled
for Intel compilers. Using other compilers may result in significantly
lower performance.
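As an illustration only (not copied from the package sources), the guards referred to
above look roughly like the following; __INTEL_COMPILER is the macro defined by the
Intel compilers, #pragma simd is an Intel-specific vectorization hint, and the loop
variables are placeholders:

  #if defined(__INTEL_COMPILER)
  #pragma simd
  #endif
  for (int i = 0; i < nlocal; i++)   // placeholder loop; directive ignored elsewhere
    f[i] += dtf * v[i];

With a non-Intel compiler the directive is simply not emitted, so the loop still
compiles but may not vectorize as aggressively.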
-----------------------------------------------------------------------------
By default, when running with offload to Intel(R) coprocessors, affinity
for host MPI tasks and OpenMP threads is set automatically within the code.
This currently requires the use of system calls. To disable at build time,
compile with -DINTEL_OFFLOAD_NOAFFINITY.
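A minimal sketch of what that build flag gates, in simplified form. This is not the
package's actual routine (FixIntel::set_host_affinity also affinitizes the offload
runtime's threads); pin_to_core and core_id are illustrative names:

  #ifndef INTEL_OFFLOAD_NOAFFINITY
  #define _GNU_SOURCE
  #include <sched.h>
  // Pin the calling MPI task to a single logical core chosen for it.
  static int pin_to_core(int core_id) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(core_id, &mask);
    // 0 = calling process; returns -1 if the kernel rejects the request,
    // in which case pinning is left to the MPI runtime.
    return sched_setaffinity(0, sizeof(cpu_set_t), &mask);
  }
  #endif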

View File

@ -13,6 +13,7 @@
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Anupama Kurpad (Intel) - Host Affinitization
------------------------------------------------------------------------- */
#include "comm.h"
@ -59,7 +60,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
_off_overflow_flag[LMP_OVERFLOW] = 0;
_offload_affinity_balanced = 0;
_offload_threads = 1;
_offload_threads = 0;
_offload_tpc = 4;
#ifdef _LMP_INTEL_OFFLOAD
@ -71,25 +72,12 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
_off_ev_array_s = 0;
_off_ev_array_d = 0;
_balance_fixed = 0.0;
_cop = 0;
int max_offload_threads, offload_cores;
#pragma offload target(mic:_cop) mandatory \
out(max_offload_threads,offload_cores)
{
offload_cores = omp_get_num_procs();
omp_set_num_threads(offload_cores);
max_offload_threads = omp_get_max_threads();
}
_max_offload_threads = max_offload_threads;
_offload_cores = offload_cores;
_offload_threads = offload_cores;
#endif
// optional keywords
int nomp = 0;
int nomp = 0, no_affinity = 0;
_allow_separate_buffers = 1;
_offload_ghost = -1;
@ -127,6 +115,9 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
_offload_threads = atoi(arg[iarg+1]);
iarg += 2;
} else if (strcmp(arg[iarg],"no_affinity") == 0) {
no_affinity = 1;
iarg++;
}
// undocumented options
@ -143,10 +134,34 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
// error check
if (_offload_balance > 1.0 || _offload_threads <= 0 ||
_offload_tpc <= 0 || _offload_tpc > 4)
if (_offload_balance > 1.0 || _offload_threads < 0 ||
_offload_tpc <= 0 || _offload_tpc > 4 || nomp < 0)
error->all(FLERR,"Illegal package intel command");
#ifdef _LMP_INTEL_OFFLOAD
_ncops = ncops;
if (_offload_balance != 0.0) {
_real_space_comm = MPI_COMM_WORLD;
if (no_affinity == 0)
if (set_host_affinity(nomp) != 0)
error->all(FLERR,"Could not set host affinity for offload tasks");
}
int max_offload_threads = 0, offload_cores = 0;
if (_offload_balance != 0.0) {
#pragma offload target(mic:_cop) mandatory \
out(max_offload_threads,offload_cores)
{
offload_cores = omp_get_num_procs();
omp_set_num_threads(offload_cores);
max_offload_threads = omp_get_max_threads();
}
_max_offload_threads = max_offload_threads;
_offload_cores = offload_cores;
if (_offload_threads == 0) _offload_threads = offload_cores;
}
#endif
// set OpenMP threads
// nomp is user setting, default = 0
@ -154,13 +169,17 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
if (nomp != 0) {
omp_set_num_threads(nomp);
comm->nthreads = nomp;
} else {
int nthreads;
#pragma omp parallel default(none) shared(nthreads)
nthreads = omp_get_num_threads();
comm->nthreads = nthreads;
}
#endif
// set offload params
#ifdef _LMP_INTEL_OFFLOAD
_ncops = ncops;
if (_offload_balance < 0.0) {
_balance_neighbor = 0.9;
_balance_pair = 0.9;
@ -173,6 +192,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
zero_timers();
_setup_time_cleared = false;
_timers_allocated = false;
#else
_offload_balance = 0.0;
#endif
@ -197,7 +217,8 @@ FixIntel::~FixIntel()
double *time1 = off_watch_pair();
double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag();
if (time1 != NULL && time2 != NULL && overflow != NULL) {
if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) {
#pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
}
@ -239,7 +260,8 @@ void FixIntel::init()
double *time1 = off_watch_pair();
double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag();
if (time1 != NULL && time2 != NULL && overflow != NULL) {
if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) {
#pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
in(overflow:length(5) alloc_if(1) free_if(0))
@ -247,25 +269,6 @@ void FixIntel::init()
_timers_allocated = true;
}
char kmode[80];
if (_precision_mode == PREC_MODE_SINGLE)
strcpy(kmode, "single");
else if (_precision_mode == PREC_MODE_MIXED)
strcpy(kmode, "mixed");
else
strcpy(kmode, "double");
// print summary of settings
if (comm->me == 0) {
if (screen) {
#ifdef _LMP_INTEL_OFFLOAD
if (_offload_balance != 0.0) {
fprintf(screen,"using offload with %d threads per core, ",_offload_tpc);
fprintf(screen,"%d threads per task\n",_offload_threads);
}
#endif
}
}
if (update->whichflag == 2 && _offload_balance != 0.0) {
if (_offload_balance == 1.0 && _offload_noghost == 0)
_sync_at_pair = 1;
@ -279,12 +282,6 @@ void FixIntel::init()
}
#endif
if (neighbor->style != BIN)
error->all(FLERR,
"Currently, neighbor style BIN must be used with Intel package.");
if (neighbor->exclude_setting() != 0)
error->all(FLERR,
"Currently, cannot use neigh_modify exclude with Intel package.");
int nstyles = 0;
if (force->pair_match("hybrid", 1) != NULL) {
PairHybrid *hybrid = (PairHybrid *) force->pair;
@ -315,6 +312,71 @@ void FixIntel::init()
_double_buffers->zero_ev();
}
/* ---------------------------------------------------------------------- */
void FixIntel::setup(int vflag)
{
if (neighbor->style != BIN)
error->all(FLERR,
"Currently, neighbor style BIN must be used with Intel package.");
if (neighbor->exclude_setting() != 0)
error->all(FLERR,
"Currently, cannot use neigh_modify exclude with Intel package.");
}
/* ---------------------------------------------------------------------- */
void FixIntel::pair_init_check()
{
if (_offload_balance != 0.0 && comm->me == 0) {
#ifndef __INTEL_COMPILER_BUILD_DATE
error->warning(FLERR, "Unknown Intel Compiler Version\n");
#else
if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
__INTEL_COMPILER_BUILD_DATE < 20141023)
error->warning(FLERR, "Unsupported Intel Compiler.");
#endif
#if !defined(__INTEL_COMPILER)
error->warning(FLERR, "Unsupported Intel Compiler.");
#endif
}
// Clear buffers used for pair style
char kmode[80];
if (_precision_mode == PREC_MODE_SINGLE) {
strcpy(kmode, "single");
get_single_buffers()->free_all_nbor_buffers();
} else if (_precision_mode == PREC_MODE_MIXED) {
strcpy(kmode, "mixed");
get_mixed_buffers()->free_all_nbor_buffers();
} else {
strcpy(kmode, "double");
get_double_buffers()->free_all_nbor_buffers();
}
#ifdef _LMP_INTEL_OFFLOAD
set_offload_affinity();
#endif
if (comm->me == 0) {
if (screen) {
fprintf(screen,
"----------------------------------------------------------\n");
if (_offload_balance != 0.0) {
fprintf(screen,"Using Intel Coprocessor with %d threads per core, ",
_offload_tpc);
fprintf(screen,"%d threads per task\n",_offload_threads);
} else {
fprintf(screen,"Using Intel Package without Coprocessor.\n");
}
fprintf(screen,"Precision: %s\n",kmode);
fprintf(screen,
"----------------------------------------------------------\n");
}
}
}
/* ---------------------------------------------------------------------- */
void FixIntel::check_neighbor_intel()
@ -440,6 +502,14 @@ void FixIntel::output_timing_data() {
fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
fprintf(_tscreen, "\n");
#endif
double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
timers[TIME_OFFLOAD_WAIT];
double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
timers[TIME_OFFLOAD_PAIR];
double tt = MAX(ht,ct);
if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
error->warning(FLERR,
"Leaving a core free can improve performance for offload");
}
fprintf(_tscreen, "------------------------------------------------\n");
}
@ -549,4 +619,157 @@ void FixIntel::set_offload_affinity()
_double_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
}
/* ---------------------------------------------------------------------- */
int FixIntel::set_host_affinity(const int nomp)
{
#ifndef INTEL_OFFLOAD_NOAFFINITY
_separate_coi = 1;
int rank = comm->me;
int node_rank;
int ppn = get_ppn(node_rank);
int cop = node_rank / (ppn / _ncops);
// Get a sorted list of logical cores
int proc_list[INTEL_MAX_HOST_CORE_COUNT];
int ncores;
FILE *p;
char cmd[512];
char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
sprintf(cmd, "lscpu -p=cpu,core,socket | grep -v '#' |"
"sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
p = popen(cmd, "r");
if (p == NULL) return -1;
ncores = 0;
while(fgets(readbuf, 512, p)) {
proc_list[ncores] = atoi(readbuf);
ncores++;
}
pclose(p);
// Sanity checks for core list
if (ncores < 2) return -1;
int nzero = 0;
for (int i = 0; i < ncores; i++) {
if (proc_list[i] == 0) nzero++;
if (proc_list[i] < 0 || proc_list[i] >= ncores) return -1;
}
if (nzero > 1) return -1;
// Determine the OpenMP/MPI configuration
char *estring;
int nthreads = nomp;
if (nthreads == 0) {
estring = getenv("OMP_NUM_THREADS");
if (estring != NULL) {
nthreads = atoi(estring);
if (nthreads < 2) nthreads = 1;
} else
nthreads = 1;
}
// Determine how many logical cores for COI and MPI tasks
int coi_cores = 0, mpi_cores;
int subscription = nthreads * ppn;
if (subscription > ncores) {
if (rank == 0)
error->warning(FLERR,
"More MPI tasks/OpenMP threads than available cores");
return 0;
}
if (subscription == ncores)
_separate_coi = 0;
if (subscription > ncores / 2) {
coi_cores = ncores - subscription;
if (coi_cores > INTEL_MAX_COI_CORES) coi_cores = INTEL_MAX_COI_CORES;
}
mpi_cores = (ncores - coi_cores) / ppn;
// Get ids of all LWPs that COI spawned and affinitize
int lwp = 0, plwp = 0, nlwp = 0, mlwp = 0, fail = 0;
cpu_set_t cpuset;
pid_t pid = getpid();
if (coi_cores) {
sprintf(cmd, "ps -Lp %d -o lwp | awk ' (NR > 2) {print}'", pid);
p = popen(cmd, "r");
if (p == NULL) return -1;
while(fgets(readbuf, 512, p)) {
lwp = atoi(readbuf);
int first = coi_cores + node_rank * mpi_cores;
CPU_ZERO(&cpuset);
for (int i = first; i < first + mpi_cores; i++)
CPU_SET(proc_list[i], &cpuset);
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
fail = 1;
break;
}
plwp++;
}
pclose(p);
// Do async offload to create COI threads
int sig1, sig2;
float *buf1;
int pragma_size = 1024;
buf1 = (float*) malloc(sizeof(float)*pragma_size);
#pragma offload target (mic:0) \
in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \
signal(&sig1)
{}
#pragma offload_wait target(mic:0) wait(&sig1)
#pragma offload target (mic:0) \
out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \
signal(&sig2)
{}
#pragma offload_wait target(mic:0) wait(&sig2)
free(buf1);
p = popen(cmd, "r");
if (p == NULL) return -1;
while(fgets(readbuf, 512, p)) {
lwp = atoi(readbuf);
nlwp++;
if (nlwp <= plwp) continue;
CPU_ZERO(&cpuset);
for(int i=0; i<coi_cores; i++)
CPU_SET(proc_list[i], &cpuset);
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
fail = 1;
break;
}
}
pclose(p);
nlwp -= plwp;
// Get stats on the number of LWPs per process
MPI_Reduce(&nlwp, &mlwp, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
}
if (screen && rank == 0) {
if (coi_cores)
fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n",
mlwp, coi_cores);
fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores);
}
if (fail) return -1;
// Affinitize MPI Ranks
CPU_ZERO(&cpuset);
int first = coi_cores + node_rank * mpi_cores;
for (int i = first; i < first+mpi_cores; i++)
CPU_SET(proc_list[i], &cpuset);
if (sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset))
return -1;
#endif
return 0;
}
#endif

View File

@ -38,6 +38,8 @@ class FixIntel : public Fix {
virtual ~FixIntel();
virtual int setmask();
virtual void init();
virtual void setup(int);
void pair_init_check();
// Get all forces, calculation results from coprocessor
void sync_coprocessor();
@ -135,7 +137,7 @@ class FixIntel : public Fix {
int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
int _host_min_local, _host_min_ghost, _host_nall;
int _host_used_local, _host_used_ghost;
int _separate_buffers, _offload_noghost, _sync_at_pair;
int _separate_buffers, _offload_noghost, _sync_at_pair, _separate_coi;
bool _setup_time_cleared, _timers_allocated;
void output_timing_data();
FILE *_tscreen;
@ -149,6 +151,7 @@ class FixIntel : public Fix {
int _full_host_list, _cop, _ncops;
int get_ppn(int &);
int set_host_affinity(const int);
#endif
void check_neighbor_intel();
@ -540,6 +543,14 @@ E: The 'package intel' command is required for /intel styles
Self-explanatory.
W: Could not set host affinity for offload tasks
When using offload to a coprocessor, the application will try to set affinity
for host MPI tasks and OpenMP threads, and it generates this warning if it is
unable to do so. In that case, you might wish to set affinity outside of the
application; note that performance might suffer if hyperthreading is disabled
on the CPU.
E: Neighbor list overflow, boost neigh_modify one
Increase the value for neigh_modify one to allow for larger allocations for
@ -578,6 +589,17 @@ E: Currently, cannot use neigh_modify exclude with Intel package.
This is a current restriction of the Intel package.
W: Unknown Intel Compiler Version
The compiler version used to build LAMMPS has not been tested with
offload to a coprocessor.
W: Unsupported Intel Compiler
The compiler version used to build LAMMPS is not supported when using
offload to a coprocessor. There could be performance or correctness
issues. Please use 14.0.1.106 or 15.1.133 or later.
E: Currently, cannot use more than one intel style with hybrid.
Currently, hybrid pair styles can only use the intel suffix for one of the
@ -589,9 +611,21 @@ The hybrid pair style configuration is not yet supported by the Intel
package. Support is limited to hybrid/overlay or a hybrid style that does
not require a skip list.
W: Leaving a core/node free can improve performance for offload
When each CPU is fully subscribed with MPI tasks and OpenMP threads,
context switching with threads used for offload can sometimes decrease
performance. If you see this warning, try using fewer MPI tasks/OpenMP threads
per node to leave a physical CPU core free on each node.
E: MPI tasks per node must be multiple of offload_cards
For offload to multiple coprocessors on a single node, the Intel package
requires that each coprocessor is used by the same number of MPI tasks.
W: More MPI tasks/OpenMP threads than available cores
Using more MPI tasks/OpenMP threads than available cores will typically
decrease performance.
*/

View File

@ -385,7 +385,7 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD
flt_t * cutneighsqo = _cutneighsq[0];
if (cutneighsqo != 0) {
if (_off_threads > 0 && cutneighsqo != 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(cutneighsqo:alloc_if(0) free_if(1))
}
@ -396,7 +396,7 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
#ifdef _LMP_INTEL_OFFLOAD
flt_t * cutneighsqo = _cutneighsq[0];
if (cutneighsqo != NULL) {
if (_off_threads > 0 && cutneighsqo != NULL) {
#pragma offload_transfer target(mic:_cop) \
nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
}

View File

@ -61,6 +61,8 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
#define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
#define INTEL_LB_MEAN_WEIGHT 0.1
#define INTEL_BIGP 1e15
#define INTEL_MAX_HOST_CORE_COUNT 512
#define INTEL_MAX_COI_CORES 2
#define IP_PRE_get_stride(stride, n, datasize, torque) \
{ \

View File

@ -893,20 +893,17 @@ void PairGayBerneIntel::init_style()
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
fix->set_offload_affinity();
if (force->newton_pair) fix->set_offload_noghost(1);
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fix->get_mixed_buffers()->free_all_nbor_buffers();
fix->pair_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->free_all_nbor_buffers();
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
fix->get_single_buffers()->free_all_nbor_buffers();
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
}
/* ---------------------------------------------------------------------- */

View File

@ -489,19 +489,16 @@ void PairLJCharmmCoulLongIntel::init_style()
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
fix->set_offload_affinity();
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fix->get_mixed_buffers()->free_all_nbor_buffers();
fix->pair_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->free_all_nbor_buffers();
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
fix->get_single_buffers()->free_all_nbor_buffers();
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
}
template <class flt_t, class acc_t>

View File

@ -456,19 +456,16 @@ void PairLJCutCoulLongIntel::init_style()
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
fix->set_offload_affinity();
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fix->get_mixed_buffers()->free_all_nbor_buffers();
fix->pair_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->free_all_nbor_buffers();
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
fix->get_single_buffers()->free_all_nbor_buffers();
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
}
template <class flt_t, class acc_t>

View File

@ -336,16 +336,14 @@ void PairLJCutIntel::init_style()
error->all(FLERR,
"Offload for lj/cut/intel is not yet available. Set balance to 0.");
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
fix->get_mixed_buffers()->free_all_nbor_buffers();
fix->pair_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->free_all_nbor_buffers();
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
fix->get_single_buffers()->free_all_nbor_buffers();
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
}
/* ---------------------------------------------------------------------- */