From baccbaeda8fc538810fbb06c10d06f997e15ddd7 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 22 Dec 2014 22:12:30 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12851 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/USER-INTEL/README | 9 + src/USER-INTEL/fix_intel.cpp | 313 +++++++++++++++--- src/USER-INTEL/fix_intel.h | 36 +- src/USER-INTEL/intel_buffers.cpp | 4 +- src/USER-INTEL/intel_preprocess.h | 2 + src/USER-INTEL/pair_gayberne_intel.cpp | 13 +- .../pair_lj_charmm_coul_long_intel.cpp | 13 +- .../pair_lj_cut_coul_long_intel.cpp | 13 +- src/USER-INTEL/pair_lj_cut_intel.cpp | 12 +- 9 files changed, 336 insertions(+), 79 deletions(-) diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index 930cacdd38..b9d391fc30 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -6,6 +6,8 @@ W. Michael Brown (Intel) michael.w.brown at intel.com + Anupama Kurpad (Intel) + ----------------------------------------------------------------------------- This package is based on the USER-OMP package and provides LAMMPS styles that: @@ -41,3 +43,10 @@ Intel compilers. For portability reasons, vectorization directives are currently only enabled for Intel compilers. Using other compilers may result in significantly lower performance. + +----------------------------------------------------------------------------- + +By default, when running with offload to Intel(R) coprocessors, affinity +for host MPI tasks and OpenMP threads is set automatically within the code. +This currently requires the use of system calls. To disable at build time, +compile with -DINTEL_OFFLOAD_NOAFFINITY. diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 39d7fe4c63..9b628be7b0 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -13,6 +13,7 @@ /* ---------------------------------------------------------------------- Contributing author: W. Michael Brown (Intel) + Anupama Kurpad (Intel) - Host Affinitization ------------------------------------------------------------------------- */ #include "comm.h" @@ -59,7 +60,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _off_overflow_flag[LMP_OVERFLOW] = 0; _offload_affinity_balanced = 0; - _offload_threads = 1; + _offload_threads = 0; _offload_tpc = 4; #ifdef _LMP_INTEL_OFFLOAD @@ -71,25 +72,12 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _off_ev_array_s = 0; _off_ev_array_d = 0; _balance_fixed = 0.0; - _cop = 0; - - int max_offload_threads, offload_cores; - #pragma offload target(mic:_cop) mandatory \ - out(max_offload_threads,offload_cores) - { - offload_cores = omp_get_num_procs(); - omp_set_num_threads(offload_cores); - max_offload_threads = omp_get_max_threads(); - } - _max_offload_threads = max_offload_threads; - _offload_cores = offload_cores; - _offload_threads = offload_cores; #endif // optional keywords - int nomp = 0; + int nomp = 0, no_affinity = 0; _allow_separate_buffers = 1; _offload_ghost = -1; @@ -127,6 +115,9 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); _offload_threads = atoi(arg[iarg+1]); iarg += 2; + } else if (strcmp(arg[iarg],"no_affinity") == 0) { + no_affinity = 1; + iarg++; } // undocumented options @@ -143,10 +134,34 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) // error check - if (_offload_balance > 1.0 || _offload_threads <= 0 || - _offload_tpc <= 0 || _offload_tpc > 4) + if (_offload_balance > 1.0 || _offload_threads < 0 || + _offload_tpc <= 0 || _offload_tpc > 4 || nomp < 0) error->all(FLERR,"Illegal package intel command"); + #ifdef _LMP_INTEL_OFFLOAD + _ncops = ncops; + if (_offload_balance != 0.0) { + _real_space_comm = MPI_COMM_WORLD; + if (no_affinity == 0) + if (set_host_affinity(nomp) != 0) + error->all(FLERR,"Could not set host affinity for offload tasks"); + } + + int max_offload_threads = 0, offload_cores = 0; + if (_offload_balance != 0.0) { + #pragma offload target(mic:_cop) mandatory \ + out(max_offload_threads,offload_cores) + { + offload_cores = omp_get_num_procs(); + omp_set_num_threads(offload_cores); + max_offload_threads = omp_get_max_threads(); + } + _max_offload_threads = max_offload_threads; + _offload_cores = offload_cores; + if (_offload_threads == 0) _offload_threads = offload_cores; + } + #endif + // set OpenMP threads // nomp is user setting, default = 0 @@ -154,13 +169,17 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) if (nomp != 0) { omp_set_num_threads(nomp); comm->nthreads = nomp; + } else { + int nthreads; + #pragma omp parallel default(none) shared(nthreads) + nthreads = omp_get_num_threads(); + comm->nthreads = nthreads; } #endif // set offload params #ifdef _LMP_INTEL_OFFLOAD - _ncops = ncops; if (_offload_balance < 0.0) { _balance_neighbor = 0.9; _balance_pair = 0.9; @@ -173,6 +192,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) zero_timers(); _setup_time_cleared = false; _timers_allocated = false; + #else _offload_balance = 0.0; #endif @@ -197,7 +217,8 @@ FixIntel::~FixIntel() double *time1 = off_watch_pair(); double *time2 = off_watch_neighbor(); int *overflow = get_off_overflow_flag(); - if (time1 != NULL && time2 != NULL && overflow != NULL) { + if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL && + overflow != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(time1,time2,overflow:alloc_if(0) free_if(1)) } @@ -239,7 +260,8 @@ void FixIntel::init() double *time1 = off_watch_pair(); double *time2 = off_watch_neighbor(); int *overflow = get_off_overflow_flag(); - if (time1 != NULL && time2 != NULL && overflow != NULL) { + if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL && + overflow != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \ in(overflow:length(5) alloc_if(1) free_if(0)) @@ -247,25 +269,6 @@ void FixIntel::init() _timers_allocated = true; } - char kmode[80]; - if (_precision_mode == PREC_MODE_SINGLE) - strcpy(kmode, "single"); - else if (_precision_mode == PREC_MODE_MIXED) - strcpy(kmode, "mixed"); - else - strcpy(kmode, "double"); - - // print summary of settings - if (comm->me == 0) { - if (screen) { - #ifdef _LMP_INTEL_OFFLOAD - if (_offload_balance != 0.0) { - fprintf(screen,"using offload with %d threads per core, ",_offload_tpc); - fprintf(screen,"%d threads per task\n",_offload_threads); - } - #endif - } - } if (update->whichflag == 2 && _offload_balance != 0.0) { if (_offload_balance == 1.0 && _offload_noghost == 0) _sync_at_pair = 1; @@ -279,12 +282,6 @@ void FixIntel::init() } #endif - if (neighbor->style != BIN) - error->all(FLERR, - "Currently, neighbor style BIN must be used with Intel package."); - if (neighbor->exclude_setting() != 0) - error->all(FLERR, - "Currently, cannot use neigh_modify exclude with Intel package."); int nstyles = 0; if (force->pair_match("hybrid", 1) != NULL) { PairHybrid *hybrid = (PairHybrid *) force->pair; @@ -315,6 +312,71 @@ void FixIntel::init() _double_buffers->zero_ev(); } + +/* ---------------------------------------------------------------------- */ + +void FixIntel::setup(int vflag) +{ + if (neighbor->style != BIN) + error->all(FLERR, + "Currently, neighbor style BIN must be used with Intel package."); + if (neighbor->exclude_setting() != 0) + error->all(FLERR, + "Currently, cannot use neigh_modify exclude with Intel package."); +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::pair_init_check() +{ + if (_offload_balance != 0.0 && comm->me == 0) { + #ifndef __INTEL_COMPILER_BUILD_DATE + error->warning(FLERR, "Unknown Intel Compiler Version\n"); + #else + if (__INTEL_COMPILER_BUILD_DATE != 20131008 && + __INTEL_COMPILER_BUILD_DATE < 20141023) + error->warning(FLERR, "Unsupported Intel Compiler."); + #endif + #if !defined(__INTEL_COMPILER) + error->warning(FLERR, "Unsupported Intel Compiler."); + #endif + } + + // Clear buffers used for pair style + char kmode[80]; + if (_precision_mode == PREC_MODE_SINGLE) { + strcpy(kmode, "single"); + get_single_buffers()->free_all_nbor_buffers(); + } else if (_precision_mode == PREC_MODE_MIXED) { + strcpy(kmode, "mixed"); + get_mixed_buffers()->free_all_nbor_buffers(); + } else { + strcpy(kmode, "double"); + get_double_buffers()->free_all_nbor_buffers(); + } + + #ifdef _LMP_INTEL_OFFLOAD + set_offload_affinity(); + #endif + + if (comm->me == 0) { + if (screen) { + fprintf(screen, + "----------------------------------------------------------\n"); + if (_offload_balance != 0.0) { + fprintf(screen,"Using Intel Coprocessor with %d threads per core, ", + _offload_tpc); + fprintf(screen,"%d threads per task\n",_offload_threads); + } else { + fprintf(screen,"Using Intel Package without Coprocessor.\n"); + } + fprintf(screen,"Precision: %s\n",kmode); + fprintf(screen, + "----------------------------------------------------------\n"); + } + } +} + /* ---------------------------------------------------------------------- */ void FixIntel::check_neighbor_intel() @@ -440,6 +502,14 @@ void FixIntel::output_timing_data() { fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]); fprintf(_tscreen, "\n"); #endif + double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] + + timers[TIME_OFFLOAD_WAIT]; + double ct = timers[TIME_OFFLOAD_NEIGHBOR] + + timers[TIME_OFFLOAD_PAIR]; + double tt = MAX(ht,ct); + if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0) + error->warning(FLERR, + "Leaving a core free can improve performance for offload"); } fprintf(_tscreen, "------------------------------------------------\n"); } @@ -549,4 +619,157 @@ void FixIntel::set_offload_affinity() _double_buffers->set_off_params(offload_threads, _cop, _separate_buffers); } +/* ---------------------------------------------------------------------- */ + +int FixIntel::set_host_affinity(const int nomp) +{ + #ifndef INTEL_OFFLOAD_NOAFFINITY + _separate_coi = 1; + int rank = comm->me; + int node_rank; + int ppn = get_ppn(node_rank); + int cop = node_rank / (ppn / _ncops); + + // Get a sorted list of logical cores + int proc_list[INTEL_MAX_HOST_CORE_COUNT]; + int ncores; + FILE *p; + char cmd[512]; + char readbuf[INTEL_MAX_HOST_CORE_COUNT*5]; + sprintf(cmd, "lscpu -p=cpu,core,socket | grep -v '#' |" + "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'"); + p = popen(cmd, "r"); + if (p == NULL) return -1; + ncores = 0; + while(fgets(readbuf, 512, p)) { + proc_list[ncores] = atoi(readbuf); + ncores++; + } + pclose(p); + + // Sanity checks for core list + if (ncores < 2) return -1; + int nzero = 0; + for (int i = 0; i < ncores; i++) { + if (proc_list[i] == 0) nzero++; + if (proc_list[i] < 0 || proc_list[i] >= ncores) return -1; + } + if (nzero > 1) return -1; + + // Determine the OpenMP/MPI configuration + char *estring; + int nthreads = nomp; + if (nthreads == 0) { + estring = getenv("OMP_NUM_THREADS"); + if (estring != NULL) { + nthreads = atoi(estring); + if (nthreads < 2) nthreads = 1; + } else + nthreads = 1; + } + + // Determine how many logical cores for COI and MPI tasks + int coi_cores = 0, mpi_cores; + int subscription = nthreads * ppn; + if (subscription > ncores) { + if (rank == 0) + error->warning(FLERR, + "More MPI tasks/OpenMP threads than available cores"); + return 0; + } + if (subscription == ncores) + _separate_coi = 0; + + if (subscription > ncores / 2) { + coi_cores = ncores - subscription; + if (coi_cores > INTEL_MAX_COI_CORES) coi_cores = INTEL_MAX_COI_CORES; + } + mpi_cores = (ncores - coi_cores) / ppn; + + // Get ids of all LWPs that COI spawned and affinitize + int lwp = 0, plwp = 0, nlwp = 0, mlwp = 0, fail = 0; + cpu_set_t cpuset; + pid_t pid = getpid(); + if (coi_cores) { + sprintf(cmd, "ps -Lp %d -o lwp | awk ' (NR > 2) {print}'", pid); + p = popen(cmd, "r"); + if (p == NULL) return -1; + + while(fgets(readbuf, 512, p)) { + lwp = atoi(readbuf); + int first = coi_cores + node_rank * mpi_cores; + CPU_ZERO(&cpuset); + for (int i = first; i < first + mpi_cores; i++) + CPU_SET(proc_list[i], &cpuset); + if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) { + fail = 1; + break; + } + plwp++; + } + pclose(p); + + // Do async offload to create COI threads + int sig1, sig2; + float *buf1; + int pragma_size = 1024; + buf1 = (float*) malloc(sizeof(float)*pragma_size); + + #pragma offload target (mic:0) \ + in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \ + signal(&sig1) + {} + #pragma offload_wait target(mic:0) wait(&sig1) + + #pragma offload target (mic:0) \ + out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \ + signal(&sig2) + {} + #pragma offload_wait target(mic:0) wait(&sig2) + free(buf1); + + p = popen(cmd, "r"); + if (p == NULL) return -1; + + while(fgets(readbuf, 512, p)) { + lwp = atoi(readbuf); + nlwp++; + if (nlwp <= plwp) continue; + + CPU_ZERO(&cpuset); + for(int i=0; i::set_ntypes(const int ntypes) if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD flt_t * cutneighsqo = _cutneighsq[0]; - if (cutneighsqo != 0) { + if (_off_threads > 0 && cutneighsqo != 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(cutneighsqo:alloc_if(0) free_if(1)) } @@ -396,7 +396,7 @@ void IntelBuffers::set_ntypes(const int ntypes) lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq"); #ifdef _LMP_INTEL_OFFLOAD flt_t * cutneighsqo = _cutneighsq[0]; - if (cutneighsqo != NULL) { + if (_off_threads > 0 && cutneighsqo != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0)) } diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index da68ecc934..9dfb3a20e0 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -61,6 +61,8 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH #define INTEL_LB_MEAN_WEIGHT 0.1 #define INTEL_BIGP 1e15 +#define INTEL_MAX_HOST_CORE_COUNT 512 +#define INTEL_MAX_COI_CORES 2 #define IP_PRE_get_stride(stride, n, datasize, torque) \ { \ diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index aae42a7145..da58be8b75 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -893,20 +893,17 @@ void PairGayBerneIntel::init_style() fix = static_cast(modify->fix[ifix]); #ifdef _LMP_INTEL_OFFLOAD - fix->set_offload_affinity(); if (force->newton_pair) fix->set_offload_noghost(1); _cop = fix->coprocessor_number(); #endif - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fix->get_mixed_buffers()->free_all_nbor_buffers(); + + fix->pair_init_check(); + if (fix->precision() == FixIntel::PREC_MODE_MIXED) pack_force_const(force_const_single, fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fix->get_double_buffers()->free_all_nbor_buffers(); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) pack_force_const(force_const_double, fix->get_double_buffers()); - } else { - fix->get_single_buffers()->free_all_nbor_buffers(); + else pack_force_const(force_const_single, fix->get_single_buffers()); - } } /* ---------------------------------------------------------------------- */ diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp index 8d23e8f589..4e45c078fa 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -489,19 +489,16 @@ void PairLJCharmmCoulLongIntel::init_style() fix = static_cast(modify->fix[ifix]); #ifdef _LMP_INTEL_OFFLOAD - fix->set_offload_affinity(); _cop = fix->coprocessor_number(); #endif - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fix->get_mixed_buffers()->free_all_nbor_buffers(); + + fix->pair_init_check(); + if (fix->precision() == FixIntel::PREC_MODE_MIXED) pack_force_const(force_const_single, fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fix->get_double_buffers()->free_all_nbor_buffers(); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) pack_force_const(force_const_double, fix->get_double_buffers()); - } else { - fix->get_single_buffers()->free_all_nbor_buffers(); + else pack_force_const(force_const_single, fix->get_single_buffers()); - } } template diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp index 582ad7eb85..b60f96d4e8 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -456,19 +456,16 @@ void PairLJCutCoulLongIntel::init_style() fix = static_cast(modify->fix[ifix]); #ifdef _LMP_INTEL_OFFLOAD - fix->set_offload_affinity(); _cop = fix->coprocessor_number(); #endif - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fix->get_mixed_buffers()->free_all_nbor_buffers(); + + fix->pair_init_check(); + if (fix->precision() == FixIntel::PREC_MODE_MIXED) pack_force_const(force_const_single, fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fix->get_double_buffers()->free_all_nbor_buffers(); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) pack_force_const(force_const_double, fix->get_double_buffers()); - } else { - fix->get_single_buffers()->free_all_nbor_buffers(); + else pack_force_const(force_const_single, fix->get_single_buffers()); - } } template diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp index 897abdde53..b777e60f91 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_intel.cpp @@ -336,16 +336,14 @@ void PairLJCutIntel::init_style() error->all(FLERR, "Offload for lj/cut/intel is not yet available. Set balance to 0."); #endif - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fix->get_mixed_buffers()->free_all_nbor_buffers(); + + fix->pair_init_check(); + if (fix->precision() == FixIntel::PREC_MODE_MIXED) pack_force_const(force_const_single, fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fix->get_double_buffers()->free_all_nbor_buffers(); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) pack_force_const(force_const_double, fix->get_double_buffers()); - } else { - fix->get_single_buffers()->free_all_nbor_buffers(); + else pack_force_const(force_const_single, fix->get_single_buffers()); - } } /* ---------------------------------------------------------------------- */