// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
                        Anupama Kurpad (Intel) - Host Affinitization
------------------------------------------------------------------------- */

#include "fix_intel.h"

#include "comm.h"
#include "domain.h"
#include "error.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_request.h"
#include "pair.h"
#include "pair_hybrid.h"
#include "update.h"

#include <cstring>

#ifdef _LMP_INTEL_OFFLOAD
#ifndef INTEL_OFFLOAD_NOAFFINITY
#include <unistd.h>
#endif
#endif

#include "suffix.h"

using namespace LAMMPS_NS;
using namespace FixConst;

#ifdef __INTEL_OFFLOAD
#ifndef _LMP_INTEL_OFFLOAD
#warning "Not building INTEL package with Xeon Phi offload support."
#endif
#endif

/* ---------------------------------------------------------------------- */

FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
{
  if (narg < 4) error->all(FLERR,"Illegal package intel command");

  int ncops = utils::inumeric(FLERR,arg[3],false,lmp);

  _nbor_pack_width = 1;
  _three_body_neighbor = 0;
  _pair_intel_count = 0;
  _hybrid_nonpair = 0;
  _print_pkg_info = 1;

  _nthreads = comm->nthreads;
  _torque_flag = 0;

  _precision_mode = PREC_MODE_MIXED;
  _offload_balance = -1.0;
  _overflow_flag[LMP_OVERFLOW] = 0;
  _off_overflow_flag[LMP_OVERFLOW] = 0;

  _offload_affinity_balanced = 0;
  _offload_threads = 0;
  _offload_tpc = 4;

  _force_array_s = nullptr;
  _force_array_m = nullptr;
  _force_array_d = nullptr;
  _ev_array_s = nullptr;
  _ev_array_d = nullptr;

#ifdef _LMP_INTEL_OFFLOAD
  if (ncops < 0) error->all(FLERR,"Illegal package intel command");
  _offload_affinity_set = 0;
  _off_force_array_s = 0;
  _off_force_array_m = 0;
  _off_force_array_d = 0;
  _off_ev_array_s = 0;
  _off_ev_array_d = 0;
  _balance_fixed = 0.0;
  _cop = 0;
#endif

  // optional keywords

  int nomp = 0, no_affinity = 0;
  _allow_separate_buffers = 1;
  _offload_ghost = -1;
  _lrt = 0;
  _p3m_table = 1;

  int iarg = 4;
  while (iarg < narg) {
    if (strcmp(arg[iarg],"omp") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      nomp = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
    } else if (strcmp(arg[iarg],"mode") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      if (strcmp(arg[iarg+1],"single") == 0)
        _precision_mode = PREC_MODE_SINGLE;
      else if (strcmp(arg[iarg+1],"mixed") == 0)
        _precision_mode = PREC_MODE_MIXED;
      else if (strcmp(arg[iarg+1],"double") == 0)
        _precision_mode = PREC_MODE_DOUBLE;
      else error->all(FLERR,"Illegal package intel command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"balance") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      _offload_balance = utils::numeric(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
    } else if (strcmp(arg[iarg], "ghost") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
      _offload_ghost = utils::logical(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;
(strcmp(arg[iarg], "tpc") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); _offload_tpc = atoi(arg[iarg+1]); iarg += 2; } else if (strcmp(arg[iarg],"tptask") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); _offload_threads = atoi(arg[iarg+1]); iarg += 2; } else if (strcmp(arg[iarg],"no_affinity") == 0) { no_affinity = 1; iarg++; } else if (strcmp(arg[iarg], "lrt") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); _lrt = utils::logical(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg], "pppm_table") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); _p3m_table = utils::logical(FLERR,arg[iarg+1],false,lmp); iarg += 2; // undocumented options } else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) { _offload_affinity_balanced = 1; iarg++; } else if (strcmp(arg[iarg],"buffers") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); _allow_separate_buffers = atoi(arg[iarg+1]); iarg += 2; } else error->all(FLERR,"Illegal package intel command"); } // if ncops is zero, just run on the cpu if (ncops < 1) { ncops = -1; _offload_balance = 0.0; } // if using LRT mode, create the integrate style if (_lrt) { char *cmd[1]; cmd[0] = (char *) "verlet/lrt/intel"; update->create_integrate(1,cmd,0); } // error check if (_offload_balance > 1.0 || _offload_threads < 0 || _offload_tpc <= 0 || _offload_tpc > 4 || nomp < 0) error->all(FLERR,"Illegal package intel command"); #ifdef _LMP_INTEL_OFFLOAD _ncops = ncops; if (_offload_balance != 0.0) { _real_space_comm = MPI_COMM_WORLD; if (no_affinity == 0) if (set_host_affinity(nomp) != 0) error->all(FLERR,"Could not set host affinity for offload tasks"); } int max_offload_threads = 0, offload_cores = 0; if (_offload_balance != 0.0) { #pragma offload target(mic:_cop) mandatory \ out(max_offload_threads,offload_cores) { offload_cores = omp_get_num_procs(); omp_set_num_threads(offload_cores); max_offload_threads = omp_get_max_threads(); #ifdef __AVX512F__ if ( (offload_cores / 4) % 2 == 1) { offload_cores += 4; max_offload_threads += 4; } #endif } _max_offload_threads = max_offload_threads; _offload_cores = offload_cores; if (_offload_threads == 0) _offload_threads = offload_cores; if (_offload_cores > 244 && _offload_tpc > 2) _offload_tpc = 2; } #endif // set OpenMP threads // nomp is user setting, default = 0 #if defined(_OPENMP) #if defined(__INTEL_COMPILER) kmp_set_blocktime(0); #endif if (nomp != 0) { omp_set_num_threads(nomp); _nthreads = comm->nthreads = nomp; } #endif // set offload params #ifdef _LMP_INTEL_OFFLOAD if (_offload_balance < 0.0) { _balance_neighbor = 0.9; _balance_pair = 0.9; } else { _balance_neighbor = _offload_balance; _balance_pair = _offload_balance; } _tscreen = screen; zero_timers(); _setup_time_cleared = false; _timers_allocated = false; #else _offload_balance = 0.0; #endif // set precision if (_precision_mode == PREC_MODE_SINGLE) _single_buffers = new IntelBuffers(lmp); else if (_precision_mode == PREC_MODE_MIXED) _mixed_buffers = new IntelBuffers(lmp); else _double_buffers = new IntelBuffers(lmp); } /* ---------------------------------------------------------------------- */ FixIntel::~FixIntel() { #ifdef _LMP_INTEL_OFFLOAD output_timing_data(); if (_timers_allocated) { double *time1 = off_watch_pair(); double *time2 = off_watch_neighbor(); int *overflow = get_off_overflow_flag(); if (_offload_balance != 0.0) { #pragma offload_transfer target(mic:_cop) \ 
/* ---------------------------------------------------------------------- */

FixIntel::~FixIntel()
{
#ifdef _LMP_INTEL_OFFLOAD
  output_timing_data();
  if (_timers_allocated) {
    double *time1 = off_watch_pair();
    double *time2 = off_watch_neighbor();
    int *overflow = get_off_overflow_flag();
    if (_offload_balance != 0.0) {
      #pragma offload_transfer target(mic:_cop) \
        nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
    }
  }
#endif

  if (_precision_mode == PREC_MODE_SINGLE)
    delete _single_buffers;
  else if (_precision_mode == PREC_MODE_MIXED)
    delete _mixed_buffers;
  else
    delete _double_buffers;
}

/* ---------------------------------------------------------------------- */

int FixIntel::setmask()
{
  int mask = 0;
  mask |= PRE_REVERSE;
  mask |= MIN_PRE_REVERSE;
  mask |= POST_FORCE;
  mask |= MIN_POST_FORCE;
  mask |= POST_RUN;
  return mask;
}

/* ---------------------------------------------------------------------- */

void FixIntel::init()
{
#ifdef _LMP_INTEL_OFFLOAD
  output_timing_data();
  _sync_mode = 0;
  if (offload_balance() != 0.0) {
    if (offload_noghost() || force->newton_pair == 0) _sync_mode = 2;
    else _sync_mode = 1;
    if (update->whichflag == 2) _sync_mode = 1;
  }
#endif

  _torque_flag = 0;
  if (force->pair_match("gayberne/intel$", 0)) _torque_flag = 1;

  const int nstyles = _pair_intel_count;
  if (force->pair_match("^hybrid", 0)) {
    _pair_hybrid_flag = 1;

    // Check if need to handle torque
    auto hybrid = dynamic_cast<PairHybrid *>(force->pair);
    if (hybrid) {
      for (int i = 0; i < hybrid->nstyles; i++)
        if (utils::strmatch(hybrid->keywords[i],"/intel$") &&
            utils::strmatch(hybrid->keywords[i],"gayberne"))
          _torque_flag = 1;
    }

    if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute)
      error->all(FLERR,"INTEL package requires fdotr virial with newton on.");
  } else _pair_hybrid_flag = 0;

  if (_torque_flag && nstyles > 1)
    error->all(FLERR,"gayberne/intel style cannot yet be used with other "
               "intel pair styles.");

  if (nstyles > 1 && _pair_hybrid_flag) _pair_hybrid_flag = 2;
  else if (force->newton_pair == 0) _pair_hybrid_flag = 0;

  _pair_hybrid_zero = 0;
  _zero_master = 0;
  if (_pair_hybrid_flag && _hybrid_nonpair) _pair_hybrid_zero = 1;

  _hybrid_nonpair = 0;
  _pair_intel_count = 0;

#ifdef _LMP_INTEL_OFFLOAD
  if (offload_balance() != 0.0) {
    _pair_hybrid_zero = 0;
    if (force->newton_pair == 0) _pair_hybrid_flag = 0;
    if (nstyles > 1)
      error->all(FLERR,
                 "Currently, cannot offload more than one intel style with hybrid.");
  }
#endif

  check_neighbor_intel();

  int off_mode = 0;
  if (_offload_balance != 0.0) off_mode = 1;
  if (_precision_mode == PREC_MODE_SINGLE) {
    _single_buffers->set_torque_flag(_torque_flag);
    _single_buffers->zero_ev();
    _single_buffers->grow_ncache(off_mode, comm->nthreads);
    _single_buffers->free_list_ptrs();
  } else if (_precision_mode == PREC_MODE_MIXED) {
    _mixed_buffers->set_torque_flag(_torque_flag);
    _mixed_buffers->zero_ev();
    _mixed_buffers->grow_ncache(off_mode, comm->nthreads);
    _mixed_buffers->free_list_ptrs();
  } else {
    _double_buffers->set_torque_flag(_torque_flag);
    _double_buffers->zero_ev();
    _double_buffers->grow_ncache(off_mode, comm->nthreads);
    _double_buffers->free_list_ptrs();
  }

  _need_reduce = 0;
}

/* ---------------------------------------------------------------------- */

void FixIntel::setup(int vflag)
{
  if (neighbor->style != Neighbor::BIN)
    error->all(FLERR,
               "Currently, neighbor style BIN must be used with INTEL package.");
  if (vflag > 3)
    error->all(FLERR,
               "Cannot currently get per-atom virials with INTEL package.");
#ifdef _LMP_INTEL_OFFLOAD
  if (neighbor->exclude_setting() != 0)
    error->all(FLERR,
               "Currently, cannot use neigh_modify exclude with INTEL package offload.");
  post_force(vflag);
#endif
}

/* ---------------------------------------------------------------------- */

void FixIntel::setup_pre_reverse(int eflag, int vflag)
{
  pre_reverse(eflag,vflag);
}
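/* ----------------------------------------------------------------------
   Note on the synchronization modes chosen in init() above:
     _sync_mode == 1: coprocessor results are collected from
       _sync_main_arrays(), i.e. on the pre_reverse() path
     _sync_mode == 2: collection is deferred to post_force(), used when
       ghost atoms are not offloaded or newton pair is off
   These match the two sync_coprocessor() call sites further below.
------------------------------------------------------------------------- */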
/* ---------------------------------------------------------------------- */

bool FixIntel::pair_hybrid_check()
{
  auto ph = dynamic_cast<PairHybrid *>(force->pair);
  bool has_intel = false;
  int nstyles = ph->nstyles;
  for (int i = 0; i < nstyles; ++i)
    if (ph->styles[i]->suffix_flag & Suffix::INTEL)
      has_intel = true;
  return has_intel;
}

/* ---------------------------------------------------------------------- */

void FixIntel::pair_init_check(const bool cdmessage)
{
#ifdef INTEL_VMASK
  if (atom->sortfreq) atom->sortfreq = 1;
#endif

  _nbor_pack_width = 1;

#ifdef _LMP_INTEL_OFFLOAD
  if (_offload_balance != 0.0) atom->sortfreq = 1;
  _offload_noghost = 0;
  if (force->newton_pair && _offload_ghost == 0)
    _offload_noghost = 1;

  set_offload_affinity();

  if (!_timers_allocated) {
    double *time1 = off_watch_pair();
    double *time2 = off_watch_neighbor();
    int *overflow = get_off_overflow_flag();
    if (_offload_balance != 0.0) {
      #pragma offload_transfer target(mic:_cop) \
        nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
        in(overflow:length(5) alloc_if(1) free_if(0))
    }
    _timers_allocated = true;
  }
#endif

  _nthreads = comm->nthreads;

  if (_offload_balance != 0.0 && comm->me == 0) {
#ifndef __INTEL_COMPILER_BUILD_DATE
    error->warning(FLERR,"Unknown Intel Compiler Version\n");
#else
    if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
        __INTEL_COMPILER_BUILD_DATE < 20141023)
      error->warning(FLERR,"Unsupported Intel Compiler.");
#endif
#if !defined(__INTEL_COMPILER)
    error->warning(FLERR,"Unsupported Intel Compiler.");
#endif
  }

#ifndef LMP_INTEL_NBOR_COMPAT
  if (force->pair->manybody_flag && (atom->molecular != Atom::ATOMIC)) {
    int flag = 0;
    if (atom->nbonds > 0 && force->special_lj[1] == 0.0 &&
        force->special_coul[1] == 0.0) flag = 1;
    if (atom->nangles > 0 && force->special_lj[2] == 0.0 &&
        force->special_coul[2] == 0.0) flag = 1;
    if (atom->ndihedrals > 0 && force->special_lj[3] == 0.0 &&
        force->special_coul[3] == 0.0) flag = 1;
    if (flag)
      error->all(FLERR,"Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond"
                 " exclusions with Intel");
  }
#endif

  int need_tag = 0;
  if (atom->molecular != Atom::ATOMIC || three_body_neighbor()) need_tag = 1;
  if (domain->triclinic && force->newton_pair) need_tag = 1;

  // Clear buffers used for pair style
  char kmode[80];
  if (_precision_mode == PREC_MODE_SINGLE) {
    strcpy(kmode, "single");
    get_single_buffers()->need_tag(need_tag);
  } else if (_precision_mode == PREC_MODE_MIXED) {
    strcpy(kmode, "mixed");
    get_mixed_buffers()->need_tag(need_tag);
  } else {
    strcpy(kmode, "double");
    get_double_buffers()->need_tag(need_tag);
  }

  _pair_intel_count++;

#ifdef _LMP_INTEL_OFFLOAD
  set_offload_affinity();
#endif

  if (_print_pkg_info && comm->me == 0) {
    utils::logmesg(lmp,
      "----------------------------------------------------------\n");
    if (_offload_balance != 0.0) {
      utils::logmesg(lmp,"Using Intel Coprocessor with {} threads per core, "
                     "{} threads per task\n",_offload_tpc, _offload_threads);
    } else {
      utils::logmesg(lmp,"Using INTEL Package without Coprocessor.\n");
    }
    utils::logmesg(lmp,"Compiler: {}\n",platform::compiler_info());
#ifdef LMP_SIMD_COMPILER
    utils::logmesg(lmp,"SIMD compiler directives: Enabled\n");
#else
    utils::logmesg(lmp,"SIMD compiler directives: Disabled\n");
#endif
    utils::logmesg(lmp,"Precision: {}\n",kmode);
    if (cdmessage) {
#ifdef LMP_USE_AVXCD
      utils::logmesg(lmp,"AVX512 CD Optimizations: Enabled\n");
#else
      utils::logmesg(lmp,"AVX512 CD Optimizations: Disabled\n");
#endif
    }
    utils::logmesg(lmp,
      "----------------------------------------------------------\n");
  }
  _print_pkg_info = 0;
}
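/* ----------------------------------------------------------------------
   For reference, the package info block printed by pair_init_check()
   above looks like this for a CPU-only, mixed-precision run (the
   compiler line varies by build):

     ----------------------------------------------------------
     Using INTEL Package without Coprocessor.
     Compiler: ...
     SIMD compiler directives: Enabled
     Precision: mixed
     ----------------------------------------------------------
------------------------------------------------------------------------- */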
/* ---------------------------------------------------------------------- */

void FixIntel::bond_init_check()
{
  if ((_offload_balance != 0.0) && (atom->molecular != Atom::ATOMIC) &&
      (force->newton_pair != force->newton_bond))
    error->all(FLERR,
               "INTEL package requires same setting for newton bond and non-bond.");

  int intel_pair = 0;
  if (force->pair_match("/intel$", 0) != nullptr)
    intel_pair = 1;
  else if (force->pair_match("^hybrid", 0) != nullptr) {
    _hybrid_nonpair = 1;
    if (pair_hybrid_check()) intel_pair = 1;
  }

  if (intel_pair == 0)
    error->all(FLERR,
               "Intel styles for bond/angle/dihedral/improper require intel pair style.");
}

/* ---------------------------------------------------------------------- */

void FixIntel::kspace_init_check()
{
  int intel_pair = 0;
  if (force->pair_match("/intel$", 0) != nullptr)
    intel_pair = 1;
  else if (force->pair_match("^hybrid", 0) != nullptr) {
    _hybrid_nonpair = 1;
    if (pair_hybrid_check()) intel_pair = 1;
  }

  if (intel_pair == 0)
    error->all(FLERR,"Intel styles for kspace require intel pair style.");
}

/* ---------------------------------------------------------------------- */

void FixIntel::check_neighbor_intel()
{
#ifdef _LMP_INTEL_OFFLOAD
  _full_host_list = 0;
  const int nrequest = neighbor->nrequest;

  for (int i = 0; i < nrequest; ++i) {
    if (_offload_balance != 0.0 && neighbor->requests[i]->intel == 0) {
      _full_host_list = 1;
      _offload_noghost = 0;
    }
    if (neighbor->requests[i]->skip && _offload_balance != 0.0)
      error->all(FLERR,"Cannot yet use hybrid styles with Intel offload.");
  }
#endif
}

/* ---------------------------------------------------------------------- */

void FixIntel::_sync_main_arrays(const int prereverse)
{
  if (!prereverse) _zero_master = 1;

  int done_this_step = prereverse;
  if (_pair_hybrid_zero == 0) done_this_step = 1;

  if (_force_array_m != nullptr) {
    if (_need_reduce) {
      reduce_results(&_force_array_m[0].x);
      _need_reduce = 0;
    }
    add_results(_force_array_m, _ev_array_d, _results_eatom,
                _results_vatom,0);
    if (done_this_step) _force_array_m = nullptr;
    else _ev_array_d = nullptr;
  } else if (_force_array_d != nullptr) {
    if (_need_reduce) {
      reduce_results(&_force_array_d[0].x);
      _need_reduce = 0;
    }
    add_results(_force_array_d, _ev_array_d, _results_eatom,
                _results_vatom,0);
    if (done_this_step) _force_array_d = nullptr;
    else _ev_array_d = nullptr;
  } else if (_force_array_s != nullptr) {
    if (_need_reduce) {
      reduce_results(&_force_array_s[0].x);
      _need_reduce = 0;
    }
    add_results(_force_array_s, _ev_array_s, _results_eatom,
                _results_vatom,0);
    if (done_this_step) _force_array_s = nullptr;
    else _ev_array_s = nullptr;
  }

#ifdef _LMP_INTEL_OFFLOAD
  if (_sync_mode == 1) sync_coprocessor();
#endif
}

/* ---------------------------------------------------------------------- */

void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
{
  _sync_main_arrays(1);
}

/* ---------------------------------------------------------------------- */

void FixIntel::post_force(int /*vflag*/)
{
  // Redundant call to sync Intel data structs with native for methods that
  // call force compute but do not call prereverse
  _sync_main_arrays(1);

#ifdef _LMP_INTEL_OFFLOAD
  if (_sync_mode == 2) sync_coprocessor();
#endif
}
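/* ----------------------------------------------------------------------
   reduce_results() below folds the per-thread force accumulation buffers
   into the first (thread 0) buffer. Each thread's block stores 4 scalars
   per atom (x, y, z, w) and blocks are spaced f_stride4 apart, so thread
   t's copy of scalar n lives at f_scalar[t * f_stride4 + n]. Up to
   INTEL_HTHREADS threads are reduced with unrolled serial loops; larger
   thread counts split the index range across an OpenMP parallel region.
------------------------------------------------------------------------- */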
/* ---------------------------------------------------------------------- */

template <class acc_t>
void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
{
  int o_range, f_stride;
  if (force->newton_pair)
    o_range = atom->nlocal + atom->nghost;
  else
    o_range = atom->nlocal;
  IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), _torque_flag);

  o_range *= 4;
  const int f_stride4 = f_stride * 4;

  if (_nthreads <= INTEL_HTHREADS) {
    acc_t *f_scalar2 = f_scalar + f_stride4;
    if (_nthreads == 4) {
      acc_t *f_scalar3 = f_scalar2 + f_stride4;
      acc_t *f_scalar4 = f_scalar3 + f_stride4;
#if defined(USE_OMP_SIMD)
      #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64)
#elif defined(LMP_SIMD_COMPILER)
      #pragma vector aligned
      #pragma simd
#endif
      for (int n = 0; n < o_range; n++)
        f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
    } else if (_nthreads == 2) {
#if defined(USE_OMP_SIMD)
      #pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
      #pragma vector aligned
      #pragma simd
#endif
      for (int n = 0; n < o_range; n++)
        f_scalar[n] += f_scalar2[n];
    } else {
      acc_t *f_scalar3 = f_scalar2 + f_stride4;
#if defined(USE_OMP_SIMD)
      #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64)
#elif defined(LMP_SIMD_COMPILER)
      #pragma vector aligned
      #pragma simd
#endif
      for (int n = 0; n < o_range; n++)
        f_scalar[n] += f_scalar2[n] + f_scalar3[n];
    }
  } else {
#if defined(_OPENMP)
    #pragma omp parallel
#endif
    {
      int iifrom, iito, tid;
      IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
                                sizeof(acc_t));

      acc_t *f_scalar2 = f_scalar + f_stride4;
      for (int t = 1; t < _nthreads; t++) {
#if defined(USE_OMP_SIMD)
        #pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma simd
#endif
        for (int n = iifrom; n < iito; n++)
          f_scalar[n] += f_scalar2[n];
        f_scalar2 += f_stride4;
      }
    }
  }
}

/* ---------------------------------------------------------------------- */

void FixIntel::sync_coprocessor()
{
#ifdef _LMP_INTEL_OFFLOAD
  if (_offload_balance != 0.0) {
    if (_off_force_array_m != 0) {
      add_off_results(_off_force_array_m, _off_ev_array_d);
      _off_force_array_m = 0;
    } else if (_off_force_array_d != 0) {
      add_off_results(_off_force_array_d, _off_ev_array_d);
      _off_force_array_d = 0;
    } else if (_off_force_array_s != 0) {
      add_off_results(_off_force_array_s, _off_ev_array_s);
      _off_force_array_s = 0;
    }
  }
#endif
}
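/* ----------------------------------------------------------------------
   add_results() below copies packed forces back into the LAMMPS arrays.
   When _separate_buffers is set (split host/coprocessor runs), the
   packed buffer stores local-atom results first and ghost-atom results
   second, so two add_oresults() calls are issued at different output
   offsets; the ghost pass gets a null ev_global so the global
   energy/virial terms are accumulated only once. With _torque_flag each
   atom occupies two buffer entries, hence the "offset *= 2" below.
------------------------------------------------------------------------- */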
/* ---------------------------------------------------------------------- */

template <class ft, class acc_t>
void FixIntel::add_results(const ft * _noalias const f_in,
                           const acc_t * _noalias const ev_global,
                           const int eatom, const int vatom,
                           const int offload)
{
  start_watch(TIME_PACK);
  int f_length;
#ifdef _LMP_INTEL_OFFLOAD
  if (_separate_buffers) {
    if (offload) {
      if (force->newton_pair) {
        add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
        const acc_t * _noalias const enull = 0;
        int offset = _offload_nlocal;
        if (_torque_flag) offset *= 2;
        add_oresults(f_in + offset, enull, eatom, vatom,
                     _offload_min_ghost, _offload_nghost);
      } else
        add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
    } else {
      if (force->newton_pair) {
        add_oresults(f_in, ev_global, eatom, vatom,
                     _host_min_local, _host_used_local);
        const acc_t * _noalias const enull = 0;
        int offset = _host_used_local;
        if (_torque_flag) offset *= 2;
        add_oresults(f_in + offset, enull, eatom, vatom,
                     _host_min_ghost, _host_used_ghost);
      } else {
        int start = host_start_pair();
        add_oresults(f_in, ev_global, eatom, vatom, start,
                     atom->nlocal-start);
      }
    }
    stop_watch(TIME_PACK);
    return;
  }
  int start;
  if (offload) {
    start = 0;
    if (force->newton_pair) {
      if (_offload_noghost == 0)
        f_length = atom->nlocal + atom->nghost;
      else
        f_length = atom->nlocal;
    } else
      f_length = offload_end_pair();
  } else {
    if (force->newton_pair) {
      start = 0;
      f_length = atom->nlocal + atom->nghost;
    } else {
      start = host_start_pair();
      f_length = atom->nlocal - start;
    }
  }
  add_oresults(f_in, ev_global, eatom, vatom, start, f_length);
#else
  if (force->newton_pair)
    f_length = atom->nlocal + atom->nghost;
  else
    f_length = atom->nlocal;
  add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
#endif
  stop_watch(TIME_PACK);
}

/* ---------------------------------------------------------------------- */

template <class ft, class acc_t>
void FixIntel::add_oresults(const ft * _noalias const f_in,
                            const acc_t * _noalias const ev_global,
                            const int eatom, const int /*vatom*/,
                            const int out_offset, const int nall)
{
  lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
  if (_torque_flag) {
    if (f_in[1].w) {
      if (f_in[1].w == 1)
        error->all(FLERR,"Bad matrix inversion in mldivide3");
      else
        error->all(FLERR,
                   "Sphere particles not yet supported for gayberne/intel");
    }
  }

  int packthreads;
  if (_nthreads > INTEL_HTHREADS) packthreads = _nthreads;
  else packthreads = 1;

#if defined(_OPENMP)
  #pragma omp parallel if (packthreads > 1)
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif
    int ifrom, ito;
    IP_PRE_omp_range_align(ifrom, ito, tid, nall, packthreads,
                           sizeof(acc_t));
    if (_torque_flag) {
      int ii = ifrom * 2;
      lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
        out_offset;
      if (eatom) {
        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
#if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
#endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[ii].x;
          f[i].y += f_in[ii].y;
          f[i].z += f_in[ii].z;
          lmp_eatom[i] += f_in[ii].w;
          tor[i].x += f_in[ii+1].x;
          tor[i].y += f_in[ii+1].y;
          tor[i].z += f_in[ii+1].z;
          ii += 2;
        }
      } else {
#if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
#endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[ii].x;
          f[i].y += f_in[ii].y;
          f[i].z += f_in[ii].z;
          tor[i].x += f_in[ii+1].x;
          tor[i].y += f_in[ii+1].y;
          tor[i].z += f_in[ii+1].z;
          ii += 2;
        }
      }
    } else {
      if (eatom) {
        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
#if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
#endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[i].x;
          f[i].y += f_in[i].y;
          f[i].z += f_in[i].z;
          lmp_eatom[i] += f_in[i].w;
        }
      } else {
#if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
#endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[i].x;
          f[i].y += f_in[i].y;
          f[i].z += f_in[i].z;
        }
      }
    }
  }

  if (ev_global != nullptr) {
    force->pair->eng_vdwl += ev_global[0];
    force->pair->eng_coul += ev_global[1];
    force->pair->virial[0] += ev_global[2];
    force->pair->virial[1] += ev_global[3];
    force->pair->virial[2] += ev_global[4];
    force->pair->virial[3] += ev_global[5];
    force->pair->virial[4] += ev_global[6];
    force->pair->virial[5] += ev_global[7];
  }
}

/* ---------------------------------------------------------------------- */

double FixIntel::memory_usage()
{
  double bytes;
  if (_precision_mode == PREC_MODE_SINGLE)
    bytes = _single_buffers->memory_usage(_nthreads);
  else if (_precision_mode == PREC_MODE_MIXED)
    bytes = _mixed_buffers->memory_usage(_nthreads);
  else
    bytes = _double_buffers->memory_usage(_nthreads);

  return bytes;
}

/* ---------------------------------------------------------------------- */

#ifdef _LMP_INTEL_OFFLOAD
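/* ----------------------------------------------------------------------
   Notes on the dynamic load balancing in add_off_results() below (active
   when the balance keyword is negative). With ctps = host pair seconds
   per unit of host fraction and otps = coprocessor pair seconds per unit
   of offload fraction, the offload fraction that would equalize the two
   sides is

     new_balance = (ctps + other_time - fixed_cost) / (otps + ctps)

   which is blended into _balance_neighbor as an exponential moving
   average weighted by INTEL_LB_MEAN_WEIGHT. Illustrative numbers only:
   host pair 0.9 s at offload fraction 0.9 gives ctps = 0.9/(1-0.9) = 9.0;
   mic pair 1.0 s gives otps = 1.0/0.9 ~ 1.11; with other_time = 0.2 s and
   no fixed cost the new target is 9.2 / 10.11 ~ 0.91.
------------------------------------------------------------------------- */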
/* ---------------------------------------------------------------------- */

template <class ft, class acc_t>
void FixIntel::add_off_results(const ft * _noalias const f_in,
                               const acc_t * _noalias const ev_global)
{
  if (_offload_balance < 0.0)
    _balance_other_time = platform::walltime() - _balance_other_time;

  start_watch(TIME_OFFLOAD_WAIT);
#ifdef _LMP_INTEL_OFFLOAD
  if (neighbor->ago == 0) {
    #pragma offload_wait target(mic:_cop) wait(atom->tag,f_in)
  } else {
    #pragma offload_wait target(mic:_cop) wait(f_in)
  }
#endif
  double wait_time = stop_watch(TIME_OFFLOAD_WAIT);

  int nlocal = atom->nlocal;
  if (neighbor->ago == 0) {
    if (_off_overflow_flag[LMP_OVERFLOW])
      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
    _offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1;
    _offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN];
    _offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 -
      _offload_min_ghost;
    if (_offload_nghost < 0) _offload_nghost = 0;
    _offload_nall = _offload_nlocal + _offload_nghost;
  }

  if (_torque_flag)
    if (f_in[1].w < 0.0)
      error->all(FLERR,"Bad matrix inversion in mldivide3");
  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);

  // Load balance?
  if (_offload_balance < 0.0) {
    if (neighbor->ago == 0)
      _balance_pair = _balance_neighbor;
    double mic_time;
    mic_time = *_stopwatch_offload_pair;
    if (_balance_pair_time + _balance_other_time < mic_time) {
      double ft = _balance_pair_time + _balance_other_time + wait_time -
        mic_time;
      _balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed +
        INTEL_LB_MEAN_WEIGHT * ft;
    }

    double ctps = _balance_pair_time / (1.0-_balance_pair);
    double otps = mic_time / _balance_pair;
    double new_balance = (ctps + _balance_other_time - _balance_fixed) /
      (otps + ctps);
    _balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor +
      INTEL_LB_MEAN_WEIGHT * new_balance;
  }

#ifdef TIME_BALANCE
  start_watch(TIME_IMBALANCE);
  MPI_Barrier(_real_space_comm);
  stop_watch(TIME_IMBALANCE);
#endif
  acc_timers();
}
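/* ----------------------------------------------------------------------
   output_timing_data() below averages each timer over the ranks of
   _real_space_comm and prints a summary table; with offload enabled the
   screen output has the following shape (numbers illustrative only):

     ------------------------------------------------
                    Offload Timing Data
     ------------------------------------------------
       Data Pack/Cast Seconds    0.1234
       Host Neighbor Seconds     0.5678
       ...
     ------------------------------------------------
------------------------------------------------------------------------- */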
/* ---------------------------------------------------------------------- */

void FixIntel::output_timing_data()
{
  if (_im_real_space_task == 0 || _offload_affinity_set == 0) return;

  double timer_total = 0.0;
  int size, rank;
  double timers[NUM_ITIMERS];
  MPI_Comm_size(_real_space_comm, &size);
  MPI_Comm_rank(_real_space_comm, &rank);
  MPI_Allreduce(&_timers, &timers, NUM_ITIMERS, MPI_DOUBLE, MPI_SUM,
                _real_space_comm);
  for (int i=0; i < NUM_ITIMERS; i++) {
    timers[i] /= size;
    timer_total += timers[i];
  }
#ifdef TIME_BALANCE
  double timers_min[NUM_ITIMERS], timers_max[NUM_ITIMERS];
  MPI_Allreduce(&_timers, &timers_max, NUM_ITIMERS, MPI_DOUBLE, MPI_MAX,
                _real_space_comm);
  MPI_Allreduce(&_timers, &timers_min, NUM_ITIMERS, MPI_DOUBLE, MPI_MIN,
                _real_space_comm);
#endif

  if (timer_total > 0.0) {
    double balance_out[2], balance_in[2];
    balance_out[0] = _balance_pair;
    balance_out[1] = _balance_neighbor;
    MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM, 0,
               _real_space_comm);
    balance_in[0] /= size;
    balance_in[1] /= size;

    if (rank == 0 && _tscreen) {
      fprintf(_tscreen, "\n------------------------------------------------\n");
      fprintf(_tscreen, "               Offload Timing Data\n");
      fprintf(_tscreen, "------------------------------------------------\n");
      fprintf(_tscreen, "  Data Pack/Cast Seconds    %f\n",
              timers[TIME_PACK]);
      if (_offload_balance != 0.0) {
        fprintf(_tscreen, "  Host Neighbor Seconds     %f\n",
                timers[TIME_HOST_NEIGHBOR]);
        fprintf(_tscreen, "  Host Pair Seconds         %f\n",
                timers[TIME_HOST_PAIR]);
        fprintf(_tscreen, "  Offload Neighbor Seconds  %f\n",
                timers[TIME_OFFLOAD_NEIGHBOR]);
        fprintf(_tscreen, "  Offload Pair Seconds      %f\n",
                timers[TIME_OFFLOAD_PAIR]);
        fprintf(_tscreen, "  Offload Wait Seconds      %f\n",
                timers[TIME_OFFLOAD_WAIT]);
        fprintf(_tscreen, "  Offload Latency Seconds   %f\n",
                timers[TIME_OFFLOAD_LATENCY]);
        fprintf(_tscreen, "  Offload Neighbor Balance  %f\n",
                balance_in[1]);
        fprintf(_tscreen, "  Offload Pair Balance      %f\n",
                balance_in[0]);
        fprintf(_tscreen, "  Offload Ghost Atoms       ");
        if (_offload_noghost) fprintf(_tscreen,"No\n");
        else fprintf(_tscreen,"Yes\n");
#ifdef TIME_BALANCE
        fprintf(_tscreen, "  Offload Imbalance Seconds %f\n",
                timers[TIME_IMBALANCE]);
        fprintf(_tscreen, "  Offload Min/Max Seconds   ");
        for (int i = 0; i < NUM_ITIMERS; i++)
          fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
        fprintf(_tscreen, "\n");
#endif
        double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
          timers[TIME_OFFLOAD_WAIT];
        double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
          timers[TIME_OFFLOAD_PAIR];
        double tt = MAX(ht,ct);
        if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
          error->warning(FLERR,
                         "Leaving a core free can improve performance for offload");
      }
      fprintf(_tscreen, "------------------------------------------------\n");
    }
    zero_timers();
    _setup_time_cleared = false;
  }
}

/* ---------------------------------------------------------------------- */

int FixIntel::get_ppn(int &node_rank)
{
  int nprocs;
  int rank;
  MPI_Comm_size(_real_space_comm, &nprocs);
  MPI_Comm_rank(_real_space_comm, &rank);

  int name_length;
  char node_name[MPI_MAX_PROCESSOR_NAME];
  MPI_Get_processor_name(node_name,&name_length);
  node_name[name_length] = '\0';
  char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs];
  MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
                MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
  int ppn = 0;
  node_rank = 0;
  for (int i = 0; i < nprocs; i++) {
    if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
      ppn++;
      if (i < rank) node_rank++;
    }
  }
  delete[] node_names;   // release the gathered name list (avoids a leak)

  return ppn;
}
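/* ----------------------------------------------------------------------
   Worked example for get_ppn() above (hypothetical node layout): with
   8 ranks split across nodes "n0" (ranks 0-3) and "n1" (ranks 4-7),
   every rank counts 4 matching processor names, so ppn = 4. Rank 6
   matches ranks 4-7 on "n1" and sees two matching ranks below it, so
   its node_rank is set to 2.
------------------------------------------------------------------------- */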
/* ---------------------------------------------------------------------- */

void FixIntel::set_offload_affinity()
{
  _separate_buffers = 0;
  if (_allow_separate_buffers)
    if (_offload_balance != 0.0 && _offload_balance < 1.0)
      _separate_buffers = 1;

  _im_real_space_task = 1;
  if (strncmp(update->integrate_style,"verlet/split",12) == 0) {
    _real_space_comm = world;
    if (universe->iworld != 0) {
      _im_real_space_task = 0;
      return;
    }
  } else _real_space_comm = universe->uworld;

  if (_offload_balance == 0.0) _cop = -1;
  if (_offload_balance == 0.0 || _offload_affinity_set == 1)
    return;

  _offload_affinity_set = 1;
  int node_rank;
  int ppn = get_ppn(node_rank);

  if (ppn % _ncops != 0)
    error->all(FLERR,"MPI tasks per node must be multiple of offload_cards");
  ppn = ppn / _ncops;
  _cop = node_rank / ppn;
  node_rank = node_rank % ppn;

  int max_threads_per_task = _offload_cores / 4 * _offload_tpc / ppn;
  if (_offload_threads > max_threads_per_task)
    _offload_threads = max_threads_per_task;
  if (_offload_threads > _max_offload_threads)
    _offload_threads = _max_offload_threads;

  int offload_threads = _offload_threads;
  int offload_tpc = _offload_tpc;
  int offload_affinity_balanced = _offload_affinity_balanced;
  int offload_cores = _offload_cores;
  #pragma offload target(mic:_cop) mandatory \
    in(node_rank,offload_threads,offload_tpc,offload_affinity_balanced, \
       offload_cores)
  {
    omp_set_num_threads(offload_threads);
    #pragma omp parallel
    {
      int tnum = omp_get_thread_num();
      kmp_affinity_mask_t mask;
      kmp_create_affinity_mask(&mask);
      int proc = offload_threads * node_rank + tnum;
#ifdef __AVX512F__
      proc = (proc / offload_tpc) + (proc % offload_tpc) *
        ((offload_cores) / 4);
      proc += 68;
#else
      if (offload_affinity_balanced)
        proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
      else
        proc += (proc / 4) * (4 - offload_tpc) + 1;
#endif
      kmp_set_affinity_mask_proc(proc, &mask);
      if (kmp_set_affinity(&mask) != 0)
        printf("Could not set affinity on rank %d thread %d to %d\n",
               node_rank, tnum, proc);
    }
  }
  if (_precision_mode == PREC_MODE_SINGLE)
    _single_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
  else if (_precision_mode == PREC_MODE_MIXED)
    _mixed_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
  else
    _double_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
}

/* ---------------------------------------------------------------------- */

int FixIntel::set_host_affinity(const int nomp)
{
#ifndef INTEL_OFFLOAD_NOAFFINITY
  _separate_coi = 1;

  int rank = comm->me;
  int node_rank;
  int ppn = get_ppn(node_rank);
  int cop = node_rank / (ppn / _ncops);

  // Get a sorted list of logical cores
  int proc_list[INTEL_MAX_HOST_CORE_COUNT];
  int ncores;
  FILE *p;
  char cmd[512];
  char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
  sprintf(cmd, "lscpu -p | grep -v '#' |"
          "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
  p = popen(cmd, "r");
  if (p == nullptr) return -1;
  ncores = 0;
  while (fgets(readbuf, 512, p)) {
    proc_list[ncores] = atoi(readbuf);
    ncores++;
  }
  pclose(p);

  // Sanity checks for core list
  if (ncores < 2) return -1;
  int nzero = 0;
  for (int i = 0; i < ncores; i++) {
    if (proc_list[i] == 0) nzero++;
    if (proc_list[i] < 0 || proc_list[i] >= ncores) return -1;
  }
  if (nzero > 1) return -1;

  // Determine the OpenMP/MPI configuration
  char *estring;
  int nthreads = nomp;
  if (nthreads == 0) {
    estring = getenv("OMP_NUM_THREADS");
    if (estring != nullptr) {
      nthreads = atoi(estring);
      if (nthreads < 2) nthreads = 1;
    } else
      nthreads = 1;
  }

  // Determine how many logical cores for COI and MPI tasks
  int coi_cores = 0, mpi_cores;
  int subscription = nthreads * ppn;
  if (subscription > ncores) {
    if (rank == 0)
      error->warning(FLERR,
                     "More MPI tasks/OpenMP threads than available cores");
    return 0;
  }
  if (subscription == ncores)
    _separate_coi = 0;

  if (subscription > ncores / 2) {
    coi_cores = ncores - subscription;
    if (coi_cores > INTEL_MAX_COI_CORES) coi_cores = INTEL_MAX_COI_CORES;
  }
  mpi_cores = (ncores - coi_cores) / ppn;

  // Get ids of all LWPs that COI spawned and affinitize
  int lwp = 0, plwp = 0, nlwp = 0, mlwp = 0, fail = 0;
  cpu_set_t cpuset;
  pid_t pid = getpid();
  if (coi_cores) {
    sprintf(cmd, "ps -Lp %d -o lwp | awk ' (NR > 2) {print}'", pid);
    p = popen(cmd, "r");
    if (p == nullptr) return -1;

    while (fgets(readbuf, 512, p)) {
      lwp = atoi(readbuf);
      int first = coi_cores + node_rank * mpi_cores;
      CPU_ZERO(&cpuset);
      for (int i = first; i < first + mpi_cores; i++)
        CPU_SET(proc_list[i], &cpuset);
      if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
        fail = 1;
        break;
      }
      plwp++;
    }
    pclose(p);

    // Do async offload to create COI threads
    int sig1, sig2;
    float *buf1;
    int pragma_size = 1024;
    buf1 = (float*) malloc(sizeof(float)*pragma_size);

    #pragma offload target (mic:0) mandatory \
      in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \
      signal(&sig1)
    { buf1[0] = 0.0; }
    #pragma offload_wait target(mic:0) wait(&sig1)

    #pragma offload target (mic:0) mandatory \
      out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \
      signal(&sig2)
    { buf1[0] = 1.0; }
    #pragma offload_wait target(mic:0) wait(&sig2)
    free(buf1);

    p = popen(cmd, "r");
    if (p == nullptr) return -1;
    while (fgets(readbuf, 512, p)) {
      lwp = atoi(readbuf);
      nlwp++;
      if (nlwp <= plwp) continue;
      CPU_ZERO(&cpuset);
      for (int i=0; i