// clang-format off /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing author: Rodrigo Canales (RWTH Aachen University) ------------------------------------------------------------------------- */ #include "pair_buck_intel.h" #include "atom.h" #include "comm.h" #include "error.h" #include "force.h" #include "math_const.h" #include "memory.h" #include "modify.h" #include "neigh_list.h" #include "neigh_request.h" #include "neighbor.h" #include "suffix.h" #include #include using namespace LAMMPS_NS; using namespace MathConst; #define C_FORCE_T typename ForceConst::c_force_t #define C_ENERGY_T typename ForceConst::c_energy_t PairBuckIntel::PairBuckIntel(LAMMPS *lmp) : PairBuck(lmp) { suffix_flag |= Suffix::INTEL; } PairBuckIntel::~PairBuckIntel() { } void PairBuckIntel::compute(int eflag, int vflag) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) compute(eflag, vflag, fix->get_mixed_buffers(), force_const_single); else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) compute(eflag, vflag, fix->get_double_buffers(), force_const_double); else compute(eflag, vflag, fix->get_single_buffers(), force_const_single); fix->balance_stamp(); vflag_fdotr = 0; } template void PairBuckIntel::compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc) { ev_init(eflag,vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); if (vflag && !vflag_fdotr && force->newton_pair) error->all(FLERR,"INTEL package does not support pair_modify nofdotr " "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; const int host_start = fix->host_start_pair(); const int offload_end = fix->offload_end_pair(); const int ago = neighbor->ago; if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); int packthreads; if (nthreads > INTEL_HTHREADS) packthreads = nthreads; else packthreads = 1; #if defined(_OPENMP) #pragma omp parallel if (packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } int ovflag = 0; if (vflag_fdotr) ovflag = 2; else if (vflag) ovflag = 1; if (eflag) { if (force->newton_pair) { eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* ---------------------------------------------------------------------- */ template void PairBuckIntel::eval(const int offload, const int vflag, IntelBuffers *buffers, const ForceConst &fc, const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; int nlocal, nall, minlocal; fix->get_buffern(offload, nlocal, nall, minlocal); const int ago = neighbor->ago; IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); ATOM_T * _noalias const x = buffers->get_x(offload); const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int ** _noalias const firstneigh = (const int **)list->firstneigh; const flt_t * _noalias const special_lj = fc.special_lj; const C_FORCE_T * _noalias const c_force = fc.c_force[0]; const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0]; const int ntypes = atom->ntypes + 1; const int eatom = this->eflag_atom; // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; acc_t * _noalias ev_global; IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); const int nthreads = tc; #ifdef _LMP_INTEL_OFFLOAD int *overflow = fix->get_off_overflow_flag(); double *timer_compute = fix->off_watch_pair(); // Redeclare as local variables for offload if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); #pragma offload target(mic:_cop) if (offload) \ in(special_lj:length(0) alloc_if(0) free_if(0)) \ in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(ilist:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \ in(f_stride,nlocal,minlocal,separate_flag,offload) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ signal(f_start) #endif { #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime(); #endif IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; if (EFLAG || vflag) oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; if (NEWTON_PAIR == 0 && inum != nlocal) memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { int iifrom, iip, iito, tid; IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; int foff; if (NEWTON_PAIR) foff = tid * f_stride - minlocal; else foff = -minlocal; FORCE_T * _noalias const f = f_start + foff; if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); for (int ii = iifrom; ii < iito; ii += iip) { const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; const int * _noalias const jlist = firstneigh[i]; int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; fxtmp = fytmp = fztmp = (acc_t)0; if (EFLAG) fwtmp = sevdwl = (acc_t)0; if (NEWTON_PAIR == 0) if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) #if defined(USE_OMP_SIMD) #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) #else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) #endif #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcebuck, evdwl; forcebuck = evdwl = (flt_t)0.0; const int sbindex = jlist[jj] >> SBBITS & 3; const int j = jlist[jj] & NEIGHMASK; const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; const int jtype = x[j].w; const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t r = sqrt(rsq); const flt_t r2inv = (flt_t)1.0 / rsq; #ifdef INTEL_VMASK if (rsq < c_forcei[jtype].cutsq) { #endif const flt_t r6inv = r2inv * r2inv * r2inv; const flt_t rexp = exp(-r * c_forcei[jtype].rhoinv); forcebuck = r * rexp * c_forcei[jtype].buck1 - r6inv * c_forcei[jtype].buck2; #ifndef INTEL_VMASK if (rsq > c_forcei[jtype].cutsq) forcebuck =(flt_t)0.0; #endif if (EFLAG) { evdwl = rexp * c_energyi[jtype].a - r6inv * c_energyi[jtype].c - c_energyi[jtype].offset; #ifndef INTEL_VMASK if (rsq > c_forcei[jtype].cutsq) evdwl =(flt_t)0.0; #endif } if (sbindex) { const flt_t factor_lj = special_lj[sbindex]; forcebuck *= factor_lj; if (EFLAG) evdwl *= factor_lj; } const flt_t fpair = forcebuck * r2inv; const flt_t fpx = fpair * delx; fxtmp += fpx; if (NEWTON_PAIR) f[j].x -= fpx; const flt_t fpy = fpair * dely; fytmp += fpy; if (NEWTON_PAIR) f[j].y -= fpy; const flt_t fpz = fpair * delz; fztmp += fpz; if (NEWTON_PAIR) f[j].z -= fpz; if (EFLAG) { sevdwl += evdwl; if (eatom) { fwtmp += (flt_t)0.5 * evdwl; if (NEWTON_PAIR) f[j].w += (flt_t)0.5 * evdwl; } } if (NEWTON_PAIR == 0) IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); #ifdef INTEL_VMASK } #endif } // for jj if (NEWTON_PAIR) { f[i].x += fxtmp; f[i].y += fytmp; f[i].z += fztmp; } else { f[i].x = fxtmp; f[i].y = fytmp; f[i].z = fztmp; } IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); } // end of omp parallel region IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { oevdwl *= (acc_t)0.5; ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; ov2 *= (acc_t)0.5; ov3 *= (acc_t)0.5; ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } ev_global[0] = oevdwl; ev_global[1] = (acc_t)0; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; ev_global[5] = ov3; ev_global[6] = ov4; ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; #endif } // end of offload region if (offload) fix->stop_watch(TIME_OFFLOAD_LATENCY); else fix->stop_watch(TIME_HOST_PAIR); if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } void PairBuckIntel::init_style() { PairBuck::init_style(); // augment neighbor list request if (force->newton_pair == 0) neighbor->find_request(this)->enable_full(); fix = static_cast(modify->get_fix_by_id("package_intel")); if (!fix) error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix->pair_init_check(); #ifdef _LMP_INTEL_OFFLOAD _cop = fix->coprocessor_number(); #endif if (fix->precision() == FixIntel::PREC_MODE_MIXED) pack_force_const(force_const_single, fix->get_mixed_buffers()); else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) pack_force_const(force_const_double, fix->get_double_buffers()); else pack_force_const(force_const_single, fix->get_single_buffers()); } template void PairBuckIntel::pack_force_const(ForceConst &fc, IntelBuffers *buffers) { int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1, memory, _cop); // Repeat cutsq calculation because done after call to init_style for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { double cut; if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i, j); else cut = 0.0; cutsq[i][j] = cutsq[j][i] = cut*cut; } } for (int i = 0; i < 4; i++) { fc.special_lj[i] = force->special_lj[i]; fc.special_lj[0] = 1.0; } for (int i = 1; i < tp1; i++) { for (int j = 1; j < tp1; j++) { fc.c_force[i][j].buck1 = buck1[i][j]; fc.c_force[i][j].buck2 = buck2[i][j]; fc.c_force[i][j].rhoinv = rhoinv[i][j]; fc.c_force[i][j].cutsq = cutsq[i][j]; fc.c_energy[i][j].a = a[i][j]; fc.c_energy[i][j].c = c[i][j]; fc.c_energy[i][j].offset = offset[i][j]; } } #ifdef _LMP_INTEL_OFFLOAD if (_cop < 0) return; flt_t * special_lj = fc.special_lj; C_FORCE_T * c_force = fc.c_force[0]; C_ENERGY_T * c_energy = fc.c_energy[0]; int tp1sq = tp1 * tp1; #pragma offload_transfer target(mic:_cop) \ in(special_lj: length(4) alloc_if(0) free_if(0)) \ in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) #endif } /* ---------------------------------------------------------------------- */ template void PairBuckIntel::ForceConst::set_ntypes(const int ntypes, Memory *memory, const int cop) { if (memory != nullptr) _memory = memory; if ((ntypes != _ntypes )) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD flt_t * ospecial_lj = special_lj; c_force_t * oc_force = c_force[0]; c_energy_t * oc_energy = c_energy[0]; if (ospecial_lj != nullptr && oc_force != nullptr && oc_energy != nullptr && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: alloc_if(0) free_if(1)) \ nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) } #endif _memory->destroy(c_force); _memory->destroy(c_energy); } if (ntypes > 0) { _cop = cop; _memory->create(c_force,ntypes,ntypes,"fc.c_force"); _memory->create(c_energy,ntypes,ntypes,"fc.c_energy"); #ifdef _LMP_INTEL_OFFLOAD flt_t * ospecial_lj = special_lj; c_force_t * oc_force = c_force[0]; c_energy_t * oc_energy = c_energy[0]; int tp1sq = ntypes*ntypes; if (ospecial_lj != nullptr && oc_force != nullptr && oc_energy != nullptr && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \ nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0)) } #endif } } _ntypes=ntypes; }