lammps/src/USER-INTEL/npair_full_bin_ghost_intel.cpp

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: W. Michael Brown (Intel)
------------------------------------------------------------------------- */

#include "npair_full_bin_ghost_intel.h"

#include "atom.h"
#include "comm.h"
#include "error.h"
#include "molecule.h"
#include "neigh_list.h"
#include "neighbor.h"

using namespace LAMMPS_NS;

/* ---------------------------------------------------------------------- */

NPairFullBinGhostIntel::NPairFullBinGhostIntel(LAMMPS *lmp) : NPairIntel(lmp) {}

/* ----------------------------------------------------------------------
   binned neighbor list construction for all neighbors
   include neighbors of ghost atoms, but no "special neighbors" for ghosts
   every neighbor pair appears in list of both atoms i and j
------------------------------------------------------------------------- */

void NPairFullBinGhostIntel::build(NeighList *list)
{
  #ifdef _LMP_INTEL_OFFLOAD
  if (_fix->offload_noghost())
    error->all(FLERR,
      "The 'ghost no' option cannot be used with this USER-INTEL pair style.");
  #endif

  if (nstencil > INTEL_MAX_STENCIL_CHECK)
    error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");

  #ifdef _LMP_INTEL_OFFLOAD
  if (exclude)
    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
  #endif

  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
    fbi(list, _fix->get_mixed_buffers());
  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
    fbi(list, _fix->get_double_buffers());
  else
    fbi(list, _fix->get_single_buffers());

  _fix->stop_watch(TIME_HOST_NEIGHBOR);
}

/* ---------------------------------------------------------------------- */

template<class flt_t, class acc_t>
void NPairFullBinGhostIntel::fbi(NeighList * list,
                                 IntelBuffers<flt_t,acc_t> * buffers)
{
  const int nlocal = atom->nlocal;
  const int nall = atom->nlocal + atom->nghost;
  list->inum = atom->nlocal;
  list->gnum = atom->nghost;

  int host_start = _fix->host_start_neighbor();
  const int off_end = _fix->offload_end_neighbor();

  #ifdef _LMP_INTEL_OFFLOAD
  if (off_end) grow_stencil();
  if (_fix->full_host_list()) host_start = 0;
  int offload_noghost = _fix->offload_noghost();
  #endif

  // only uses offload_end_neighbor to check whether we are doing offloading
  // at all, no need to correct this later
  buffers->grow_list(list, nall, comm->nthreads, 0, off_end,
                     _fix->nbor_pack_width());

  int need_ic = 0;
  if (atom->molecular != Atom::ATOMIC)
    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
                         neighbor->cutneighmax);

  if (need_ic) {
    fbi<flt_t,acc_t,1>(1, list, buffers, 0, off_end);
    fbi<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
  } else {
    fbi<flt_t,acc_t,0>(1, list, buffers, 0, off_end);
    fbi<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
  }
}

/* ---------------------------------------------------------------------- */

template<class flt_t, class acc_t, int need_ic>
void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
                                 IntelBuffers<flt_t,acc_t> * buffers,
                                 const int pstart, const int pend) {
  if (pend-pstart == 0) return;

  const int nall = atom->nlocal + atom->nghost;
  int nall_t = nall;
  const int aend = nall;

  const ATOM_T * _noalias const x = buffers->get_x();
  int * _noalias const intel_list = buffers->intel_list(list);
  int ** _noalias const firstneigh = list->firstneigh;
  const int e_nall = nall_t;

  const int molecular = atom->molecular;
  int *ns = nullptr;
  tagint *s = nullptr;
  int tag_size = 0, special_size;
  if (buffers->need_tag()) tag_size = e_nall;
  if (molecular != Atom::ATOMIC) {
    s = atom->special[0];
    ns = atom->nspecial[0];
    special_size = aend;
  } else {
    s = &buffers->_special_holder;
    ns = &buffers->_nspecial_holder;
    special_size = 0;
  }
  const tagint * _noalias const special = s;
  const int * _noalias const nspecial = ns;
  const int maxspecial = atom->maxspecial;
  const tagint * _noalias const tag = atom->tag;

  int * _noalias const ilist = list->ilist;
  int * _noalias numneigh = list->numneigh;
  const int nstencil = this->nstencil;
  const int * _noalias const stencil = this->stencil;
  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
  const flt_t * _noalias const cutneighghostsq =
    buffers->get_cutneighghostsq()[0];
  const int ntypes = atom->ntypes + 1;
  const int nlocal = atom->nlocal;

  #ifndef _LMP_INTEL_OFFLOAD
  int * _noalias const mask = atom->mask;
  tagint * _noalias const molecule = atom->molecule;
  #endif

  int moltemplate;
  if (molecular == Atom::TEMPLATE) moltemplate = 1;
  else moltemplate = 0;
  if (moltemplate)
    error->all(FLERR,
               "Can't use moltemplate with npair style full/bin/ghost/intel.");

  int tnum;
  int * _noalias overflow;
  #ifdef _LMP_INTEL_OFFLOAD
  double *timer_compute;
  if (offload) {
    timer_compute = _fix->off_watch_neighbor();
    tnum = buffers->get_off_threads();
    overflow = _fix->get_off_overflow_flag();
    _fix->stop_watch(TIME_HOST_NEIGHBOR);
    _fix->start_watch(TIME_OFFLOAD_LATENCY);
  } else
  #endif
  {
    tnum = comm->nthreads;
    overflow = _fix->get_overflow_flag();
  }
  const int nthreads = tnum;
  const int maxnbors = buffers->get_max_nbors();
  int * _noalias const atombin = buffers->get_atombin();
  const int * _noalias const binpacked = buffers->get_binpacked();

  const int xperiodic = domain->xperiodic;
  const int yperiodic = domain->yperiodic;
  const int zperiodic = domain->zperiodic;
  const flt_t xprd_half = domain->xprd_half;
  const flt_t yprd_half = domain->yprd_half;
  const flt_t zprd_half = domain->zprd_half;

  flt_t * _noalias const ncachex = buffers->get_ncachex();
  flt_t * _noalias const ncachey = buffers->get_ncachey();
  flt_t * _noalias const ncachez = buffers->get_ncachez();
  int * _noalias const ncachej = buffers->get_ncachej();
  int * _noalias const ncachejtype = buffers->get_ncachejtype();
  tagint * _noalias const ncachetag = buffers->get_ncachetag();
  const int ncache_stride = buffers->ncache_stride();

  const int mbinx = this->mbinx;
  const int mbiny = this->mbiny;
  const int mbinz = this->mbinz;
  const int * _noalias const stencilxyz = &this->stencilxyz[0][0];

  int sb = 1;
  if (special_flag[1] == 0) {
    sb = 2;
    if (special_flag[2] == 0) {
      sb = 3;
      if (special_flag[3] == 0)
        sb = 4;
    }
  }
  const int special_bound = sb;

  #ifdef _LMP_INTEL_OFFLOAD
  const int * _noalias const binhead = this->binhead;
  const int * _noalias const bins = this->bins;
  const int cop = _fix->coprocessor_number();
  const int separate_buffers = _fix->separate_buffers();
  #pragma offload target(mic:cop) if(offload) \
    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
    in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
    in(intel_list:length(0) alloc_if(0) free_if(0)) \
    in(numneigh:length(0) alloc_if(0) free_if(0)) \
    in(ilist:length(0) alloc_if(0) free_if(0)) \
    in(atombin:length(aend) alloc_if(0) free_if(0)) \
    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
    in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
    in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
    in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny,special_bound)\
    in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
    in(stencilxyz:length(3*nstencil)) \
    out(overflow:length(5) alloc_if(0) free_if(0)) \
    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
    signal(tag)
  #endif
  {
    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif

    #ifdef _LMP_INTEL_OFFLOAD
    overflow[LMP_LOCAL_MIN] = 0;
    overflow[LMP_LOCAL_MAX] = aend - 1;
    overflow[LMP_GHOST_MIN] = e_nall;
    overflow[LMP_GHOST_MAX] = -1;
    #endif

    int nstencilp = 0;
    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
    for (int k = 0; k < nstencil; k++) {
      binstart[nstencilp] = stencil[k];
      int end = stencil[k] + 1;
      for (int kk = k + 1; kk < nstencil; kk++) {
        if (stencil[kk-1]+1 == stencil[kk]) {
          end++;
          k++;
        } else break;
      }
      binend[nstencilp] = end;
      nstencilp++;
    }

    const int mbinyx = mbiny * mbinx;

    #if defined(_OPENMP)
    #pragma omp parallel
    #endif
    {
      const int num = aend;
      int tid, ifrom, ito;

      const double balance_factor = 2.0;
      const double ibalance_factor = 1.0 / balance_factor;
      const int gnum = num - nlocal;
      const int wlocal = static_cast<int>(ceil(balance_factor * nlocal));
      const int snum = wlocal + gnum;
      IP_PRE_omp_range_id(ifrom, ito, tid, snum, nthreads);
      if (ifrom < wlocal) ifrom = static_cast<int>(ibalance_factor * ifrom);
      else ifrom -= wlocal - nlocal;
      if (ito < wlocal) ito = static_cast<int>(ibalance_factor * ito);
      else ito -= wlocal - nlocal;

      int e_ito = ito;
      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;

      int pack_offset = maxnbors;
      int ct = (ifrom + tid * 2) * maxnbors;
      int * _noalias neighptr = intel_list + ct;
      const int obound = pack_offset + maxnbors * 2;

      const int toffs = tid * ncache_stride;
      flt_t * _noalias const tx = ncachex + toffs;
      flt_t * _noalias const ty = ncachey + toffs;
      flt_t * _noalias const tz = ncachez + toffs;
      int * _noalias const tj = ncachej + toffs;
      int * _noalias const tjtype = ncachejtype + toffs;
      tagint * _noalias const ttag = ncachetag + toffs;

      // loop over all atoms in other bins in stencil, store every pair
      int ncount, oldbin = -9999999;
      for (int i = ifrom; i < ito; i++) {
        const flt_t xtmp = x[i].x;
        const flt_t ytmp = x[i].y;
        const flt_t ztmp = x[i].z;
        const int itype = x[i].w;
        const tagint itag = tag[i];
        const int ioffset = ntypes * itype;

        const int ibin = atombin[i];
        if (ibin != oldbin) {
          oldbin = ibin;
          ncount = 0;
          if (i < nlocal) {
            for (int k = 0; k < nstencilp; k++) {
              const int bstart = binhead[ibin + binstart[k]];
              const int bend = binhead[ibin + binend[k]];
              #if defined(LMP_SIMD_COMPILER)
              #pragma simd
              #endif
              for (int jj = bstart; jj < bend; jj++)
                tj[ncount++] = binpacked[jj];
            }
          } else {
            const int zbin = ibin / mbinyx;
            const int zrem = ibin % mbinyx;
            const int ybin = zrem / mbinx;
            const int xbin = zrem % mbinx;
            for (int k = 0; k < nstencil; k++) {
              const int xbin2 = xbin + stencilxyz[3 * k + 0];
              const int ybin2 = ybin + stencilxyz[3 * k + 1];
              const int zbin2 = zbin + stencilxyz[3 * k + 2];
              if (xbin2 < 0 || xbin2 >= mbinx ||
                  ybin2 < 0 || ybin2 >= mbiny ||
                  zbin2 < 0 || zbin2 >= mbinz) continue;

              const int bstart = binhead[ibin + stencil[k]];
              const int bend = binhead[ibin + stencil[k] + 1];
              #if defined(LMP_SIMD_COMPILER)
              #pragma simd
              #endif
              for (int jj = bstart; jj < bend; jj++)
                tj[ncount++] = binpacked[jj];
            }
          } // if i < nlocal
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #pragma simd
          #endif
          for (int u = 0; u < ncount; u++) {
            const int j = tj[u];
            tx[u] = x[j].x;
            ty[u] = x[j].y;
            tz[u] = x[j].z;
            tjtype[u] = x[j].w;
            ttag[u] = tag[j];
          }
        } // if ibin != oldbin

        // ---------------------- Loop over other bins

        int n = maxnbors;
        int n2 = n * 2;
        int * _noalias neighptr2 = neighptr;
        const flt_t * _noalias cutsq;
        if (i < nlocal) cutsq = cutneighsq;
        else cutsq = cutneighghostsq;

        const int icp = i;

        #if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
        #endif
        for (int u = 0; u < ncount; u++) {
          int addme = 1;
          int j = tj[u];

          if (i == j) addme = 0;

          // Cutoff Check
          const flt_t delx = xtmp - tx[u];
          const flt_t dely = ytmp - ty[u];
          const flt_t delz = ztmp - tz[u];
          const int jtype = tjtype[u];
          const tagint jtag = ttag[u];
          const flt_t rsq = delx * delx + dely * dely + delz * delz;
          if (rsq > cutsq[ioffset + jtype]) addme = 0;

          if (need_ic && icp < nlocal) {
            int no_special;
            ominimum_image_check(no_special, delx, dely, delz);
            if (no_special)
              j = -j - 1;
          }

          int flist = 0;
          if (itag > jtag) {
            if (((itag+jtag) & 1) == 0) flist = 1;
          } else if (itag < jtag) {
            if (((itag+jtag) & 1) == 1) flist = 1;
          } else {
            if (tz[u] < ztmp) flist = 1;
            else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
            else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
              flist = 1;
          }
          if (addme) {
            if (flist)
              neighptr2[n2++] = j;
            else
              neighptr[n++] = j;
          }
        } // for u

        if ((molecular != Atom::ATOMIC) && (i < nlocal)) {
          int alln = n;
          n = 0;
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #ifdef LMP_INTEL_NBOR_COMPAT
          #pragma ivdep
          #else
          #pragma simd
          #endif
          #endif
          for (int u = 0; u < alln; u++) {
            int which;
            int addme = 1;
            int j = neighptr[u];
            if (need_ic && j < 0) {
              which = 0;
              j = -j - 1;
            } else
              ofind_special(which, special, nspecial, i, tag[j]);
            if (which) {
              j = j ^ (which << SBBITS);
              if (which < special_bound) addme = 0;
            }
            #ifdef LMP_INTEL_NBOR_COMPAT
            if (addme) neighptr2[n++] = j;
            #else
            neighptr2[n++] = j;
            #endif
          }
          alln = n2;
          n2 = maxnbors * 2;
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #ifdef LMP_INTEL_NBOR_COMPAT
          #pragma ivdep
          #else
          #pragma simd
          #endif
          #endif
          for (int u = n2; u < alln; u++) {
            int which;
            int addme = 1;
            int j = neighptr[u];
            if (need_ic && j < 0) {
              which = 0;
              j = -j - 1;
            } else
              ofind_special(which, special, nspecial, i, tag[j]);
            if (which) {
              j = j ^ (which << SBBITS);
              if (which < special_bound) addme = 0;
            }
            #ifdef LMP_INTEL_NBOR_COMPAT
            if (addme) neighptr2[n2++] = j;
            #else
            neighptr2[n2++] = j;
            #endif
          }
        }

        #ifndef _LMP_INTEL_OFFLOAD
        if (exclude) {
          int alln = n;
          n = maxnbors;
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #pragma ivdep
          #endif
          for (int u = n; u < alln; u++) {
            int addme = 1;
            const int js = neighptr[u];
            const int j = js & NEIGHMASK;
            const int jtype = x[j].w;
            if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
            if (addme) neighptr2[n++] = js;
          }
          alln = n2;
          n2 = maxnbors * 2;
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #pragma ivdep
          #endif
          for (int u = n2; u < alln; u++) {
            int addme = 1;
            const int js = neighptr[u];
            const int j = js & NEIGHMASK;
            const int jtype = x[j].w;
            if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
            if (addme) neighptr2[n2++] = js;
          }
        }
        #endif

        int ns = n - maxnbors;
        int alln = n;
        atombin[i] = ns;
        n = 0;
        for (int u = maxnbors; u < alln; u++)
          neighptr[n++] = neighptr2[u];
        ns += n2 - maxnbors * 2;
        for (int u = maxnbors * 2; u < n2; u++)
          neighptr[n++] = neighptr2[u];
        if (ns > maxnbors) *overflow = 1;

        ilist[i] = i;
        firstneigh[i] = intel_list + ct;
        numneigh[i] = ns;

        ct += ns;
        IP_PRE_edge_align(ct, sizeof(int));
        neighptr = intel_list + ct;
        if (ct + obound > list_size) {
          if (i < ito - 1) {
            *overflow = 1;
            ct = (ifrom + tid * 2) * maxnbors;
          }
        }
      }

      if (*overflow == 1)
        for (int i = ifrom; i < ito; i++)
          numneigh[i] = 0;

      #ifdef _LMP_INTEL_OFFLOAD
      if (separate_buffers) {
        overflow[LMP_LOCAL_MIN] = 0;
        overflow[LMP_LOCAL_MAX] = nlocal - 1;
        overflow[LMP_GHOST_MIN] = nlocal;
        overflow[LMP_GHOST_MAX] = e_nall - 1;
      }
      #endif
    } // end omp
    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end offload

  #ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
    _fix->start_watch(TIME_HOST_NEIGHBOR);
    firstneigh[0] = intel_list;
    for (int n = 0; n < aend; n++) {
      ilist[n] = n;
      numneigh[n] = 0;
    }
  } else {
    if (separate_buffers) {
      _fix->start_watch(TIME_PACK);
      _fix->set_neighbor_host_sizes();
      buffers->pack_sep_from_single(_fix->host_min_local(),
                                    _fix->host_used_local(),
                                    _fix->host_min_ghost(),
                                    _fix->host_used_ghost());
      _fix->stop_watch(TIME_PACK);
    }
  }
  #endif
}