diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index d88d4cb377..1964ca53b1 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -226,6 +226,7 @@ void IntelBuffers::free_local() { if (_off_map_maxlocal > 0) { int * cnumneigh = _cnumneigh; + int * atombin = _atombin; #ifdef _LMP_INTEL_OFFLOAD if (_off_map_ilist != NULL) { const int * ilist = _off_map_ilist; @@ -233,11 +234,12 @@ void IntelBuffers::free_local() _off_map_ilist = NULL; if (numneigh != 0 && ilist != 0) { #pragma offload_transfer target(mic:_cop) \ - nocopy(ilist,numneigh,cnumneigh:alloc_if(0) free_if(1)) + nocopy(ilist,numneigh,cnumneigh,atombin:alloc_if(0) free_if(1)) } } #endif lmp->memory->destroy(cnumneigh); + lmp->memory->destroy(atombin); _off_map_maxlocal = 0; } } @@ -251,6 +253,7 @@ void IntelBuffers::_grow_local(NeighList *list, free_local(); int size = list->get_maxlocal(); lmp->memory->create(_cnumneigh, size, "_cnumneigh"); + lmp->memory->create(_atombin, size, "_atombin"); _off_map_maxlocal = size; #ifdef _LMP_INTEL_OFFLOAD @@ -258,11 +261,13 @@ void IntelBuffers::_grow_local(NeighList *list, int * numneigh = list->numneigh; int * ilist = list->ilist; int * cnumneigh = _cnumneigh; - if (cnumneigh != 0) { + int * atombin = _atombin; + if (cnumneigh != 0 && atombin != 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(ilist:length(size) alloc_if(1) free_if(0)) \ nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \ - nocopy(cnumneigh:length(size) alloc_if(1) free_if(0)) + nocopy(cnumneigh:length(size) alloc_if(1) free_if(0)) \ + nocopy(atombin:length(size) alloc_if(1) free_if(0)) } _off_map_ilist = ilist; _off_map_numneigh = numneigh; @@ -420,7 +425,7 @@ double IntelBuffers::memory_usage(const int nthreads) if (_off_f) tmem += fstride*_off_threads * sizeof(vec3_acc_t); #endif - tmem += _off_map_maxlocal * sizeof(int); + tmem += _off_map_maxlocal * sizeof(int) * 2; tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int); tmem += _ntypes * _ntypes * sizeof(int); diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index 4b60c87f04..19f10f9559 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -126,6 +126,7 @@ class IntelBuffers { inline int * firstneigh(const NeighList *list) { return _list_alloc; } inline int * cnumneigh(const NeighList *list) { return _cnumneigh; } + inline int * get_atombin() { return _atombin; } inline atom_t * get_x(const int offload = 1) { #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers && offload == 0) return _host_x; @@ -249,6 +250,7 @@ class IntelBuffers { int _list_alloc_atoms; int * _list_alloc; int * _cnumneigh; + int * _atombin; flt_t **_cutneighsq; int _ntypes; diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp index 162a73c83f..f4ca6b3a1e 100644 --- a/src/USER-INTEL/neigh_half_bin_intel.cpp +++ b/src/USER-INTEL/neigh_half_bin_intel.cpp @@ -32,47 +32,6 @@ using namespace LAMMPS_NS; #pragma offload_attribute(push,target(mic)) #endif -template -inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2, - const flt_t bboxlo0, const flt_t bboxlo1, - const flt_t bboxlo2, const flt_t bboxhi0, - const flt_t bboxhi1, const flt_t bboxhi2, - const flt_t bininvx, const flt_t bininvy, - const flt_t bininvz, const int nbinx, const int nbiny, - const int nbinz, const int mbinx, const int mbiny, - const int mbinz, const int mbinxlo, const int mbinylo, - const int mbinzlo) -{ - int ix, iy, iz; - - if (x0 >= bboxhi0) - ix = static_cast ((x0 - bboxhi0) * bininvx) + nbinx; - else if (x0 >= bboxlo0) { - ix = static_cast ((x0 - bboxlo0) * bininvx); - ix = MIN(ix, nbinx-1); - } else - ix = static_cast ((x0 - bboxlo0) * bininvx) - 1; - - if (x1 >= bboxhi1) - iy = static_cast ((x1 - bboxhi1) * bininvy) + nbiny; - else if (x1 >= bboxlo1) { - iy = static_cast ((x1 - bboxlo1) * bininvy); - iy = MIN(iy, nbiny-1); - } else - iy = static_cast ((x1 - bboxlo1) * bininvy) - 1; - - if (x2 >= bboxhi2) - iz = static_cast ((x2 - bboxhi2) * bininvz) + nbinz; - else if (x2 >= bboxlo2) { - iz = static_cast ((x2 - bboxlo2) * bininvz); - iz = MIN(iz, nbinz - 1); - } else - iz = static_cast ((x2 - bboxlo2) * bininvz) - 1; - - return (iz - mbinzlo) * mbiny * mbinx + (iy - mbinylo) * mbinx + - (ix - mbinxlo); -} - #define ofind_special(which, special, nspecial, i, tag, special_flag) \ { \ which = 0; \ @@ -104,20 +63,14 @@ inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2, #endif template -void Neighbor::bin_atoms(void * xin) { +void Neighbor::bin_atoms(void * xin, int * _noalias const atombin) { const ATOM_T * _noalias const x = (const ATOM_T * _noalias const)xin; int nlocal = atom->nlocal; const int nall = nlocal + atom->nghost; - const flt_t bininvx = this->bininvx; - const flt_t bininvy = this->bininvy; - const flt_t bininvz = this->bininvz; - const flt_t bboxlo0 = this->bboxlo[0]; - const flt_t bboxlo1 = this->bboxlo[1]; - const flt_t bboxlo2 = this->bboxlo[2]; - const flt_t bboxhi0 = this->bboxhi[0]; - const flt_t bboxhi1 = this->bboxhi[1]; - const flt_t bboxhi2 = this->bboxhi[2]; + const double sboxlo0 = bboxlo[0] + mbinxlo/bininvx; + const double sboxlo1 = bboxlo[1] + mbinylo/bininvy; + const double sboxlo2 = bboxlo[2] + mbinzlo/bininvz; int i, ibin; @@ -129,25 +82,26 @@ void Neighbor::bin_atoms(void * xin) { int bitmask = group->bitmask[includegroup]; for (i = nall-1; i >= nlocal; i--) { if (mask[i] & bitmask) { - ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, - bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, nbinx, nbiny, - nbinz, mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo); + ibin = coord2bin(atom->x[i]); bins[i] = binhead[ibin]; binhead[ibin] = i; } } for (i = atom->nfirst-1; i >= 0; i--) { - ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, - bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, nbinx, nbiny, - nbinz, mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo); + ibin = coord2bin(atom->x[i]); + atombin[i] = ibin; bins[i] = binhead[ibin]; binhead[ibin] = i; } } else { - for (i = nall-1; i >= 0; i--) { - ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, - bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, nbinx, nbiny, - nbinz, mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo); + for (i = nall-1; i >= nlocal; i--) { + ibin = coord2bin(atom->x[i]); + bins[i] = binhead[ibin]; + binhead[ibin] = i; + } + for (i = nlocal-1; i >= 0; i--) { + ibin = coord2bin(atom->x[i]); + atombin[i]=ibin; bins[i] = binhead[ibin]; binhead[ibin] = i; } @@ -228,7 +182,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, fix->stop_watch(TIME_PACK); fix->start_watch(TIME_HOST_NEIGHBOR); - bin_atoms(buffers->get_x()); + bin_atoms(buffers->get_x(), buffers->get_atombin()); if (INTEL_MIC_NBOR_PAD > 1) pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); } else { @@ -295,22 +249,13 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, } const int nthreads = tnum; const int maxnbors = buffers->get_max_nbors(); + int * _noalias const atombin = buffers->get_atombin(); - const flt_t bboxlo0 = this->bboxlo[0]; - const flt_t bboxlo1 = this->bboxlo[1]; - const flt_t bboxlo2 = this->bboxlo[2]; - const flt_t bboxhi0 = this->bboxhi[0]; - const flt_t bboxhi1 = this->bboxhi[1]; - const flt_t bboxhi2 = this->bboxhi[2]; - const flt_t bininvx = this->bininvx; - const flt_t bininvy = this->bininvy; - const flt_t bininvz = this->bininvz; - // Make sure dummy coordinates to eliminate loop remainder not within cutoff { - const flt_t dx = (INTEL_BIGP - bboxhi0); - const flt_t dy = (INTEL_BIGP - bboxhi1); - const flt_t dz = (INTEL_BIGP - bboxhi2); + const flt_t dx = (INTEL_BIGP - bboxhi[0]); + const flt_t dy = (INTEL_BIGP - bboxhi[1]); + const flt_t dz = (INTEL_BIGP - bboxhi[2]); if (dx * dx + dy * dy + dz * dz < static_cast(cutneighmaxsq)) error->one(FLERR, "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); @@ -319,15 +264,6 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, #ifdef _LMP_INTEL_OFFLOAD const int * _noalias const binhead = this->binhead; const int * _noalias const special_flag = this->special_flag; - const int nbinx = this->nbinx; - const int nbiny = this->nbiny; - const int nbinz = this->nbinz; - const int mbinxlo = this->mbinxlo; - const int mbinylo = this->mbinylo; - const int mbinzlo = this->mbinzlo; - const int mbinx = this->mbinx; - const int mbiny = this->mbiny; - const int mbinz = this->mbinz; const int * _noalias const bins = this->bins; const int cop = fix->coprocessor_number(); const int separate_buffers = fix->separate_buffers(); @@ -343,12 +279,11 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ out(numneigh:length(0) alloc_if(0) free_if(0)) \ in(ilist:length(0) alloc_if(0) free_if(0)) \ + in(atombin:length(aend) alloc_if(0) free_if(0)) \ in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ in(special_flag:length(0) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz) \ - in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload) \ - in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \ - in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \ + in(maxnbors,nthreads,maxspecial,nstencil,pad_width,offload) \ + in(separate_buffers, astart, aend, nlocal, molecular, ntypes) \ out(overflow:length(5) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ signal(numneigh) @@ -402,10 +337,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, // stores own/own pairs only once // stores own/ghost pairs on both procs - ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, - bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, - nbinx, nbiny, nbinz, mbinx, mbiny, mbinz, - mbinxlo, mbinylo, mbinzlo); + ibin = atombin[i]; for (k = 0; k < nstencil; k++) { for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) { @@ -648,7 +580,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, fix->stop_watch(TIME_PACK); fix->start_watch(TIME_HOST_NEIGHBOR); - bin_atoms(buffers->get_x()); + bin_atoms(buffers->get_x(), buffers->get_atombin()); if (INTEL_MIC_NBOR_PAD > 1) pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); } else { @@ -718,21 +650,13 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, } const int nthreads = tnum; const int maxnbors = buffers->get_max_nbors(); + int * _noalias const atombin = buffers->get_atombin(); - const flt_t bboxlo0 = this->bboxlo[0]; - const flt_t bboxlo1 = this->bboxlo[1]; - const flt_t bboxlo2 = this->bboxlo[2]; - const flt_t bboxhi0 = this->bboxhi[0]; - const flt_t bboxhi1 = this->bboxhi[1]; - const flt_t bboxhi2 = this->bboxhi[2]; - const flt_t bininvx = this->bininvx; - const flt_t bininvy = this->bininvy; - const flt_t bininvz = this->bininvz; // Make sure dummy coordinates to eliminate loop remainder not within cutoff { - const flt_t dx = (INTEL_BIGP - bboxhi0); - const flt_t dy = (INTEL_BIGP - bboxhi1); - const flt_t dz = (INTEL_BIGP - bboxhi2); + const flt_t dx = (INTEL_BIGP - bboxhi[0]); + const flt_t dy = (INTEL_BIGP - bboxhi[1]); + const flt_t dz = (INTEL_BIGP - bboxhi[2]); if (dx * dx + dy * dy + dz * dz < static_cast(cutneighmaxsq)) error->one(FLERR, "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); @@ -741,15 +665,6 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, #ifdef _LMP_INTEL_OFFLOAD const int * _noalias const binhead = this->binhead; const int * _noalias const special_flag = this->special_flag; - const int nbinx = this->nbinx; - const int nbiny = this->nbiny; - const int nbinz = this->nbinz; - const int mbinxlo = this->mbinxlo; - const int mbinylo = this->mbinylo; - const int mbinzlo = this->mbinzlo; - const int mbinx = this->mbinx; - const int mbiny = this->mbiny; - const int mbinz = this->mbinz; const int * _noalias const bins = this->bins; const int cop = fix->coprocessor_number(); const int separate_buffers = fix->separate_buffers(); @@ -765,12 +680,11 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ out(numneigh:length(0) alloc_if(0) free_if(0)) \ in(ilist:length(0) alloc_if(0) free_if(0)) \ + in(atombin:length(aend) alloc_if(0) free_if(0)) \ in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ in(special_flag:length(0) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,e_nall,offload)\ - in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload_end) \ - in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \ - in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \ + in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pad_width) \ + in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \ out(overflow:length(5) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ signal(numneigh) @@ -861,10 +775,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, } // loop over all atoms in other bins in stencil, store every pair - ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, - bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, - nbinx, nbiny, nbinz, mbinx, mbiny, mbinz, - mbinxlo, mbinylo, mbinzlo); + ibin = atombin[i]; for (k = 0; k < nstencil; k++) { for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) { @@ -1114,7 +1025,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, fix->stop_watch(TIME_PACK); fix->start_watch(TIME_HOST_NEIGHBOR); - bin_atoms(buffers->get_x()); + bin_atoms(buffers->get_x(), buffers->get_atombin()); if (INTEL_MIC_NBOR_PAD > 1) pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); } else { @@ -1184,21 +1095,13 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, } const int nthreads = tnum; const int maxnbors = buffers->get_max_nbors(); + int * _noalias const atombin = buffers->get_atombin(); - const flt_t bboxlo0 = this->bboxlo[0]; - const flt_t bboxlo1 = this->bboxlo[1]; - const flt_t bboxlo2 = this->bboxlo[2]; - const flt_t bboxhi0 = this->bboxhi[0]; - const flt_t bboxhi1 = this->bboxhi[1]; - const flt_t bboxhi2 = this->bboxhi[2]; - const flt_t bininvx = this->bininvx; - const flt_t bininvy = this->bininvy; - const flt_t bininvz = this->bininvz; // Make sure dummy coordinates to eliminate loop remainder not within cutoff { - const flt_t dx = (INTEL_BIGP - bboxhi0); - const flt_t dy = (INTEL_BIGP - bboxhi1); - const flt_t dz = (INTEL_BIGP - bboxhi2); + const flt_t dx = (INTEL_BIGP - bboxhi[0]); + const flt_t dy = (INTEL_BIGP - bboxhi[1]); + const flt_t dz = (INTEL_BIGP - bboxhi[2]); if (dx * dx + dy * dy + dz * dz < static_cast(cutneighmaxsq)) error->one(FLERR, "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); @@ -1207,15 +1110,6 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, #ifdef _LMP_INTEL_OFFLOAD const int * _noalias const binhead = this->binhead; const int * _noalias const special_flag = this->special_flag; - const int nbinx = this->nbinx; - const int nbiny = this->nbiny; - const int nbinz = this->nbinz; - const int mbinxlo = this->mbinxlo; - const int mbinylo = this->mbinylo; - const int mbinzlo = this->mbinzlo; - const int mbinx = this->mbinx; - const int mbiny = this->mbiny; - const int mbinz = this->mbinz; const int * _noalias const bins = this->bins; const int cop = fix->coprocessor_number(); const int separate_buffers = fix->separate_buffers(); @@ -1231,12 +1125,11 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ out(numneigh:length(0) alloc_if(0) free_if(0)) \ in(ilist:length(0) alloc_if(0) free_if(0)) \ + in(atombin:length(aend) alloc_if(0) free_if(0)) \ in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ in(special_flag:length(0) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,offload_end) \ - in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,e_nall,offload) \ - in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \ - in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \ + in(maxnbors,nthreads,maxspecial,nstencil,offload_end,pad_width,e_nall) \ + in(offload,separate_buffers, astart, aend, nlocal, molecular, ntypes) \ out(overflow:length(5) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ signal(numneigh) @@ -1291,10 +1184,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, // (equal zyx and j <= i) // latter excludes self-self interaction but allows superposed atoms - ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, - bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, - nbinx, nbiny, nbinz, mbinx, mbiny, mbinz, - mbinxlo, mbinylo, mbinzlo); + ibin = atombin[i]; for (k = 0; k < nstencil; k++) { for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) { diff --git a/src/accelerator_intel.h b/src/accelerator_intel.h index 97823dfe30..2f0179eb4b 100644 --- a/src/accelerator_intel.h +++ b/src/accelerator_intel.h @@ -32,7 +32,7 @@ void *fix_intel; template - void bin_atoms(void *); + void bin_atoms(void *, int *); template void hbni(const int, NeighList *, void *, const int, const int, void *,