/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing author: W. Michael Brown (Intel) ------------------------------------------------------------------------- */ #include #include "intel_buffers.h" #include "force.h" #include "memory.h" using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ template IntelBuffers::IntelBuffers(class LAMMPS *lmp_in) : lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0), _buf_size(0), _buf_local_size(0), _n_list_ptrs(1), _max_list_ptrs(4) { _neigh_list_ptrs = new IntelNeighListPtrs[_max_list_ptrs]; _neigh_list_ptrs[0].cnumneigh = 0; _list_alloc_atoms = 0; _ntypes = 0; _off_map_listlocal = 0; _ccachex = 0; _ncache_alloc = 0; _ncachetag = 0; _cutneighsq = 0; _cutneighghostsq = 0; #ifdef _LMP_INTEL_OFFLOAD _separate_buffers = 0; _off_f = 0; _off_map_ilist = 0; _off_map_nmax = 0; _off_list_alloc = false; _off_threads = 0; _off_ccache = 0; _off_ncache = 0; _host_nmax = 0; #endif } /* ---------------------------------------------------------------------- */ template IntelBuffers::~IntelBuffers() { free_buffers(); free_all_nbor_buffers(); free_ccache(); set_ntypes(0); delete []_neigh_list_ptrs; } /* ---------------------------------------------------------------------- */ template void IntelBuffers::free_buffers() { if (_buf_size > 0) { atom_t * x = get_x(); flt_t * q = get_q(); quat_t * quat = get_quat(); #ifdef _LMP_INTEL_OFFLOAD vec3_acc_t * f_start = get_off_f(); if (f_start != 0) { acc_t * ev_global = get_ev_global(); if (ev_global != 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(x:alloc_if(0) free_if(1)) \ nocopy(f_start:alloc_if(0) free_if(1)) \ nocopy(ev_global:alloc_if(0) free_if(1)) } if (q != 0) { #pragma offload_transfer target (mic:_cop) \ nocopy(q:alloc_if(0) free_if(1)) } if (quat != 0) { #pragma offload_transfer target (mic:_cop) \ nocopy(quat:alloc_if(0) free_if(1)) } lmp->memory->destroy(f_start); } if (_separate_buffers) { lmp->memory->destroy(_host_x); if (q != 0) lmp->memory->destroy(_host_q); if (quat != 0) lmp->memory->destroy(_host_quat); } #endif lmp->memory->destroy(x); if (q != 0) lmp->memory->destroy(q); if (quat != 0) lmp->memory->destroy(quat); lmp->memory->destroy(_f); _buf_size = _buf_local_size = 0; } } /* ---------------------------------------------------------------------- */ template void IntelBuffers::_grow(const int nall, const int nlocal, const int nthreads, const int offload_end) { free_buffers(); _buf_size = static_cast(nall) * 1.1 + 1; if (lmp->force->newton_pair) _buf_local_size = _buf_size; else _buf_local_size = static_cast(nlocal) * 1.1 + 1; const int f_stride = get_stride(_buf_local_size); lmp->memory->create(_x, _buf_size,"intel_x"); if (lmp->atom->q != NULL) lmp->memory->create(_q, _buf_size, "intel_q"); if (lmp->atom->ellipsoid != NULL) lmp->memory->create(_quat, _buf_size, "intel_quat"); #ifdef _LMP_INTEL_OFFLOAD if (lmp->force->newton_pair) #else if (lmp->force->newton_pair || lmp->atom->molecular) #endif lmp->memory->create(_f, f_stride * nthreads, "intel_f"); else lmp->memory->create(_f, f_stride, "intel_f"); #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) { lmp->memory->create(_host_x, _buf_size,"intel_host_x"); if (lmp->atom->q != NULL) lmp->memory->create(_host_q, _buf_size, "intel_host_q"); if (lmp->atom->ellipsoid != NULL) lmp->memory->create(_host_quat, _buf_size, "intel_host_quat"); } if (offload_end > 0) { int fm; if (lmp->force->newton_pair) fm = _off_threads; else fm = 1; lmp->memory->create(_off_f, f_stride * fm, "intel_off_f"); const atom_t * const x = get_x(); const flt_t * const q = get_q(); const vec3_acc_t * f_start = get_off_f(); acc_t * ev_global = get_ev_global(); if (lmp->atom->q != NULL) { if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \ nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ nocopy(ev_global:length(8) alloc_if(1) free_if(0)) } } else { if (x != NULL && f_start != NULL && ev_global != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \ nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ nocopy(ev_global:length(8) alloc_if(1) free_if(0)) } } if (lmp->atom->ellipsoid != NULL) { const quat_t * const quat = get_quat(); if (quat != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(quat:length(_buf_size) alloc_if(1) free_if(0)) } } } #endif } /* ---------------------------------------------------------------------- */ template void IntelBuffers::free_nmax() { #ifdef _LMP_INTEL_OFFLOAD if (_off_map_nmax > 0) { const tagint * tag = _off_map_tag; const tagint * special = _off_map_special; const int * nspecial = _off_map_nspecial; #pragma offload_transfer target(mic:_cop) \ nocopy(tag:alloc_if(0) free_if(1)) \ nocopy(special,nspecial:alloc_if(0) free_if(1)) _off_map_nmax = 0; _host_nmax = 0; } #endif } /* ---------------------------------------------------------------------- */ template void IntelBuffers::_grow_nmax(const int offload_end) { if (lmp->atom->molecular) _need_tag = 1; else _need_tag = 0; #ifdef _LMP_INTEL_OFFLOAD free_nmax(); int size = lmp->atom->nmax; _host_nmax = size; if (!offload_end) return; tagint *special; int *nspecial; int tag_length, special_length, nspecial_length; if (lmp->atom->molecular) { special = lmp->atom->special[0]; nspecial = lmp->atom->nspecial[0]; special_length = size * lmp->atom->maxspecial; nspecial_length = size * 3; } else { special = &_special_holder; nspecial = &_nspecial_holder; special_length = 1; nspecial_length = 1; } if (_need_tag) tag_length = size; else tag_length = 1; tagint *tag = lmp->atom->tag; #pragma offload_transfer target(mic:_cop) \ nocopy(tag:length(tag_length) alloc_if(1) free_if(0)) \ nocopy(special:length(special_length) alloc_if(1) free_if(0)) \ nocopy(nspecial:length(nspecial_length) alloc_if(1) free_if(0)) _off_map_tag = tag; _off_map_special = special; _off_map_nspecial = nspecial; _off_map_nmax = size; #endif } /* ---------------------------------------------------------------------- */ template void IntelBuffers::free_list_local() { if (_off_map_listlocal > 0) { if (_neigh_list_ptrs[0].cnumneigh) { int * cnumneigh = _neigh_list_ptrs[0].cnumneigh; _neigh_list_ptrs[0].cnumneigh = 0; #ifdef _LMP_INTEL_OFFLOAD if (_off_map_ilist != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(cnumneigh:alloc_if(0) free_if(1)) } #endif lmp->memory->destroy(cnumneigh); } #ifdef _LMP_INTEL_OFFLOAD if (_off_map_ilist != NULL) { const int * ilist = _off_map_ilist; const int * numneigh = _off_map_numneigh; const int ** firstneigh = (const int **)_off_map_firstneigh; _off_map_ilist = NULL; #pragma offload_transfer target(mic:_cop) \ nocopy(ilist,firstneigh,numneigh:alloc_if(0) free_if(1)) } #endif _off_map_listlocal = 0; } } /* ---------------------------------------------------------------------- */ template void IntelBuffers::free_list_ptrs() { for (int list_num = 1; list_num < _n_list_ptrs; list_num++) { if (_neigh_list_ptrs[list_num].size) { lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh); lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf); } _neigh_list_ptrs[list_num].size = 0; _neigh_list_ptrs[list_num].list_ptr = 0; } _n_list_ptrs = 1; } /* ---------------------------------------------------------------------- */ template void IntelBuffers::grow_data3(NeighList *list, int *&numneighhalf, int *&cnumneigh) { const int size = list->get_maxlocal(); int list_num; for (list_num = 0; list_num < _n_list_ptrs; list_num++) if (_neigh_list_ptrs[list_num].list_ptr == (void*)list) break; if (list_num == _n_list_ptrs) { if (_n_list_ptrs == _max_list_ptrs) { _max_list_ptrs *= 2; IntelNeighListPtrs *new_list = new IntelNeighListPtrs[_max_list_ptrs]; for (int i = 0; i < _n_list_ptrs; i++) new_list[i] = _neigh_list_ptrs[i]; delete []_neigh_list_ptrs; _neigh_list_ptrs = new_list; } _neigh_list_ptrs[list_num].list_ptr = (void *)list; _neigh_list_ptrs[list_num].size = 0; _n_list_ptrs++; } if (size > _neigh_list_ptrs[list_num].size) { if (_neigh_list_ptrs[list_num].size) { lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh); lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf); } lmp->memory->create(_neigh_list_ptrs[list_num].cnumneigh, size, "_cnumneigh"); lmp->memory->create(_neigh_list_ptrs[list_num].numneighhalf, size, "_cnumneigh"); _neigh_list_ptrs[list_num].size = size; } numneighhalf = _neigh_list_ptrs[list_num].numneighhalf; cnumneigh = _neigh_list_ptrs[list_num].cnumneigh; } /* ---------------------------------------------------------------------- */ template void IntelBuffers::_grow_list_local(NeighList *list, const int three_body, const int offload_end) { free_list_local(); int size = list->get_maxlocal(); _off_map_listlocal = size; if (three_body) lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, size, "_cnumneigh"); #ifdef _LMP_INTEL_OFFLOAD if (offload_end > 0) { int tb_size = size; if (three_body == 0) { lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, 16, "_cnumneigh"); tb_size = 16; } int ** firstneigh = list->firstneigh; int * numneigh = list->numneigh; int * ilist = list->ilist; int * cnumneigh = _neigh_list_ptrs[0].cnumneigh; #pragma offload_transfer target(mic:_cop) \ nocopy(ilist:length(size) alloc_if(1) free_if(0)) \ nocopy(firstneigh:length(size) alloc_if(1) free_if(0)) \ nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \ nocopy(cnumneigh:length(tb_size) alloc_if(1) free_if(0)) _off_map_ilist = ilist; _off_map_firstneigh = firstneigh; _off_map_numneigh = numneigh; } #endif } /* ---------------------------------------------------------------------- */ template void IntelBuffers::free_nbor_list() { if (_list_alloc_atoms > 0) { #ifdef _LMP_INTEL_OFFLOAD if (_off_list_alloc) { int * list_alloc = _list_alloc; #pragma offload_transfer target(mic:_cop) \ nocopy(list_alloc:alloc_if(0) free_if(1)) _off_list_alloc = false; } #endif lmp->memory->destroy(_list_alloc); _list_alloc_atoms = 0; } } /* ---------------------------------------------------------------------- */ template void IntelBuffers::_grow_nbor_list(NeighList * /*list*/, const int nlocal, const int nthreads, const int offload_end, const int pack_width) { free_nbor_list(); _list_alloc_atoms = 1.10 * nlocal; int nt = MAX(nthreads, _off_threads); int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) * get_max_nbors(); lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc"); #ifdef _LMP_INTEL_OFFLOAD if (offload_end > 0) { int * list_alloc =_list_alloc; if (list_alloc != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0)) _off_list_alloc = true; } } #endif } /* ---------------------------------------------------------------------- */ template void IntelBuffers::free_ccache() { if (_ccachex) { flt_t *ccachex = _ccachex; flt_t *ccachey = _ccachey; flt_t *ccachez = _ccachez; flt_t *ccachew = _ccachew; int *ccachei = _ccachei; int *ccachej = _ccachej; #ifdef LMP_USE_AVXCD acc_t *ccachef = _ccachef; #endif #ifdef _LMP_INTEL_OFFLOAD if (_off_ccache) { #pragma offload_transfer target(mic:_cop) \ nocopy(ccachex,ccachey,ccachez,ccachew:alloc_if(0) free_if(1)) \ nocopy(ccachei,ccachej:alloc_if(0) free_if(1)) #ifdef LMP_USE_AVXCD #pragma offload_transfer target(mic:_cop) \ nocopy(ccachef:alloc_if(0) free_if(1)) #endif } _off_ccache = 0; #endif lmp->memory->destroy(ccachex); lmp->memory->destroy(ccachey); lmp->memory->destroy(ccachez); lmp->memory->destroy(ccachew); lmp->memory->destroy(ccachei); lmp->memory->destroy(ccachej); #ifdef LMP_USE_AVXCD lmp->memory->destroy(ccachef); #endif _ccachex = 0; } } /* ---------------------------------------------------------------------- */ template void IntelBuffers::grow_ccache(const int off_flag, const int nthreads, const int width) { #ifdef _LMP_INTEL_OFFLOAD if (_ccachex && off_flag && _off_ccache == 0) free_ccache(); #endif if (_ccachex) return; const int nsize = get_max_nbors() * width; int esize = MIN(sizeof(int), sizeof(flt_t)); IP_PRE_get_stride(_ccache_stride, nsize, esize, 0); int nt = MAX(nthreads, _off_threads); const int vsize = _ccache_stride * nt; lmp->memory->create(_ccachex, vsize , "_ccachex"); lmp->memory->create(_ccachey, vsize, "_ccachey"); lmp->memory->create(_ccachez, vsize, "_ccachez"); lmp->memory->create(_ccachew, vsize, "_ccachew"); lmp->memory->create(_ccachei, vsize, "_ccachei"); lmp->memory->create(_ccachej, vsize, "_ccachej"); #ifdef LMP_USE_AVXCD IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0); lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef"); #endif memset(_ccachei, 0, vsize * sizeof(int)); memset(_ccachej, 0, vsize * sizeof(int)); #ifdef _LMP_INTEL_OFFLOAD if (off_flag) { flt_t *ccachex = _ccachex; flt_t *ccachey = _ccachey; flt_t *ccachez = _ccachez; flt_t *ccachew = _ccachew; int *ccachei = _ccachei; int *ccachej = _ccachej; if (ccachex != NULL && ccachey !=NULL && ccachez != NULL && ccachew != NULL && ccachei != NULL && ccachej !=NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \ in(ccachei:length(vsize) alloc_if(1) free_if(0)) \ in(ccachej:length(vsize) alloc_if(1) free_if(0)) } #ifdef LMP_USE_AVXCD if (ccachef != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(ccachef:length(_ccache_stride3 * nt) alloc_if(1) free_if(0)) } #endif _off_ccache = 1; } #endif } /* ---------------------------------------------------------------------- */ template void IntelBuffers::free_ncache() { if (_ncache_alloc) { flt_t *ncachex = _ncachex; flt_t *ncachey = _ncachey; flt_t *ncachez = _ncachez; int *ncachej = _ncachej; int *ncachejtype = _ncachejtype; tagint *ncachetag = _ncachetag; #ifdef _LMP_INTEL_OFFLOAD if (_off_ncache) { #pragma offload_transfer target(mic:_cop) \ nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \ nocopy(ncachejtype:alloc_if(0) free_if(1)) if (ncachetag) { #pragma offload_transfer target(mic:_cop) \ nocopy(ncachetag:alloc_if(0) free_if(1)) } } _off_ncache = 0; #endif lmp->memory->destroy(ncachex); lmp->memory->destroy(ncachey); lmp->memory->destroy(ncachez); lmp->memory->destroy(ncachej); lmp->memory->destroy(ncachejtype); if (ncachetag) lmp->memory->destroy(ncachetag); _ncache_alloc = 0; _ncachetag = 0; } } /* ---------------------------------------------------------------------- */ template void IntelBuffers::grow_ncache(const int off_flag, const int nthreads) { const int nsize = get_max_nbors() * 3; int esize = MIN(sizeof(int), sizeof(flt_t)); IP_PRE_get_stride(_ncache_stride, nsize, esize, 0); int nt = MAX(nthreads, _off_threads); const int vsize = _ncache_stride * nt; if (_ncache_alloc) { if (vsize > _ncache_alloc || (need_tag() && _ncachetag == 0)) free_ncache(); #ifdef _LMP_INTEL_OFFLOAD else if (off_flag && _off_ncache == 0) free_ncache(); #endif else return; } lmp->memory->create(_ncachex, vsize, "_ncachex"); lmp->memory->create(_ncachey, vsize, "_ncachey"); lmp->memory->create(_ncachez, vsize, "_ncachez"); lmp->memory->create(_ncachej, vsize, "_ncachej"); lmp->memory->create(_ncachejtype, vsize, "_ncachejtype"); if (need_tag()) lmp->memory->create(_ncachetag, vsize, "_ncachetag"); _ncache_alloc = vsize; #ifdef _LMP_INTEL_OFFLOAD if (off_flag) { flt_t *ncachex = _ncachex; flt_t *ncachey = _ncachey; flt_t *ncachez = _ncachez; int *ncachej = _ncachej; int *ncachejtype = _ncachejtype; if (ncachex != NULL && ncachey !=NULL && ncachez != NULL && ncachej != NULL && ncachejtype != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0)) } int tsize = vsize; if (!need_tag()) { tsize = 16; lmp->memory->create(_ncachetag, tsize, "_ncachetag"); } tagint *ncachetag = _ncachetag; #pragma offload_transfer target(mic:_cop) \ nocopy(ncachetag:length(tsize) alloc_if(1) free_if(0)) _off_ncache = 1; } #endif } /* ---------------------------------------------------------------------- */ #ifndef _LMP_INTEL_OFFLOAD template void IntelBuffers::fdotr_reduce_l5(const int lf, const int lt, const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) { IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0, ov1, ov2, ov3, ov4, ov5); } #endif /* ---------------------------------------------------------------------- */ #ifndef _LMP_INTEL_OFFLOAD template void IntelBuffers::fdotr_reduce(const int nall, const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) { int iifrom, iito, tid; IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2, ov0, ov1, ov2, ov3, ov4, ov5); } #endif /* ---------------------------------------------------------------------- */ template void IntelBuffers::set_ntypes(const int ntypes, const int use_ghost_cut) { if (ntypes != _ntypes) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD flt_t * cutneighsqo = _cutneighsq[0]; if (_off_threads > 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(cutneighsqo:alloc_if(0) free_if(1)) } flt_t * cutneighghostsqo; if (_cutneighghostsq && _off_threads > 0) { cutneighghostsqo = _cutneighghostsq[0]; #pragma offload_transfer target(mic:_cop) \ nocopy(cutneighghostsqo:alloc_if(0) free_if(1)) } #endif lmp->memory->destroy(_cutneighsq); if (_cutneighghostsq != 0) lmp->memory->destroy(_cutneighghostsq); } if (ntypes > 0) { lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq"); if (use_ghost_cut) lmp->memory->create(_cutneighghostsq, ntypes, ntypes, "_cutneighghostsq"); #ifdef _LMP_INTEL_OFFLOAD flt_t * cutneighsqo = _cutneighsq[0]; const int ntypes2 = ntypes * ntypes; if (_off_threads > 0 && cutneighsqo != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(cutneighsqo:length(ntypes2) alloc_if(1) free_if(0)) } if (use_ghost_cut) { flt_t * cutneighghostsqo = _cutneighghostsq[0]; if (_off_threads > 0 && cutneighghostsqo != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(cutneighghostsqo:length(ntypes2) alloc_if(1) free_if(0)) } } #endif } _ntypes = ntypes; } } /* ---------------------------------------------------------------------- */ template double IntelBuffers::memory_usage(const int nthreads) { double tmem = sizeof(atom_t); if (lmp->atom->q) tmem += sizeof(flt_t); if (lmp->atom->torque) tmem += sizeof(quat_t); #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) tmem *= 2; #endif tmem *= _buf_size; const int fstride = get_stride(_buf_local_size); tmem += fstride * nthreads * sizeof(vec3_acc_t); #ifdef _LMP_INTEL_OFFLOAD if (_off_f) tmem += fstride*_off_threads * sizeof(vec3_acc_t); #endif tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int); tmem += _ntypes * _ntypes * sizeof(int); tmem += _buf_local_size + (_n_list_ptrs - 1) * _buf_local_size * 2; return tmem; } /* ---------------------------------------------------------------------- */ template class LAMMPS_NS::IntelBuffers; template class LAMMPS_NS::IntelBuffers; template class LAMMPS_NS::IntelBuffers;