/*************************************************************************** neighbor.cpp ------------------- W. Michael Brown (ORNL) Peng Wang (Nvidia) Class for handling neighbor lists __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov, penwang@nvidia.com ***************************************************************************/ #include "lal_precision.h" #include "lal_neighbor.h" #include "lal_device.h" #include "math.h" using namespace LAMMPS_AL; int Neighbor::bytes_per_atom(const int max_nbors) const { if (_gpu_nbor==1) return (max_nbors+2)*sizeof(int); else if (_gpu_nbor==2) return (max_nbors+3)*sizeof(int); else if (_use_packing) return ((max_nbors+2)*2)*sizeof(int); else return (max_nbors+3)*sizeof(int); } bool Neighbor::init(NeighborShared *shared, const int inum, const int host_inum, const int max_nbors, const int maxspecial, UCL_Device &devi, const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, const bool time_device) { clear(); _threads_per_atom=threads_per_atom; _block_cell_2d=block_cell_2d; _block_cell_id=block_cell_id; _block_nbor_build=block_nbor_build; _shared=shared; dev=&devi; _gpu_nbor=gpu_nbor; _time_device=time_device; if (gpu_host==0) _gpu_host=false; else if (gpu_host==1) _gpu_host=true; else // Not yet implemented assert(0==1); if (pre_cut || gpu_nbor==0) _alloc_packed=true; else _alloc_packed=false; bool success=true; // Initialize timers for the selected GPU _nbor_time_avail=false; time_nbor.init(*dev); time_kernel.init(*dev); time_hybrid1.init(*dev); time_hybrid2.init(*dev); time_transpose.init(*dev); time_nbor.zero(); time_kernel.zero(); time_hybrid1.zero(); time_hybrid2.zero(); time_transpose.zero(); _max_atoms=static_cast(static_cast(inum)*1.10); if (_max_atoms==0) _max_atoms=1000; _max_host=static_cast(static_cast(host_inum)*1.10); _max_nbors=max_nbors; _maxspecial=maxspecial; if (gpu_nbor==0) _maxspecial=0; if (gpu_nbor==0) success=success && (host_packed.alloc(2*IJ_SIZE,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); alloc(success); if (!success) return false; if (_use_packing==false) _shared->compile_kernels(devi,gpu_nbor); return success; } void Neighbor::alloc(bool &success) { dev_nbor.clear(); host_acc.clear(); int nt=_max_atoms+_max_host; if (_use_packing==false || _gpu_nbor>0) success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS); else success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); success=success && (host_acc.alloc(nt*2,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); _c_bytes=dev_nbor.row_bytes(); if (_alloc_packed) { dev_packed.clear(); success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _c_bytes+=dev_packed.row_bytes(); } if (_max_host>0) { host_nbor.clear(); dev_host_nbor.clear(); dev_host_numj.clear(); host_ilist.clear(); host_jlist.clear(); success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev, UCL_RW_OPTIMIZED)==UCL_SUCCESS); success=success && (dev_host_nbor.alloc(_max_nbors*_max_host, *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); success=success && (dev_host_numj.alloc(_max_host,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); if (!success) return; for (int i=0; i0) { dev_nspecial.clear(); dev_special.clear(); dev_special_t.clear(); int at=_max_atoms+_max_host; success=success && (dev_nspecial.alloc(3*at,*dev, UCL_READ_ONLY)==UCL_SUCCESS); success=success && (dev_special.alloc(_maxspecial*at,*dev, UCL_READ_ONLY)==UCL_SUCCESS); success=success && (dev_special_t.alloc(_maxspecial*at,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+ dev_special_t.row_bytes(); } _allocated=true; } void Neighbor::clear() { _gpu_bytes=0.0; _cell_bytes=0.0; _c_bytes=0.0; _bin_time=0.0; if (_ncells>0) { _ncells=0; dev_cell_counts.clear(); if (_gpu_nbor==2) { host_cell_counts.clear(); delete [] cell_iter; } } if (_allocated) { _allocated=false; _nbor_time_avail=false; host_packed.clear(); host_acc.clear(); dev_nbor.clear(); dev_host_nbor.clear(); dev_packed.clear(); host_nbor.clear(); dev_host_numj.clear(); host_ilist.clear(); host_jlist.clear(); dev_nspecial.clear(); dev_special.clear(); dev_special_t.clear(); time_kernel.clear(); time_nbor.clear(); time_hybrid1.clear(); time_hybrid2.clear(); time_transpose.clear(); } } double Neighbor::host_memory_usage() const { if (_gpu_nbor>0) { if (_gpu_host) return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+ host_jlist.row_bytes(); else return 0; } else return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+ sizeof(Neighbor); } void Neighbor::get_host(const int inum, int *ilist, int *numj, int **firstneigh, const int block_size) { _nbor_time_avail=true; time_nbor.start(); UCL_H_Vec ilist_view; ilist_view.view(ilist,inum,*dev); ucl_copy(dev_nbor,ilist_view,false); UCL_D_Vec nbor_offset; UCL_H_Vec host_offset; int copy_count=0; int ij_count=0; int acc_count=0; int dev_count=0; int *h_ptr=host_packed.begin(); _nbor_pitch=inum; for (int ii=0; ii acc_view; acc_view.view_offset(inum,dev_nbor,inum*2); ucl_copy(acc_view,host_acc,true); time_nbor.stop(); if (_use_packing==false) { time_kernel.start(); int GX=static_cast(ceil(static_cast(inum)*_threads_per_atom/ block_size)); _shared->k_nbor.set_size(GX,block_size); _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum, &_threads_per_atom); time_kernel.stop(); } } template void Neighbor::resize_max_neighbors(const int maxn, bool &success) { if (maxn>_max_nbors) { int mn=static_cast(static_cast(maxn)*1.10); dev_nbor.clear(); success=success && (dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS); _gpu_bytes=dev_nbor.row_bytes(); if (_max_host>0) { host_nbor.clear(); dev_host_nbor.clear(); success=success && (host_nbor.alloc(mn*_max_host,*dev, UCL_RW_OPTIMIZED)==UCL_SUCCESS); success=success && (dev_host_nbor.alloc(mn*_max_host, *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); int *ptr=host_nbor.begin(); for (int i=0; i<_max_host; i++) { host_jlist[i]=ptr; ptr+=mn; } _gpu_bytes+=dev_host_nbor.row_bytes(); } else { dev_host_nbor.view(dev_nbor); dev_host_numj.view(dev_nbor); } if (_alloc_packed) { dev_packed.clear(); success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_packed.row_bytes(); } _max_nbors=mn; } } template void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success, int &mn) { _nbor_time_avail=true; const int nt=inum+host_inum; // Calculate number of cells and allocate storage for binning as necessary int ncellx, ncelly, ncellz, ncell_3d; ncellx = static_cast(ceil(((subhi[0] - sublo[0]) + 2.0*_cell_size)/_cell_size)); ncelly = static_cast(ceil(((subhi[1] - sublo[1]) + 2.0*_cell_size)/_cell_size)); ncellz = static_cast(ceil(((subhi[2] - sublo[2]) + 2.0*_cell_size)/_cell_size)); ncell_3d = ncellx * ncelly * ncellz; if (ncell_3d+1>_ncells) { dev_cell_counts.clear(); dev_cell_counts.alloc(ncell_3d+1,dev_nbor); if (_gpu_nbor==2) { if (_ncells>0) { host_cell_counts.clear(); delete [] cell_iter; } cell_iter = new int[ncell_3d+1]; host_cell_counts.alloc(ncell_3d+1,dev_nbor); } _ncells=ncell_3d+1; _cell_bytes=dev_cell_counts.row_bytes(); } const numtyp cell_size_cast=static_cast(_cell_size); if (_maxspecial>0) { time_nbor.start(); UCL_H_Vec view_nspecial, view_special, view_tag; view_nspecial.view(nspecial[0],nt*3,*dev); view_special.view(special[0],nt*_maxspecial,*dev); view_tag.view(tag,nall,*dev); ucl_copy(dev_nspecial,view_nspecial,nt*3,false); ucl_copy(dev_special_t,view_special,nt*_maxspecial,false); ucl_copy(atom.dev_tag,view_tag,nall,false); time_nbor.stop(); if (_time_device) time_nbor.add_to_total(); time_transpose.start(); const int b2x=_block_cell_2d; const int b2y=_block_cell_2d; const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), &_maxspecial,&nt); time_transpose.stop(); } // If binning on CPU, do this now if (_gpu_nbor==2) { double stime = MPI_Wtime(); int *cell_id=atom.host_cell_id.begin(); int *particle_id=atom.host_particle_id.begin(); // Build cell list on CPU host_cell_counts.zero(); double m_cell_size=-_cell_size; double dx=subhi[0]-sublo[0]+_cell_size; double dy=subhi[1]-sublo[1]+_cell_size; double dz=subhi[2]-sublo[2]+_cell_size; for (int i=0; idx) px=dx; if (py>dy) py=dy; if (pz>dz) pz=dz; int id=static_cast(px/_cell_size + 1.0) + static_cast(py/_cell_size + 1.0) * ncellx + static_cast(pz/_cell_size + 1.0) * ncellx * ncelly; cell_id[i]=id; host_cell_counts[id+1]++; } mn=0; for (int i=0; i<_ncells; i++) mn=std::max(mn,host_cell_counts[i]); mn*=8; resize_max_neighbors(mn,success); if (!success) return; _total_atoms=nt; cell_iter[0]=0; for (int i=1; i<_ncells; i++) { host_cell_counts[i]+=host_cell_counts[i-1]; cell_iter[i]=host_cell_counts[i]; } time_hybrid1.start(); ucl_copy(dev_cell_counts,host_cell_counts,true); time_hybrid1.stop(); for (int i=0; ineigh_tex.bind_float(atom.dev_x,4); // If binning on GPU, do this now if (_gpu_nbor==1) { const int neigh_block=_block_cell_id; const int GX=(int)ceil((float)nall/neigh_block); const numtyp sublo0=static_cast(sublo[0]); const numtyp sublo1=static_cast(sublo[1]); const numtyp sublo2=static_cast(sublo[2]); const numtyp subhi0=static_cast(subhi[0]); const numtyp subhi1=static_cast(subhi[1]); const numtyp subhi2=static_cast(subhi[2]); _shared->k_cell_id.set_size(GX,neigh_block); _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), &atom.dev_particle_id.begin(), &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall); atom.sort_neighbor(nall); /* calculate cell count */ _shared->k_cell_counts.set_size(GX,neigh_block); _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &dev_cell_counts.begin(), &nall, &ncell_3d); } /* build the neighbor list */ const int cell_block=_block_nbor_build; _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), &dev_cell_counts.begin(), &dev_nbor.begin(), &dev_host_nbor.begin(), &dev_host_numj.begin(), &_max_nbors,&cell_size_cast, &ncellx, &ncelly, &ncellz, &inum, &nt, &nall, &_threads_per_atom); /* Get the maximum number of nbors and realloc if necessary */ UCL_D_Vec numj; numj.view_offset(inum,dev_nbor,inum); ucl_copy(host_acc,numj,inum,true); if (nt>inum) { UCL_H_Vec host_offset; host_offset.view_offset(inum,host_acc,nt-inum); ucl_copy(host_offset,dev_host_numj,nt-inum,true); } if (_gpu_nbor!=2) { host_acc.sync(); mn=host_acc[0]; for (int i=1; i_max_nbors) { resize_max_neighbors(mn,success); if (!success) return; time_kernel.stop(); if (_time_device) time_kernel.add_to_total(); build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial, special, success, mn); return; } } if (_maxspecial>0) { const int GX2=static_cast(ceil(static_cast (nt*_threads_per_atom)/cell_block)); _shared->k_special.set_size(GX2,cell_block); _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), &dev_host_numj.begin(), &atom.dev_tag.begin(), &dev_nspecial.begin(), &dev_special.begin(), &inum, &nt, &_max_nbors, &_threads_per_atom); } time_kernel.stop(); time_nbor.start(); if (inum (double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, double *subhi, int *, int **, int **, bool &success, int &mn);