git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2016-07-01 23:27:26 +00:00
parent 8366b35459
commit 9656958169
245 changed files with 4890 additions and 4832 deletions

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
begin :
email : brownw@ornl.gov, penwang@nvidia.com
***************************************************************************/
@ -32,13 +32,13 @@ int Neighbor::bytes_per_atom(const int max_nbors) const {
}
bool Neighbor::init(NeighborShared *shared, const int inum,
const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &devi,
const int gpu_nbor, const int gpu_host,
const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &devi,
const int gpu_nbor, const int gpu_host,
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build,
const int threads_per_atom, const int warp_size,
const bool time_device,
const bool time_device,
const std::string compile_flags) {
clear();
@ -56,10 +56,10 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
_gpu_host=false;
else if (gpu_host==1)
_gpu_host=true;
else
else
// Not yet implemented
assert(0==1);
if (pre_cut || gpu_nbor==0)
_alloc_packed=true;
else
@ -71,7 +71,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
_packed_permissions=UCL_READ_ONLY;
bool success=true;
// Initialize timers for the selected GPU
_nbor_time_avail=false;
time_nbor.init(*dev);
@ -88,7 +88,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
if (_max_atoms==0)
_max_atoms=1000;
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
_max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom;
@ -102,21 +102,21 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
alloc(success);
if (!success)
return false;
if (_use_packing==false)
_shared->compile_kernels(devi,gpu_nbor,compile_flags);
return success;
}
void Neighbor::alloc(bool &success) {
void Neighbor::alloc(bool &success) {
dev_nbor.clear();
host_acc.clear();
int nt=_max_atoms+_max_host;
if (_use_packing==false || _gpu_nbor>0)
success=success &&
if (_use_packing==false || _gpu_nbor>0)
success=success &&
(dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS);
else
else
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc(nt*2,*dev,
@ -127,14 +127,17 @@ void Neighbor::alloc(bool &success) {
dev_packed.clear();
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
_packed_permissions)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes();
}
dev_acc.clear();
success=success && (dev_acc.alloc(_max_atoms,*dev,
UCL_READ_WRITE)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
}
if (_max_host>0) {
nbor_host.clear();
dev_numj_host.clear();
host_ilist.clear();
host_jlist.clear();
success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
UCL_READ_WRITE)==UCL_SUCCESS) && success;
success=success && (dev_numj_host.alloc(_max_host,*dev,
@ -152,7 +155,7 @@ void Neighbor::alloc(bool &success) {
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
}
_c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes();
} else {
// Some OpenCL implementations return errors for NULL pointers as args
@ -176,7 +179,7 @@ void Neighbor::alloc(bool &success) {
_allocated=true;
}
void Neighbor::clear() {
_gpu_bytes=0.0;
_cell_bytes=0.0;
@ -194,6 +197,7 @@ void Neighbor::clear() {
host_packed.clear();
host_acc.clear();
dev_acc.clear();
dev_nbor.clear();
nbor_host.clear();
dev_packed.clear();
@ -219,13 +223,13 @@ double Neighbor::host_memory_usage() const {
host_ilist.row_bytes()+host_jlist.row_bytes();
else
return 0;
} else
} else
return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+
sizeof(Neighbor);
}
void Neighbor::get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size) {
int **firstneigh, const int block_size) {
_nbor_time_avail=true;
time_nbor.start();
@ -242,7 +246,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
int dev_count=0;
int *h_ptr=host_packed.begin();
_nbor_pitch=inum;
for (int ii=0; ii<inum; ii++) {
int i=ilist[ii];
int nj=numj[i];
@ -250,13 +254,13 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
host_acc[ii+inum]=acc_count;
acc_count+=nj;
int *jlist=firstneigh[i];
for (int jj=0; jj<nj; jj++) {
*h_ptr=jlist[jj];
h_ptr++;
ij_count++;
if (ij_count==IJ_SIZE) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
@ -278,8 +282,17 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
UCL_H_Vec<int> host_view;
host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
for (int ii=0; ii<inum; ii++) {
int i=ilist[ii];
host_view[i] = ii;
}
ucl_copy(dev_acc,host_view,true);
time_nbor.stop();
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
@ -294,7 +307,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
// inum=nlocal is forced to be true to allow direct indexing of neighbors of
// neighbors
void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
int **firstneigh, const int block_size) {
int **firstneigh, const int block_size) {
_nbor_time_avail=true;
time_nbor.start();
@ -311,7 +324,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
int dev_count=0;
int *h_ptr=host_packed.begin();
_nbor_pitch=inum;
if (nlist!=inum)
host_acc.zero(inum);
@ -322,7 +335,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
host_acc[i+inum]=acc_count;
acc_count+=nj;
}
for (int i=0; i<inum; i++) {
int nj=host_acc[i];
int *jlist=firstneigh[i];
@ -330,7 +343,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
*h_ptr=jlist[jj];
h_ptr++;
ij_count++;
if (ij_count==IJ_SIZE) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
@ -353,7 +366,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
time_nbor.stop();
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
@ -366,7 +379,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
template <class numtyp, class acctyp>
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
if (maxn>_max_nbors) {
if (maxn>_max_nbors) {
int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
mn=(mn/_threads_per_atom+1)*_threads_per_atom;
success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS);
@ -377,7 +390,7 @@ void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
}
_gpu_bytes+=nbor_host.row_bytes();
} else {
nbor_host.device.view(dev_nbor);
@ -393,8 +406,8 @@ void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
template <class numtyp, class acctyp>
void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int nall, Atom<numtyp,acctyp> &atom,
double *sublo, double *subhi, tagint *tag,
const int nall, Atom<numtyp,acctyp> &atom,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, bool &success,
int &mn) {
_nbor_time_avail=true;
@ -409,7 +422,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
ncell_3d = ncellx * ncelly * ncellz;
if (ncell_3d+1>_ncells) {
cell_counts.clear();
if (_gpu_nbor==2) {
if (_ncells>0)
delete [] cell_iter;
@ -419,7 +432,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
cell_counts.device.clear();
cell_counts.device.alloc(ncell_3d+1,dev_nbor);
}
_ncells=ncell_3d+1;
_cell_bytes=cell_counts.device.row_bytes();
}
@ -445,17 +458,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt);
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt);
time_transpose.stop();
}
// If binning on CPU, do this now
if (_gpu_nbor==2) {
double stime = MPI_Wtime();
int *cell_id=atom.host_cell_id.begin();
int *particle_id=atom.host_particle_id.begin();
// Build cell list on CPU
// Build cell list on CPU
cell_counts.host.zero();
double i_cell_size=1.0/_cell_size;
@ -475,12 +488,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
int iz = static_cast<int>(pz*i_cell_size+1);
iz = std::max(iz,_cells_in_cutoff);
iz = std::min(iz,ncellz-offset_hi);
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
cell_counts[id+1]++;
}
for (int i=nt; i<nall; i++) {
double px, py, pz;
px=x[i][0]-sublo[0];
@ -496,12 +509,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
int iz = static_cast<int>(pz*i_cell_size+1);
iz = std::max(iz,0);
iz = std::min(iz,ncellz-1);
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
cell_counts[id+1]++;
}
mn=0;
for (int i=0; i<_ncells; i++)
mn=std::max(mn,cell_counts[i]);
@ -531,7 +544,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
ucl_copy(atom.dev_particle_id,atom.host_particle_id,true);
time_hybrid2.stop();
_bin_time+=MPI_Wtime()-stime;
}
}
time_kernel.start();
@ -547,7 +560,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.x, &atom.dev_cell_id,
_shared->k_cell_id.run(&atom.x, &atom.dev_cell_id,
&atom.dev_particle_id, &sublo0, &sublo1,
&sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz,
&nt, &nall, &_cells_in_cutoff);
@ -556,10 +569,10 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
/* calculate cell count */
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall,
_shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall,
&ncell_3d);
}
}
/* build the neighbor list */
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
@ -579,7 +592,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_numj_host,nt-inum,true);
}
if (_gpu_nbor!=2) {
host_acc.sync();
mn=host_acc[0];
@ -587,7 +600,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
mn=std::max(mn,host_acc[i]);
set_nbor_block_size(mn);
if (mn>_max_nbors) {
if (mn>_max_nbors) {
resize_max_neighbors<numtyp,acctyp>(mn,success);
if (!success)
return;
@ -599,13 +612,13 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
return;
}
}
if (_maxspecial>0) {
const int GX2=static_cast<int>(ceil(static_cast<double>
(nt*_threads_per_atom)/cell_block));
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor, &nbor_host, &dev_numj_host,
&atom.dev_tag, &dev_nspecial, &dev_special,
&atom.dev_tag, &dev_nspecial, &dev_special,
&inum, &nt, &_max_nbors, &_threads_per_atom);
}
time_kernel.stop();