From 04f1dc492eecb53619dd7f46a42eca301352d618 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown"
Date: Sat, 26 Mar 2011 12:58:47 -0400
Subject: [PATCH] Changing host neighbor list from gpu build to be full.
 Changing ordering of host neighbor list from gpu builds. Updating lj_cut
 with changes. Breaking everything else.

---
 lib/gpu/atomic_gpu_memory.cpp    |  49 ++++++++++++
 lib/gpu/atomic_gpu_memory.h      |   8 ++
 lib/gpu/lj_cut_gpu.cpp           |   7 +-
 lib/gpu/pair_gpu_build_kernel.cu |  48 ++++++------
 lib/gpu/pair_gpu_nbor.cpp        |  52 ++++++++++---
 lib/gpu/pair_gpu_nbor.h          |   5 +-
 src/GPU/pair_lj_cut_gpu.cpp      | 126 ++++++-------------------------
 src/GPU/pair_lj_cut_gpu.h        |   3 +-
 8 files changed, 155 insertions(+), 143 deletions(-)

diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp
index a964c9ba01..900b8be9bc 100644
--- a/lib/gpu/atomic_gpu_memory.cpp
+++ b/lib/gpu/atomic_gpu_memory.cpp
@@ -243,6 +243,55 @@ int * AtomicGPUMemoryT::compute(const int ago, const int inum_full,
   return nbor->host_nbor.begin();
 }
 
+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary and then compute forces, virials, energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
+                                 const int nall, double **host_x, int *host_type,
+                                 double *boxlo, double *boxhi, int *tag,
+                                 int **nspecial, int **special, const bool eflag,
+                                 const bool vflag, const bool eatom,
+                                 const bool vatom, int &host_start,
+                                 int **ilist, int **jnum,
+                                 const double cpu_time, bool &success) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return NULL;
+  }
+
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
+  host_start=inum;
+
+  // Build neighbor list on GPU if necessary
+  if (ago==0) {
+    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                    boxlo, boxhi, tag, nspecial, special, success);
+    if (!success)
+      return NULL;
+    hd_balancer.start_timer();
+  } else {
+    atom->cast_x_data(host_x,host_type);
+    hd_balancer.start_timer();
+    atom->add_x_data(host_x,host_type);
+  }
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+
+  return nbor->host_jlist.begin()-host_start;
+}
+
 template <class numtyp, class acctyp>
 double AtomicGPUMemoryT::host_memory_usage_atomic() const {
   return device->atom.host_memory_usage()+nbor->host_memory_usage()+
diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h
index 11cda990af..88c4a7ed81 100644
--- a/lib/gpu/atomic_gpu_memory.h
+++ b/lib/gpu/atomic_gpu_memory.h
@@ -129,6 +129,14 @@ class AtomicGPUMemory {
                 const bool eatom, const bool vatom, int &host_start,
                 const double cpu_time, bool &success);
 
+  /// Pair loop with device neighboring
+  int ** compute(const int ago, const int inum_full,
+                 const int nall, double **host_x, int *host_type, double *boxlo,
+                 double *boxhi, int *tag, int **nspecial,
+                 int **special, const bool eflag, const bool vflag,
+                 const bool eatom, const bool vatom, int &host_start,
+                 int **ilist, int **numj, const double cpu_time, bool &success);
+
   // -------------------------- DEVICE DATA -------------------------
 
   /// Device Properties and Atom and Neighbor storage
diff --git a/lib/gpu/lj_cut_gpu.cpp b/lib/gpu/lj_cut_gpu.cpp
index 8772eee81a..4082a94f64 100644
--- a/lib/gpu/lj_cut_gpu.cpp
+++ b/lib/gpu/lj_cut_gpu.cpp
@@ -96,15 +96,16 @@ void ljl_gpu_clear() {
   LJLMF.clear();
 }
 
-int * ljl_gpu_compute_n(const int ago, const int inum_full,
+int ** ljl_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *boxlo, double *boxhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success) {
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success) {
   return LJLMF.compute(ago, inum_full, nall, host_x, host_type, boxlo,
                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                       vatom, host_start, cpu_time, success);
+                       vatom, host_start, ilist, jnum, cpu_time, success);
 }
 
 void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu
index a08a26800a..5f5d6717de 100644
--- a/lib/gpu/pair_gpu_build_kernel.cu
+++ b/lib/gpu/pair_gpu_build_kernel.cu
@@ -139,7 +139,8 @@
 __kernel void calc_neigh_list_cell(numtyp4 *pos,
                                    int *cell_particle_id,
                                    int *cell_counts,
                                    int *nbor_list,
-                                   int *host_nbor_list,
+                                   int *host_nbor_list,
+                                   int *host_numj,
                                    int neigh_bin_size, numtyp cell_size,
                                    int ncellx, int ncelly, int ncellz,
@@ -183,9 +184,9 @@
     neigh_list=neigh_counts+stride;
     nbor_list[pid_i]=pid_i;
   } else {
-    stride=nt-inum;
-    neigh_counts=host_nbor_list+pid_i-inum;
-    neigh_list=neigh_counts+stride;
+    stride=1;
+    neigh_counts=host_numj+pid_i-inum;
+    neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
   }
 
   // loop through neighbors
@@ -220,20 +221,18 @@
       for (int j = 0; j < end_idx; j++) {
         int pid_j = cell_list_sh[j]; // gather from shared memory
-        if (pid_i<nt && pid_j>pid_i) {
-          diff.x = atom_i.x - pos_sh[j].x;
-          diff.y = atom_i.y - pos_sh[j].y;
-          diff.z = atom_i.z - pos_sh[j].z;
+        diff.x = atom_i.x - pos_sh[j].x;
+        diff.y = atom_i.y - pos_sh[j].y;
+        diff.z = atom_i.z - pos_sh[j].z;
 
-          r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
-          if (r2 < cell_size*cell_size && r2 > 1e-5) {
-            if (cnt < neigh_bin_size) {
-              *neigh_list = pid_j;
-              neigh_list+=stride;
-            }
-            cnt++;
-          }
-        }
+        r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
+        if (r2 < cell_size*cell_size && r2 > 1e-5) {
+          if (cnt < neigh_bin_size) {
+            *neigh_list = pid_j;
+            neigh_list+=stride;
+          }
+          cnt++;
+        }
       }
     }
     __syncthreads();
@@ -247,9 +246,10 @@
 }
 
 __kernel void kernel_special(__global int *dev_nbor,
-                             __global int *host_nbor_list, __global int *tag,
+                             __global int *host_nbor_list,
+                             __global int *host_numj, __global int *tag,
                              __global int *nspecial, __global int *special,
-                             int inum, int nt, int nall) {
+                             int inum, int nt, int nall, int max_nbors) {
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X;
@@ -261,15 +261,17 @@
   int n2=nspecial[ii*3+1];
   int n3=nspecial[ii*3+2];
 
+  int numj;
   if (ii < inum) {
     stride=inum;
     list=dev_nbor+stride+ii;
+    numj=*list;
+    list+=stride;
   } else {
-    stride=nt-inum;
-    list=host_nbor_list+ii-inum;
+    stride=1;
+    list=host_nbor_list+(ii-inum)*max_nbors;
+    numj=host_numj[ii];
   }
-  int numj=*list;
-  list+=stride;
   list_end=list+numj*stride;
 
   for ( ; list<list_end; list+=stride) {
[...]
diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp
[...]
   if (_max_host>0) {
     host_nbor.clear();
     dev_host_nbor.clear();
-    success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev,
+    dev_host_numj.clear();
+    host_ilist.clear();
+    host_jlist.clear();
+
+    success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
                         UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-    success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host,
+    success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
                         *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
-    _c_bytes+=dev_host_nbor.row_bytes();
+    success=success && (dev_host_numj.alloc(_max_host,*dev,
+                        UCL_WRITE_ONLY)==UCL_SUCCESS);
+    success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
+    for (int i=0; i<nt; i++)
+      host_ilist[i]=i;
[...]
   if (_maxspecial>0) {
     dev_nspecial.clear();
@@ -140,6 +157,9 @@ void PairGPUNbor::clear() {
   dev_host_nbor.clear();
   dev_packed.clear();
   host_nbor.clear();
+  dev_host_numj.clear();
+  host_ilist.clear();
+  host_jlist.clear();
   dev_nspecial.clear();
   dev_special.clear();
   dev_special_t.clear();
@@ -152,7 +172,8 @@ double PairGPUNbor::host_memory_usage() const {
   if (_gpu_nbor) {
     if (_gpu_host)
-      return host_nbor.row_bytes()*host_nbor.rows();
+      return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
+             host_jlist.row_bytes();
     else
       return 0;
   } else
@@ -297,7 +318,8 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
   _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
                             &cell_counts.begin(), &dev_nbor.begin(),
-                            &dev_host_nbor.begin(),&_max_nbors,&cell_size_cast,
+                            &dev_host_nbor.begin(), &dev_host_numj.begin(),
+                            &_max_nbors,&cell_size_cast,
                             &ncellx, &ncelly, &ncellz, &inum, &nt, &nall);
 
   /* Get the maximum number of nbors and realloc if necessary */
@@ -307,7 +329,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   if (nt>inum) {
     UCL_H_Vec<int> host_offset;
     host_offset.view_offset(inum,host_acc,nt-inum);
-    ucl_copy(host_offset,dev_host_nbor,nt-inum,false);
+    ucl_copy(host_offset,dev_host_numj,nt-inum,false);
   }
   mn=host_acc[0];
   for (int i=1; i<nt; i++)
[...]
   if (_max_host>0) {
     host_nbor.clear();
     dev_host_nbor.clear();
-    success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor,
+    success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
                         UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-    success=success && (dev_host_nbor.alloc((mn+1)*_max_host,
+    success=success && (dev_host_nbor.alloc(mn*_max_host,
                         dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
+    int *ptr=host_nbor.begin();
+    for (int i=0; i<_max_host; i++) {
+      host_jlist[i]=ptr;
+      ptr+=mn;
+    }
     _gpu_bytes+=dev_host_nbor.row_bytes();
   }
   if (_alloc_packed) {
@@ -348,14 +375,15 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
   _shared->k_special.set_size(GX2,cell_block);
   _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
-                         &atom.dev_tag.begin(), &dev_nspecial.begin(),
-                         &dev_special.begin(), &inum, &nt, &nall);
+                         &dev_host_numj.begin(), &atom.dev_tag.begin(),
+                         &dev_nspecial.begin(), &dev_special.begin(),
+                         &inum, &nt, &nall, &_max_nbors);
   }
 
   time_kernel.stop();
   time_nbor.start();
   if (_gpu_host)
-    ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);
+    ucl_copy(host_nbor,dev_host_nbor,false);
   time_nbor.stop();
 }
diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h
index ff2dcc5ad6..090e81f76e 100644
--- a/lib/gpu/pair_gpu_nbor.h
+++ b/lib/gpu/pair_gpu_nbor.h
@@ -171,8 +171,11 @@ class PairGPUNbor {
   UCL_H_Vec<int> host_nbor;
   /// Device storage for neighbor list matrix that will be copied to host
   /** - 1st row is numj
-    * - Remaining rows are nbors **/
+    * - Remaining rows are by atom, columns are nbors **/
   UCL_D_Vec<int> dev_host_nbor;
+  UCL_D_Vec<int> dev_host_numj;
+  UCL_H_Vec<int> host_ilist;
+  UCL_H_Vec<int *> host_jlist;
   /// Device storage for special neighbor counts
   UCL_D_Vec<int> dev_nspecial;
   /// Device storage for special neighbors
diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp
index 289d6605e8..edafd0a132 100644
--- a/src/GPU/pair_lj_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_gpu.cpp
@@ -46,12 +46,13 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen);
 void ljl_gpu_clear();
-int * ljl_gpu_compute_n(const int ago, const int inum,
-                        const int nall, double **host_x, int *host_type,
-                        double *boxlo, double *boxhi, int *tag, int **nspecial,
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success);
+int ** ljl_gpu_compute_n(const int ago, const int inum,
+                         const int nall, double **host_x, int *host_type,
+                         double *boxlo, double *boxhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum,
+                         const double cpu_time, bool &success);
 void ljl_gpu_compute(const int ago, const int inum, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
@@ -89,30 +90,30 @@ void PairLJCutGPU::compute(int eflag, int vflag)
   int inum, host_start;
 
   bool success = true;
-
+  int *ilist, *numneigh, **firstneigh;
   if (gpu_mode == GPU_NEIGH) {
     inum = atom->nlocal;
-    gpulist = ljl_gpu_compute_n(neighbor->ago, inum, nall,
-                                atom->x, atom->type, domain->sublo,
-                                domain->subhi, atom->tag, atom->nspecial,
-                                atom->special, eflag, vflag, eflag_atom,
-                                vflag_atom, host_start, cpu_time, success);
+    firstneigh = ljl_gpu_compute_n(neighbor->ago, inum, nall,
+                                   atom->x, atom->type, domain->sublo,
+                                   domain->subhi, atom->tag, atom->nspecial,
+                                   atom->special, eflag, vflag, eflag_atom,
+                                   vflag_atom, host_start,
+                                   &ilist, &numneigh, cpu_time, success);
   } else {
     inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
     ljl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
-                    list->ilist, list->numneigh, list->firstneigh, eflag,
-                    vflag, eflag_atom, vflag_atom, host_start, cpu_time,
-                    success);
+                    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+                    vflag_atom, host_start, cpu_time, success);
   }
   if (!success)
     error->one("Out of memory on GPGPU");
 
   if (host_start<inum) {
[...]
   double **x = atom->x;
   double **f = atom->f;
@@ -185,11 +187,6 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) {
   int nall = nlocal + atom->nghost;
   double *special_lj = force->special_lj;
 
-  inum = list->inum;
-  ilist = list->ilist;
-  numneigh = list->numneigh;
-  firstneigh = list->firstneigh;
-
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
@@ -237,78 +234,3 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) {
       }
     }
   }
 }
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
-  int i,j,itype,jtype;
-  int nlocal = atom->nlocal;
-  int nall = nlocal + atom->nghost;
-  int stride = nlocal-start;
-  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
-  double rsq,r2inv,r6inv,forcelj,factor_lj;
-  double *special_lj = force->special_lj;
-
-  double **x = atom->x;
-  double **f = atom->f;
-  int *type = atom->type;
-
-  // loop over neighbors of my atoms
-
-  for (i = start; i < nlocal; i++) {
-    xtmp = x[i][0];
-    ytmp = x[i][1];
-    ztmp = x[i][2];
-    itype = type[i];
-    int *nbor = nbors + i - start;
-    int jnum = *nbor;
-    nbor += stride;
-    int *nbor_end = nbor + stride * jnum;
-
-    for (; nbor
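
The hunks above replace the old column-interleaved host neighbor buffer
(stride nt-inum, with the counts folded in as a leading row) with one
contiguous row of width _max_nbors per host atom, a separate dev_host_numj
count array, and a host_jlist array of per-atom row pointers. A minimal
standalone C++ sketch of that layout follows; the struct and method names
are hypothetical illustrations, not code from the tree.

#include <vector>

// Dense host neighbor storage after this patch: one fixed-width row per
// host atom, counts kept separately, plus per-atom row pointers.
struct HostNborLayout {
  int max_nbors;                  // row width (neigh_bin_size in the kernel)
  std::vector<int>  host_nbor;    // max_nbors * max_host entries
  std::vector<int>  host_numj;    // actual neighbor count per host atom
  std::vector<int*> host_jlist;   // host_jlist[i] points at row i

  void set_row_pointers(int max_host) {
    // Mirrors the pointer setup in PairGPUNbor::build_nbor_list above:
    // row i is contiguous (stride 1), unlike the old interleaved layout.
    int *ptr = host_nbor.data();
    for (int i = 0; i < max_host; i++) {
      host_jlist[i] = ptr;
      ptr += max_nbors;
    }
  }
};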
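On the caller side, a pair style consumes the values returned by the new
ljl_gpu_compute_n() roughly as follows: ilist is an identity map (alloc()
fills host_ilist[i]=i), numneigh comes from dev_host_numj via host_acc, and
the returned firstneigh pointer is pre-offset by -host_start so it can be
indexed by the atom index directly. The function below is a hypothetical
sketch under those assumptions, not code from the tree.

// Traverse the host-side portion [host_start, nlocal) of the split list.
void host_pair_loop(int host_start, int nlocal, int *ilist,
                    int *numneigh, int **firstneigh) {
  for (int ii = host_start; ii < nlocal; ii++) {
    int i = ilist[ii];            // identity mapping from host_ilist
    int jnum = numneigh[ii];      // count copied back from dev_host_numj
    int *jlist = firstneigh[ii];  // contiguous row owned by host_jlist
    for (int jj = 0; jj < jnum; jj++) {
      int j = jlist[jj];
      // ... accumulate the i-j interaction; with the half-to-full list
      // change, each pair appears in both atoms' rows, so energies and
      // virials need a 0.5 factor (or a j < i skip) ...
      (void)i; (void)j;           // placeholder for the pair kernel
    }
  }
}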