From 3803804d3806aaf643a1bc06c19c6e8aa887e167 Mon Sep 17 00:00:00 2001 From: "W. Michael Brown" Date: Sat, 3 Dec 2011 21:52:19 -0500 Subject: [PATCH] Removing the need for 2 allocations for fp on the host. --- lib/gpu/lal_eam.cpp | 29 +++--------- lib/gpu/lal_eam.h | 25 ++--------- lib/gpu/lal_eam_ext.cpp | 25 ++++++----- src/GPU/pair_eam_gpu.cpp | 97 +++++++++++++++++++++++++--------------- src/GPU/pair_eam_gpu.h | 6 ++- src/MANYBODY/pair_eam.h | 4 +- 6 files changed, 93 insertions(+), 93 deletions(-) diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 7b76e6893e..d95ddc1a98 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -236,7 +236,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *fp) { + bool &success, void **fp_ptr) { this->acc_timers(); if (this->device->time_device()) { @@ -277,7 +277,8 @@ void EAMT::compute(const int f_ago, const int inum_full, dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY); fp_tex.bind_float(dev_fp,1); - } + } + *fp_ptr=host_fp.begin(); // ----------------------------------------------------------------- @@ -296,14 +297,6 @@ void EAMT::compute(const int f_ago, const int inum_full, time_fp1.start(); ucl_copy(host_fp,dev_fp,false); time_fp1.stop(); - - double t = MPI_Wtime(); - numtyp *ap=host_fp.begin(); - for (int i=0; iatom->add_cast_time(MPI_Wtime() - t); } // --------------------------------------------------------------------------- @@ -318,7 +311,7 @@ int** EAMT::compute(const int ago, const int inum_full, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double *fp, int &inum) { + int &inum, void **fp_ptr) { this->acc_timers(); if (this->device->time_device()) { @@ -361,6 +354,7 @@ int** EAMT::compute(const int ago, const int inum_full, fp_tex.bind_float(dev_fp,1); } + *fp_ptr=host_fp.begin(); // 
----------------------------------------------------------------- @@ -384,14 +378,6 @@ int** EAMT::compute(const int ago, const int inum_full, ucl_copy(host_fp,dev_fp,false); time_fp1.stop(); - double t = MPI_Wtime(); - numtyp *ap=host_fp.begin(); - for (int i=0; iatom->add_cast_time(MPI_Wtime() - t); - return this->nbor->host_jlist.begin()-host_start; } @@ -400,10 +386,9 @@ int** EAMT::compute(const int ago, const int inum_full, // --------------------------------------------------------------------------- template void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, double *host_fp) { - time_fp2.start(); - this->cast_fp_data(host_fp); + const bool eatom, const bool vatom) { this->hd_balancer.start_timer(); + time_fp2.start(); this->add_fp_data(); time_fp2.stop(); diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h index 100d850cd7..c07297da7c 100644 --- a/lib/gpu/lal_eam.h +++ b/lib/gpu/lal_eam.h @@ -47,24 +47,6 @@ class EAM : public BaseAtomic { const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen); - // Cast fp to write buffer -// template - inline void cast_fp_data(double *host_ptr) { - int nall = this->atom->nall(); - if (this->ucl_device->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_fp.view((numtyp*)host_ptr,nall,*(this->ucl_device)); - dev_fp.view(host_fp); - } else - for (int i=0; iatom->nall(),true); @@ -85,7 +67,8 @@ class EAM : public BaseAtomic { double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *fp); + const double cpu_time, bool &success, + void **fp_ptr); /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, @@ -94,11 +77,11 @@ class EAM : public BaseAtomic { int **special, const bool eflag, const bool vflag, const bool eatom, const 
bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - double *fp, int &inum); + int &inum, void **fp_ptr); /// Pair loop with host neighboring void compute2(int *ilist, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, double *host_fp); + const bool eatom, const bool vatom); // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_energy; diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index 5add042b83..0b4b155964 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -35,7 +35,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, - int &gpu_mode, FILE *screen) { + int &gpu_mode, FILE *screen, int &fp_size) { EAMMF.clear(); gpu_mode=EAMMF.device->gpu_mode(); double gpu_split=EAMMF.device->particle_split(); @@ -49,6 +49,8 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_split != 1.0) return -8; + fp_size=sizeof(PRECISION); + EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu); bool message=false; @@ -114,23 +116,24 @@ int ** eam_gpu_compute_energy_n(const int ago, const int inum_full, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_fp, double *boxlo, - double *prd, int &inum) { + bool &success, double *boxlo, + double *prd, int &inum, void **fp_ptr) { return EAMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, - host_fp, inum); + inum, fp_ptr); } void eam_gpu_compute_energy(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool 
vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_fp, - const int nlocal, double *boxlo, double *prd) { + const double cpu_time, bool &success, + const int nlocal, double *boxlo, double *prd, + void **fp_ptr) { EAMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_fp); + fp_ptr); } void eam_gpu_compute_n(const int ago, const int inum_full, @@ -139,18 +142,18 @@ void eam_gpu_compute_n(const int ago, const int inum_full, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_fp, double *boxlo, + bool &success, double *boxlo, double *prd, int inum) { - EAMMF.compute2(NULL, eflag, vflag, eatom, vatom, host_fp); + EAMMF.compute2(NULL, eflag, vflag, eatom, vatom); } void eam_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_fp, + const double cpu_time, bool &success, const int nlocal, double *boxlo, double *prd) { - EAMMF.compute2(ilist, eflag, vflag, eatom, vatom, host_fp); + EAMMF.compute2(ilist, eflag, vflag, eatom, vatom); } double eam_gpu_bytes() { diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp index 3fefffbb2b..c68fe9e43c 100644 --- a/src/GPU/pair_eam_gpu.cpp +++ b/src/GPU/pair_eam_gpu.cpp @@ -33,9 +33,6 @@ using namespace LAMMPS_NS; -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) - #define MAXLINE 1024 // External functions from cuda library for atom decomposition @@ -49,7 +46,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, - int &gpu_mode, FILE *screen); + int &gpu_mode, FILE *screen, int &fp_size); void eam_gpu_clear(); int** eam_gpu_compute_energy_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, @@ -57,27 +54,28 @@ int** eam_gpu_compute_energy_n(const int ago, const int inum_full, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_fp, double *boxlo, - double *prd, int &inum); + bool &success, double *boxlo, + double *prd, int &inum, void **fp_ptr); void eam_gpu_compute_energy(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_fp, - const int nlocal, double *boxlo, double *prd); + const double cpu_time, bool &success, + const int nlocal, double *boxlo, double *prd, + void **fp_ptr); void eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_fp, double *boxlo, + bool &success, double *boxlo, double *prd, int inum); void eam_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool 
vatom, int &host_start, - const double cpu_time, bool &success, double *host_fp, + const double cpu_time, bool &success, const int nlocal, double *boxlo, double *prd); double eam_gpu_bytes(); @@ -117,28 +115,9 @@ void PairEAMGPU::compute(int eflag, int vflag) if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = eflag_global = eflag_atom = 0; - // grow energy and fp arrays if necessary - // need to be atom->nmax in length - - if (atom->nmax > nmax) { - memory->destroy(rho); - memory->destroy(fp); - nmax = atom->nmax; - memory->create(rho,nmax,"pair:rho"); - memory->create(fp,nmax,"pair:fp"); - } - int nlocal = atom->nlocal; int newton_pair = force->newton_pair; - // zero out density - - if (newton_pair) { - m = nlocal + atom->nghost; - for (i = 0; i < m; i++) rho[i] = 0.0; - } else for (i = 0; i < nlocal; i++) rho[i] = 0.0; - - // compute density on each atom on GPU int nall = atom->nlocal + atom->nghost; @@ -154,8 +133,8 @@ void PairEAMGPU::compute(int eflag, int vflag) atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, fp, domain->boxlo, - domain->prd, inum_dev); + success, domain->boxlo, + domain->prd, inum_dev, &fp_pinned); } else { // gpu_mode == GPU_FORCE inum = list->inum; ilist = list->ilist; @@ -163,8 +142,9 @@ void PairEAMGPU::compute(int eflag, int vflag) firstneigh = list->firstneigh; eam_gpu_compute_energy(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, fp, - atom->nlocal, domain->boxlo, domain->prd); + vflag_atom, host_start, cpu_time, success, + atom->nlocal, domain->boxlo, domain->prd, + &fp_pinned); } if (!success) @@ -189,12 +169,12 @@ void PairEAMGPU::compute(int eflag, int vflag) atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, fp, domain->boxlo, + success, 
domain->boxlo, domain->prd, inum_dev); } else { // gpu_mode == GPU_FORCE eam_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, fp, + vflag_atom, host_start, cpu_time, success, atom->nlocal, domain->boxlo, domain->prd); } @@ -404,12 +384,13 @@ void PairEAMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int fp_size; int success = eam_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, atom->nlocal, atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); + cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { @@ -417,8 +398,52 @@ void PairEAMGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } + + if (fp_size == sizeof(double)) + fp_single = false; + else + fp_single = true; } +/* ---------------------------------------------------------------------- */ +int PairEAMGPU::pack_comm(int n, int *list, double *buf, int pbc_flag, + int *pbc) +{ + int i,j,m; + m = 0; + if (fp_single) { + float *fp_ptr = (float *)fp_pinned; + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = static_cast<double>(fp_ptr[j]); + } + } else { + double *fp_ptr = (double *)fp_pinned; + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = fp_ptr[j]; + } + } + + return 1; +} + +/* ---------------------------------------------------------------------- */ + +void PairEAMGPU::unpack_comm(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + if (fp_single) { + float *fp_ptr = (float *)fp_pinned; + for (i = first; i < last; i++) fp_ptr[i] = buf[m++]; + } else { + double *fp_ptr = (double *)fp_pinned; + for (i = first; i < last; i++) fp_ptr[i] = buf[m++]; + } +} diff --git a/src/GPU/pair_eam_gpu.h
b/src/GPU/pair_eam_gpu.h index b7eaffcfa1..7cdecb0105 100644 --- a/src/GPU/pair_eam_gpu.h +++ b/src/GPU/pair_eam_gpu.h @@ -36,13 +36,17 @@ class PairEAMGPU : public PairEAM { void init_style(); double memory_usage(); + int pack_comm(int, int *, double *, int, int *); + void unpack_comm(int, int, double *); + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; private: int gpu_mode; double cpu_time; int *gpulist; - + void *fp_pinned; + bool fp_single; }; } diff --git a/src/MANYBODY/pair_eam.h b/src/MANYBODY/pair_eam.h index 1b7b1f1f00..677d83f4e7 100644 --- a/src/MANYBODY/pair_eam.h +++ b/src/MANYBODY/pair_eam.h @@ -53,8 +53,8 @@ class PairEAM : public Pair { double init_one(int, int); double single(int, int, int, int, double, double, double, double &); - int pack_comm(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); + virtual int pack_comm(int, int *, double *, int, int *); + virtual void unpack_comm(int, int, double *); int pack_reverse_comm(int, int, double *); void unpack_reverse_comm(int, int *, double *); double memory_usage();