/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains cetain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing author: Mike Brown (SNL) ------------------------------------------------------------------------- */ #include "math.h" #include "stdio.h" #include "stdlib.h" #include "pair_gayberne_gpu.h" #include "math_extra.h" #include "atom.h" #include "atom_vec.h" #include "comm.h" #include "force.h" #include "neighbor.h" #include "neigh_list.h" #include "integrate.h" #include "memory.h" #include "error.h" #include "neigh_request.h" #include "universe.h" #include #ifdef GB_GPU_OMP #include "omp.h" #endif #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) // External functions from cuda library for atom decomposition int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma, const double upsilon, const double mu, double **shape, double **well, double **cutsq, double **sigma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int max_nbors, const int thread, const int gpu_id); void gb_gpu_clear(const int thread); int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, int *ilist, const int *numj, const int *type, const int thread, bool &success); void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread); void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type, const bool rebuild, const int thread); void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, const int thread); double gb_gpu_forces(double **f, double **tor, const int *ilist, const bool eflag, const bool vflag, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial, const int thread); std::string gb_gpu_name(const int i, const int max_nbors); void gb_gpu_time(const int thread); int gb_gpu_num_devices(); double gb_gpu_bytes(); using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0), omp_chunk(0), nthreads(1), multi_gpu_mode(ONE_NODE), multi_gpu_param(0) { ij_new[0]=NULL; } /* ---------------------------------------------------------------------- free all arrays ------------------------------------------------------------------------- */ PairGayBerneGPU::~PairGayBerneGPU() { if (comm->me == 0 && screen) { printf("\n\n-------------------------------------"); printf("--------------------------------\n"); printf(" GPU Time Stamps: "); printf("\n-------------------------------------"); printf("--------------------------------\n"); gb_gpu_time(my_thread); printf("Procs: %d\n",comm->nprocs); printf("-------------------------------------"); printf("--------------------------------\n\n"); } #pragma omp parallel { #ifdef GB_GPU_OMP int my_thread=omp_get_thread_num(); #endif gb_gpu_clear(my_thread); if (ij_new[my_thread]!=NULL) { ij_new[my_thread]=NULL; delete [] ij_new[my_thread]; } } } /* ---------------------------------------------------------------------- */ void PairGayBerneGPU::compute(int eflag, int vflag) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; if (vflag_atom) error->all("Per-atom virial not available with GPU Gay-Berne."); int nlocal = atom->nlocal; int nall = nlocal + atom->nghost; int inum = list->inum; bool rebuild=false; if (neighbor->ncalls > last_neighbor) { last_neighbor=neighbor->ncalls; rebuild=true; } #pragma omp parallel { bool success=true; #ifdef GB_GPU_OMP int my_thread=omp_get_thread_num(); if (rebuild) { omp_chunk=static_cast(ceil(static_cast(inum)/nthreads)); if (my_thread==nthreads-1) thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk; else thread_inum[my_thread]=omp_chunk; olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, thread_inum[my_thread], list->ilist+omp_chunk*my_thread, list->numneigh, atom->type, my_thread, success); } #else if (rebuild) olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist, list->numneigh, atom->type, my_thread, success); #endif if (!success) error->one("Total # of atoms exceeds maximum allowed per GPGPU.\n"); // copy atom data to GPU gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread); int i,j,ii,jj,jnum; double factor_lj; int *jlist; if (rebuild==true) { int num_ij = 0; // loop over neighbors of my atoms int *ijp=ij_new[my_thread]; #ifdef GB_GPU_OMP int mgo=my_thread*omp_chunk; int mgot=mgo+thread_inum[my_thread]; #else int mgo=0, mgot=inum; #endif for (ii = mgo; iifirstneigh[i]; jnum = list->numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; *ijp=j; ijp++; num_ij++; if (num_ij==ij_size) { memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int)); gb_gpu_nbors(num_ij,eflag,my_thread); ijp=ij_new[my_thread]; num_ij=0; } } } if (num_ij>0) { memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int)); gb_gpu_nbors(num_ij,eflag,my_thread); } } gb_gpu_gayberne(eflag,vflag,rebuild,my_thread); double lvirial[6]; for (int i=0; i<6; i++) lvirial[i]=0.0; double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag, eflag_atom, vflag_atom, eatom, vatom, lvirial, my_thread); #pragma omp critical { eng_vdwl+=my_eng; virial[0]+=lvirial[0]; virial[1]+=lvirial[1]; virial[2]+=lvirial[2]; virial[3]+=lvirial[3]; virial[4]+=lvirial[4]; virial[5]+=lvirial[5]; } } //End parallel if (vflag_fdotr) virial_compute(); } /* ---------------------------------------------------------------------- global settings ------------------------------------------------------------------------- */ void PairGayBerneGPU::settings(int narg, char **arg) { // strip off GPU keyword/value and send remaining args to parent if (narg < 2) error->all("Illegal pair_style command"); // set multi_gpu_mode to one/node for multiple gpus on 1 node // -- param is starting gpu id // set multi_gpu_mode to one/gpu to select the same gpu id on every node // -- param is id of gpu // set multi_gpu_mode to multi/gpu to get ma // -- param is number of gpus per node if (strcmp(arg[0],"one/node") == 0) multi_gpu_mode = ONE_NODE; else if (strcmp(arg[0],"one/gpu") == 0) multi_gpu_mode = ONE_GPU; else if (strcmp(arg[0],"multi/gpu") == 0) multi_gpu_mode = MULTI_GPU; else error->all("Illegal pair_style command"); multi_gpu_param = atoi(arg[1]); if (multi_gpu_mode == MULTI_GPU && multi_gpu_param < 1) error->all("Illegal pair_style command"); PairGayBerne::settings(narg-2,&arg[2]); } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairGayBerneGPU::init_style() { if (force->pair_match("gpu",0) == NULL) error->all("Cannot use pair hybrid with multiple GPU pair styles"); if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type) error->all("Pair gayberne requires atom attributes quat, torque, shape"); if (atom->radius_flag) error->all("Pair gayberne cannot be used with atom attribute diameter"); // set the GPU ID int my_gpu=comm->me+multi_gpu_param; int ngpus=universe->nprocs; if (multi_gpu_mode==ONE_GPU) { my_gpu=multi_gpu_param; ngpus=1; } else if (multi_gpu_mode==MULTI_GPU) { ngpus=multi_gpu_param; my_gpu=comm->me%ngpus; if (ngpus>universe->nprocs) ngpus=universe->nprocs; } int irequest = neighbor->request(this); // per-type shape precalculations for (int i = 1; i <= atom->ntypes; i++) { if (setwell[i]) { double *one = atom->shape[i]; shape[i][0] = one[0]*one[0]; shape[i][1] = one[1]*one[1]; shape[i][2] = one[2]*one[2]; lshape[i] = (one[0]*one[1]+one[2]*one[2])*sqrt(one[0]*one[1]); } } // Repeat cutsq calculation because done after call to init_style double cut; for (int i = 1; i <= atom->ntypes; i++) for (int j = i; j <= atom->ntypes; j++) { cut = init_one(i,j); cutsq[i][j] = cutsq[j][i] = cut*cut; } // If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads #ifdef GB_GPU_OMP if (multi_gpu_mode!=ONE_GPU) nthreads=ngpus=gb_gpu_num_devices(); else nthreads=ngpus=1; if (nthreads>MAX_GPU_THREADS) nthreads=MAX_GPU_THREADS; omp_set_num_threads(nthreads); #endif #pragma omp parallel firstprivate(my_gpu) { #ifdef GB_GPU_OMP int my_thread = omp_get_thread_num(); if (multi_gpu_mode!=ONE_GPU) my_gpu=my_thread; if (multi_gpu_mode==ONE_NODE) my_gpu+=multi_gpu_param; #endif ij[my_thread]=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu, shape, well, cutsq, sigma, epsilon, lshape, form, lj1, lj2, lj3, lj4, offset, force->special_lj, neighbor->oneatom, my_thread, my_gpu); if (ij[my_thread]==0) error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU."); if (ij_new[my_thread]!=NULL) delete [] ij_new[my_thread]; ij_new[my_thread]=new int[ij_size]; } last_neighbor = -1; neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; if (force->newton_pair) error->all("Cannot use newton with GPU GayBerne pair style."); if (comm->me == 0 && screen) { printf("\n-------------------------------------"); printf("-------------------------------------\n"); printf("- Using GPGPU acceleration for Gay-Berne:\n"); printf("-------------------------------------"); printf("-------------------------------------\n"); for (int i=0; ioneatom); printf("GPU %d: %s\n",gpui,gpu_string.c_str()); } printf("-------------------------------------"); printf("-------------------------------------\n\n"); } } /* ---------------------------------------------------------------------- */ double PairGayBerneGPU::memory_usage() { double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int); return bytes+gb_gpu_bytes(); }