/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #include #include #include #include "gb_gpu_memory.h" using namespace std; static GB_GPU_Memory GBMF; #define GBMT GB_GPU_Memory template void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, const int inum, const int form_low, const int form_high) { int stride=gbm.nbor->nbor_pitch(); int anall=gbm.atom->nall(); if (gbm.shared_types) { GBMF.k_gb_nbor_fast.set_size(GX,BX); GBMF.k_gb_nbor_fast.run(&gbm.atom->dev_x.begin(), &gbm.cut_form.begin(), &gbm.nbor->dev_nbor.begin(), &stride, &start, &inum, &gbm.nbor->dev_packed.begin(), &form_low, &form_high, &anall); } else { GBMF.k_gb_nbor.set_size(GX,BX); GBMF.k_gb_nbor.run(&gbm.atom->dev_x.begin(), &gbm.cut_form.begin(), &gbm._lj_types, &gbm.nbor->dev_nbor.begin(), &stride, &start, &inum, &gbm.nbor->dev_packed.begin(), &form_low, &form_high, &anall); } } // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, const double mu, double **shape, double **well, double **cutsq, double **sigma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); gpu_mode=GBMF.device->gpu_mode(); double gpu_split=GBMF.device->particle_split(); int first_gpu=GBMF.device->first_device(); int last_gpu=GBMF.device->last_device(); int world_me=GBMF.device->world_me(); int gpu_rank=GBMF.device->gpu_rank(); int procs_per_gpu=GBMF.device->procs_per_gpu(); GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu); bool message=false; if (GBMF.device->replica_me()==0 && screen) message=true; if (message) { fprintf(screen,"Initializing GPU and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, epsilon, host_lshape, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; igpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) GBMF.estimate_gpu_overhead(); return init_ok; } // --------------------------------------------------------------------------- // Clear memory on host and device // --------------------------------------------------------------------------- void gb_gpu_clear() { GBMF.clear(); } // --------------------------------------------------------------------------- // Build neighbor list on device // --------------------------------------------------------------------------- template inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum, const int host_inum, const int nall, double **host_x, double **host_quat, int *host_type, double *sublo, double *subhi, bool &success) { gbm.nbor_time_avail=true; success=true; gbm.resize_atom(inum,nall,success); gbm.resize_local(inum,host_inum,gbm.nbor->max_nbors(),0,success); if (!success) return; gbm.atom->cast_copy_x(host_x,host_type); int mn; gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom, sublo, subhi, NULL, NULL, NULL, success, mn); gbm.nbor->copy_unpacked(inum,mn); gbm.last_ellipse=inum; gbm.max_last_ellipse=inum; } // --------------------------------------------------------------------------- // Copy neighbor list from host and (if spheres) reorder so ellipses first // --------------------------------------------------------------------------- template void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int inum, const int osize, int *ilist, int *numj, int *type, int **firstneigh, bool &success) { success=true; gbm.nbor_time_avail=true; int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist); gbm.resize_atom(inum,nall,success); gbm.resize_local(inum,0,mn,osize,success); if (!success) return; if (gbm.multiple_forms) { int p=0; for (int i=0; iget_host(inum,gbm.host_olist.begin(),numj,firstneigh, gbm.block_size()); gbm.nbor->copy_unpacked(inum,mn); return; } gbm.last_ellipse=inum; gbm.max_last_ellipse=inum; gbm.nbor->get_host(inum,ilist,numj,firstneigh,gbm.block_size()); gbm.nbor->copy_unpacked(inum,mn); } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=gbm.block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast(ceil(static_cast(gbm.ans->inum())/ (BX/gbm._threads_per_atom))); int stride=gbm.nbor->nbor_pitch(); int ainum=gbm.ans->inum(); int anall=gbm.atom->nall(); if (gbm.multiple_forms) { gbm.time_kernel.start(); if (gbm.last_ellipse>0) { // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ (BX/gbm._threads_per_atom))); gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); GBMF.k_gayberne.set_size(GX,BX); GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(), &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall, &gbm._threads_per_atom); gbm.time_gayberne.stop(); if (gbm.last_ellipse==gbm.ans->inum()) { gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); gbm.time_gayberne2.stop(); gbm.time_pair.start(); gbm.time_pair.stop(); return; } // ------------ SPHERE_ELLIPSE --------------- gbm.time_kernel2.start(); GX=static_cast(ceil(static_cast(gbm.ans->inum()- gbm.last_ellipse)/ (BX/gbm._threads_per_atom))); gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(), SPHERE_ELLIPSE,SPHERE_ELLIPSE); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); GBMF.k_sphere_gb.set_size(GX,BX); GBMF.k_sphere_gb.run(&gbm.atom->dev_x.begin(),&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(), &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, &gbm._threads_per_atom); gbm.time_gayberne2.stop(); } else { gbm.ans->dev_ans.zero(); gbm.ans->dev_engv.zero(); gbm.time_kernel.stop(); gbm.time_gayberne.start(); gbm.time_gayberne.stop(); gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); gbm.time_gayberne2.stop(); } // ------------ LJ --------------- gbm.time_pair.start(); if (gbm.last_ellipseinum()) { if (gbm.shared_types) { GBMF.k_lj_fast.set_size(GX,BX); GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), &gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(), &stride, &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(), &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, &gbm._threads_per_atom); } else { GBMF.k_lj.set_size(GX,BX); GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), &gbm.lj3.begin(), &gbm._lj_types, &gbm.gamma_upsilon_mu.begin(), &stride, &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(), &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, &gbm._threads_per_atom); } } gbm.time_pair.stop(); } else { gbm.time_kernel.start(); gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); GBMF.k_gayberne.set_size(GX,BX); GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(), &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(), &ainum, &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom); gbm.time_gayberne.stop(); } } // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary and then compute forces, torques, energies // --------------------------------------------------------------------------- template inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { gbm.acc_timers(); if (inum_full==0) { host_start=0; gbm.zero_timers(); return NULL; } gbm.hd_balancer.balance(cpu_time); int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full); gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { _gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x, host_quat, host_type, sublo, subhi, success); if (!success) return NULL; gbm.atom->cast_quat_data(host_quat[0]); gbm.hd_balancer.start_timer(); } else { gbm.atom->cast_x_data(host_x,host_type); gbm.atom->cast_quat_data(host_quat[0]); gbm.hd_balancer.start_timer(); gbm.atom->add_x_data(host_x,host_type); } gbm.atom->add_quat_data(); *ilist=gbm.nbor->host_ilist.begin(); *jnum=gbm.nbor->host_acc.begin(); _gb_gpu_gayberne(gbm,eflag,vflag); gbm.ans->copy_answers(eflag,vflag,eatom,vatom); gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); return gbm.nbor->host_jlist.begin()-host_start; } int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo, subhi, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, torques,.. // --------------------------------------------------------------------------- template inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full, const int nall,double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { gbm.acc_timers(); if (inum_full==0) { host_start=0; gbm.zero_timers(); return NULL; } int ago=gbm.hd_balancer.ago_first(f_ago); int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time); gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; if (ago==0) { _gb_gpu_reset_nbors(gbm, nall, inum, inum_full, ilist, numj, host_type, firstneigh, success); if (!success) return NULL; } int *list; if (gbm.multiple_forms) list=gbm.host_olist.begin(); else list=ilist; gbm.atom->cast_x_data(host_x,host_type); gbm.atom->cast_quat_data(host_quat[0]); gbm.hd_balancer.start_timer(); gbm.atom->add_x_data(host_x,host_type); gbm.atom->add_quat_data(); _gb_gpu_gayberne(gbm,eflag,vflag); gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list); gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); return list; } int * gb_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, host_quat); } // --------------------------------------------------------------------------- // Return memory usage // --------------------------------------------------------------------------- double gb_gpu_bytes() { return GBMF.host_memory_usage(); }