/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#include <iostream>
#include <cassert>
#include <cstring>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "gb_gpu_memory.cu"
#include "gb_gpu_kernel.h"

using namespace std;

static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
#define GBMT GB_GPU_Memory<numtyp,acctyp>

// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor,
                                 const int nbor_pitch, const int start,
                                 const int inum, const int *dev_ij,
                                 const int form_low, const int form_high,
                                 const int nall) {

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {
    // Read this atom's index, neighbor count, and ij offset from dev_nbor
    int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;
    int *nbor_newj=nbor;
    nbor+=nbor_pitch;

    vec4 ix=x_[i];
    int itype=ix.w;

    int newj=0;
    for ( ; list<list_end; list++) {
      int j=*list;
      if (j>=nall)
        j%=nall;
      vec4 jx=x_[j];
      int jtype=jx.w;

      if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
        // Compute r12;
        numtyp rsq=jx.x-ix.x;
        rsq*=rsq;
        numtyp t=jx.y-ix.y;
        rsq+=t*t;
        t=jx.z-ix.z;
        rsq+=t*t;

        if (rsq< _cutsq_(itype,jtype)) {
          *nbor=j;
          nbor+=nbor_pitch;
          newj++;
        }
      }
    }
    *nbor_newj=newj;
  }
}

// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor,
                                      const int nbor_pitch, const int start,
                                      const int inum, const int *dev_ij,
                                      const int form_low, const int form_high,
                                      const int nall) {

  int ii=threadIdx.x;
  __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    form[ii]=_form_(itype,jtype);
  }
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
  __syncthreads();

  if (ii<inum) {
    // Read this atom's index, neighbor count, and ij offset from dev_nbor
    int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;
    int *nbor_newj=nbor;
    nbor+=nbor_pitch;

    vec4 ix=x_[i];
    int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);

    int newj=0;
    for ( ; list<list_end; list++) {
      int j=*list;
      if (j>=nall)
        j%=nall;
      vec4 jx=x_[j];
      int jtype=jx.w;
      int mtype=itype+jtype;

      if (form[mtype]>=form_low && form[mtype]<=form_high) {
        // Compute r12;
        numtyp rsq=jx.x-ix.x;
        rsq*=rsq;
        numtyp t=jx.y-ix.y;
        rsq+=t*t;
        t=jx.z-ix.z;
        rsq+=t*t;

        if (rsq<cutsq[mtype]) {
          *nbor=j;
          nbor+=nbor_pitch;
          newj++;
        }
      }
    }
    *nbor_newj=newj;
  }
}
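// ---------------------------------------------------------------------------
// Layout of the packed dev_nbor matrix used by the two kernels above, for
// reference. Each local atom ii owns one column and rows are strided by
// nbor_pitch, so consecutive threads touch consecutive addresses and reads
// coalesce across a warp:
//   row 0  - atom index i
//   row 1  - neighbor count from the host list
//   row 2  - offset into dev_ij (overwritten with the packed count newj)
//   row 3+ - packed neighbor indices within the cutoff
// The helper below is a hypothetical sketch of the address arithmetic, not
// part of the original interface.
// ---------------------------------------------------------------------------
#if 0   // Illustrative only, not compiled
static __device__ __host__ inline int packed_nbor_idx(const int ii,
                                                      const int row,
                                                      const int nbor_pitch) {
  // Address of entry (row,ii) in the column-per-atom neighbor matrix
  return row*nbor_pitch+ii;
}
#endif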
template<class numtyp, class acctyp>
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
                const int inum, const int form_low, const int form_high) {
  if (gbm.shared_types)
    kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
      ((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
       gbm.atom.inum(), start, inum, gbm.nbor.ij.begin(),
       form_low, form_high, gbm.atom.nall());
  else
    kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
      ((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
       gbm.atom.inum(), start, inum, gbm.nbor.ij.begin(),
       form_low, form_high, gbm.atom.nall());
}

// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
template <class t>
inline string gb_gpu_toa(const t& in) {
  ostringstream o;
  o.precision(2);
  o << in;
  return o.str();
}

// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
  string sname=GBMF[0].gpu.name(id)+", "+
               gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
               gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
               gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
  strcpy(name,sname.c_str());
}

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
                        const double upsilon, const double mu, double **shape,
                        double **well, double **cutsq, double **sigma,
                        double **epsilon, double *host_lshape, int **form,
                        double **host_lj1, double **host_lj2,
                        double **host_lj3, double **host_lj4,
                        double **offset, double *special_lj, const int nlocal,
                        const int nall, const int max_nbors, const int thread,
                        const int gpu_id) {
  assert(thread<MAX_GPU_THREADS);

  if (GBMF[thread].gpu.num_devices()==0)
    return false;

  return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape, well,
                           cutsq, sigma, epsilon, host_lshape, form, host_lj1,
                           host_lj2, host_lj3, host_lj4, offset, special_lj,
                           nlocal, nall, max_nbors, gpu_id);
}

// ---------------------------------------------------------------------------
// Copy updated atom positions and quaternions to device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
                         double **host_quat, const int *host_type,
                         const bool rebuild, cudaStream_t &stream) {
  atom.time_atom.start();
  atom.reset_write_buffer();

  // Rows 1-3 of dev_x are position; rows 4-7 are quaternion
  atom.add_x_data(host_x,host_type);
  atom.add_q_data(host_quat[0]);

  atom.copy_x_data(stream);
  atom.copy_q_data(stream);
  atom.time_atom.stop();
}

EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
                        const int *host_type, const bool rebuild,
                        const int thread) {
  _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
               GBMF[thread].pair_stream);
}

// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// ---------------------------------------------------------------------------
template <class gbmtyp>
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
                          const int inum, int *ilist, const int *numj,
                          const int *type, bool &success) {
  success=true;

  gbm.nbor.time_nbor.start();

  // Largest per-atom neighbor count determines device storage
  int mn=0;
  for (int i=0; i<inum; i++)
    mn=std::max(mn,numj[i]);

  if (nall>gbm.max_atoms)
    gbm.resize_atom(nall,success);
  if (nlocal>gbm.max_local || mn>gbm._max_nbors)
    gbm.resize_local(nlocal,mn,success);
  if (!success)
    return false;

  gbm.atom.nall(nall);
  gbm.atom.inum(inum);

  if (gbm.multiple_forms) {
    // Reorder the list so ellipses come first and spheres last; the
    // boundary (last_ellipse) lets separate kernels handle each class
    int p=0;
    for (int i=0; i<inum; i++) {
      int itype=type[ilist[i]];
      if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
        gbm.host_olist[p]=ilist[i];
        p++;
      }
    }
    gbm.last_ellipse=p;
    for (int i=0; i<inum; i++) {
      int itype=type[ilist[i]];
      if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
        gbm.host_olist[p]=ilist[i];
        p++;
      }
    }

    // Copy the reordered indices into row 0 of dev_nbor, then stage the
    // neighbor counts (row 1) and ij offsets (row 2) through the two
    // halves of the page-locked host_ij buffer in fixed-size chunks
    gbm.host_olist.copy_to_device(gbm.nbor.dev_nbor.begin(),inum,
                                  gbm.pair_stream);
    int ij_size=gbm.nbor.host_ij.numel();
    int half=ij_size/2;
    int offset=0, hi=0, acc=0;
    for (int i=0; i<inum; i++) {
      gbm.nbor.host_ij[hi]=numj[gbm.host_olist[i]];
      gbm.nbor.host_ij[hi+half]=acc;
      acc+=numj[gbm.host_olist[i]];
      hi++;
      if (hi==half) {
        gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
                                        hi,gbm.pair_stream);
        gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
                                        inum*2+offset, hi,gbm.pair_stream);
        offset+=hi;
        hi=0;
      }
    }
    if (hi>0) {
      gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
                                      hi,gbm.pair_stream);
      gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
                                      inum*2+offset, hi,gbm.pair_stream);
    }
    gbm.nbor.ij_total=0;
  } else {
    gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
    gbm.last_ellipse=inum;
  }

  gbm.nbor.time_nbor.stop();

  if (gbm.multiple_forms)
    return gbm.host_olist.begin();
  return ilist;
}

EXTERN int * gb_gpu_reset_nbors(const int nall, const int nlocal,
                                const int inum, int *ilist, const int *numj,
                                const int *type, const int thread,
                                bool &success) {
  return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
                             success);
}
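// ---------------------------------------------------------------------------
// After a neighbor rebuild, the ij interactions are streamed to the device
// in fixed-size chunks through the page-locked host_ij buffer. A minimal
// host-side sketch of that protocol, assuming ij_size was returned by
// gb_gpu_init and that ij_buffer and nbor_list are hypothetical host
// arrays standing in for the caller's data:
// ---------------------------------------------------------------------------
#if 0   // Example usage, not compiled
  int num_ij=0;
  for (int ii=0; ii<inum; ii++) {          // Gather each atom's neighbors
    int i=ilist[ii];
    for (int jj=0; jj<numj[i]; jj++) {
      ij_buffer[num_ij++]=nbor_list[i][jj];
      if (num_ij==ij_size) {               // Flush a full chunk to the GPU
        gb_gpu_nbors(ij_buffer,num_ij,eflag,thread);
        num_ij=0;
      }
    }
  }
  if (num_ij>0)                            // Flush the remainder
    gb_gpu_nbors(ij_buffer,num_ij,eflag,thread);
#endif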
// ---------------------------------------------------------------------------
// Copy a set of ij_size ij interactions to device and compute energies,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
                   const bool eflag) {
  gbm.nbor.time_nbor.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed

  memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
  gbm.nbor.time_nbor.start();
  gbm.nbor.add(num_ij,gbm.pair_stream);
  gbm.nbor.time_nbor.stop();
}

EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
                         const int thread) {
  _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
}

template<class numtyp, class acctyp>
void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
  gbm.atom.time_answer.start();
  gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
  gbm.atom.time_answer.stop();
}
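// ---------------------------------------------------------------------------
// The launches below size the grid as ceil(n/BLOCK_1D) so that one thread
// is assigned per atom. A hypothetical integer equivalent of that
// computation, shown for reference only (the code itself uses ceil on
// doubles):
// ---------------------------------------------------------------------------
#if 0   // Illustrative helper, not part of the original file
static inline int block_count(const int n, const int block_size) {
  // Same result as static_cast<int>(ceil(static_cast<double>(n)/block_size))
  return (n+block_size-1)/block_size;
}
#endif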
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques for all ij interactions
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
                      const bool rebuild) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=BLOCK_1D;
  int ans_pitch=6;
  if (eflag)
    ans_pitch++;
  if (vflag)
    ans_pitch+=6;

  int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));

  if (gbm.multiple_forms) {
    gbm.time_kernel.start();
    if (gbm.last_ellipse>0) {
      // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
      GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
                               static_cast<double>(BX)));
      pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,
                 ELLIPSE_ELLIPSE);
      gbm.time_kernel.stop();

      gbm.time_gayberne.start();
      kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
        ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
         gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
         gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.atom.ans.begin(),
         ans_pitch,gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
         gbm.atom.nall());
      gbm.time_gayberne.stop();

      if (gbm.last_ellipse==gbm.atom.inum()) {
        gbm.time_kernel2.start();
        gbm.time_kernel2.stop();
        gbm.time_gayberne2.start();
        gbm.time_gayberne2.stop();
        gbm.time_pair.start();
        gbm.time_pair.stop();
        return;
      }

      // ------------ SPHERE_ELLIPSE ---------------
      gbm.time_kernel2.start();
      GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
                               gbm.last_ellipse)/BX));
      pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
                 ELLIPSE_SPHERE);
      gbm.time_kernel2.stop();

      gbm.time_gayberne2.start();
      kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
        ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
         gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
         gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.atom.ans.begin(),
         ans_pitch,gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
         gbm.atom.inum(), gbm.atom.nall());
      gbm.time_gayberne2.stop();
    } else {
      gbm.atom.ans.zero();
      gbm.time_kernel.stop();
      gbm.time_gayberne.start();
      gbm.time_gayberne.stop();
      gbm.time_kernel2.start();
      gbm.time_kernel2.stop();
      gbm.time_gayberne2.start();
      gbm.time_gayberne2.stop();
    }

    // ------------ LJ ---------------
    gbm.time_pair.start();
    if (gbm.last_ellipse<gbm.atom.inum()) {
      if (gbm.shared_types)
        kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
          ((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
           gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
           gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
           vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
      else
        kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
          ((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
           gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
           gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(), eflag,
           vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
    }
    gbm.time_pair.stop();
  } else {
    gbm.time_kernel.start();
    pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,
               ELLIPSE_ELLIPSE);
    gbm.time_kernel.stop();

    gbm.time_gayberne.start();
    kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
      ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
       gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
       gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.atom.ans.begin(),
       ans_pitch, gbm.dev_error.begin(), eflag, vflag, gbm.atom.inum(),
       gbm.atom.nall());
    gbm.time_gayberne.stop();
  }
}

EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag,
                            const bool rebuild, const int thread) {
  _gb_gpu_gayberne(GBMF[thread],eflag,vflag,rebuild);
  _gb_gpu_enqueue(GBMF[thread],eflag,vflag);
}

// ---------------------------------------------------------------------------
// Get energies, forces, and torques to host
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
                      const bool eflag, const bool vflag,
                      const bool eflag_atom, const bool vflag_atom,
                      double *eatom, double **vatom, double *virial) {
  double evdw=0.0;

  gbm.atom.time_atom.add_to_total();
  gbm.nbor.time_nbor.add_to_total();
  gbm.time_kernel.add_to_total();
  gbm.time_gayberne.add_to_total();
  if (gbm.multiple_forms) {
    gbm.time_kernel2.add_to_total();
    gbm.time_gayberne2.add_to_total();
    gbm.time_pair.add_to_total();
  }
  CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));

  if (gbm.last_ellipse>gbm.atom.inum()) {
    if (eflag || vflag)
      evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,
                                  virial,f,tor,gbm.atom.inum());
    else
      gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
  } else {
    if (eflag || vflag)
      evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,
                                  virial,f,tor,gbm.last_ellipse);
    else
      gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
  }
  gbm.atom.time_answer.add_to_total();
  return evdw;
}

EXTERN double gb_gpu_forces(double **f, double **tor, const int *ilist,
                            const bool eflag, const bool vflag,
                            const bool eflag_atom, const bool vflag_atom,
                            double *eatom, double **vatom, double *virial,
                            const int thread) {
  return _gb_gpu_forces(GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
                        vflag_atom,eatom,vatom,virial);
}

EXTERN void gb_gpu_time(const int i) {
  cout.precision(4);
  cout << "Atom copy:     " << GBMF[i].atom.time_atom.total_seconds()
       << " s.\n"
       << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds()
       << " s.\n"
       << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
                               GBMF[i].time_kernel2.total_seconds() << " s.\n"
       << "Force calc:    " << GBMF[i].time_gayberne.total_seconds()+
                               GBMF[i].time_gayberne2.total_seconds()
       << " s.\n";
  if (GBMF[i].multiple_forms)
    cout << "LJ calc:       " << GBMF[i].time_pair.total_seconds() << " s.\n";
  cout << "Answer copy:   " << GBMF[i].atom.time_answer.total_seconds()
       << " s.\n";
}

EXTERN int gb_gpu_num_devices() {
  return GBMF[0].gpu.num_devices();
}

EXTERN double gb_gpu_bytes() {
  return GBMF[0].host_memory_usage();
}
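// ---------------------------------------------------------------------------
// Typical per-timestep call sequence from the host pair style, as a sketch.
// This is illustrative only: the surrounding variables (nall, nlocal, inum,
// ilist, numj, type, the host arrays, and thread) are assumptions standing
// in for the caller's data, and error handling is elided.
// ---------------------------------------------------------------------------
#if 0   // Example usage, not compiled
  bool success;
  int *olist=ilist;
  if (rebuild) {
    // New neighbor list: sort/stage it, then stream the ij chunks
    olist=gb_gpu_reset_nbors(nall,nlocal,inum,ilist,numj,type,thread,
                             success);
    // ... fill and flush ij chunks with gb_gpu_nbors() as shown earlier ...
  }
  gb_gpu_atom(host_x,host_quat,host_type,rebuild,thread); // Positions/quats
  gb_gpu_gayberne(eflag,vflag,rebuild,thread);            // Launch kernels
  double evdw=gb_gpu_forces(f,tor,olist,eflag,vflag,eflag_atom,vflag_atom,
                            eatom,vatom,virial,thread);   // Sync, copy back
#endif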