/*************************************************************************** gb_gpu.cu ------------------- W. Michael Brown Gay-Berne anisotropic potential GPU calcultation *** Force decomposition by Atom Version *** __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Jun 23 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. ----------------------------------------------------------------------- */ #include #include #include "nvc_macros.h" #include "nvc_timer.h" #include "nvc_device.h" #include "gb_gpu_memory.cu" #include "gb_gpu_kernel.h" using namespace std; static GB_GPU_Memory GBMF[MAX_GPU_THREADS]; #define GBMT GB_GPU_Memory // --------------------------------------------------------------------------- // Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access // -- Only pack neighbors matching the specified inclusive range of forms // -- Only pack neighbors within cutoff // --------------------------------------------------------------------------- template __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch, const int start, const int inum, const int *dev_ij, const int form_low, const int form_high, const int nall) { // ii indexes the two interacting particles in gi int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start; if (ii(i,0); numtyp iy=_x_(i,1); numtyp iz=_x_(i,2); int itype=_x_(i,7); int newj=0; for ( ; list=nall) j%=nall; int jtype=_x_(j,7); if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) { // Compute r12; numtyp rsq=_x_(j,0)-ix; rsq*=rsq; numtyp t=_x_(j,1)-iy; rsq+=t*t; t=_x_(j,2)-iz; rsq+=t*t; if (rsq< _cutsq_(itype,jtype)) { *nbor=j; nbor+=nbor_pitch; newj++; } } } *nbor_newj=newj; } } // --------------------------------------------------------------------------- // Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access // -- Only pack neighbors matching the specified inclusive range of forms // -- Only pack neighbors within cutoff // -- Fast version of routine that uses shared memory for LJ constants // --------------------------------------------------------------------------- template __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch, const int start, const int inum, const int *dev_ij, const int form_low, const int form_high, const int nall) { int ii=threadIdx.x; __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (ii(itype,jtype); form[ii]=_form_(itype,jtype); } ii+=INT_MUL(blockIdx.x,blockDim.x)+start; __syncthreads(); if (ii(i,0); numtyp iy=_x_(i,1); numtyp iz=_x_(i,2); int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,7)); int newj=0; for ( ; list=nall) j%=nall; int mtype=itype+_x_(j,7); if (form[mtype]>=form_low && form[mtype]<=form_high) { // Compute r12; numtyp rsq=_x_(j,0)-ix; rsq*=rsq; numtyp t=_x_(j,1)-iy; rsq+=t*t; t=_x_(j,2)-iz; rsq+=t*t; if (rsq void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, const int inum, const int form_low, const int form_high) { if (gbm.shared_types) kernel_pack_nbor_fast<<>> (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); else kernel_pack_nbor<<>> (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); } // --------------------------------------------------------------------------- // Convert something to a string // --------------------------------------------------------------------------- #include template inline string gb_gpu_toa(const t& in) { ostringstream o; o.precision(2); o << in; return o.str(); } // --------------------------------------------------------------------------- // Return string with GPU info // --------------------------------------------------------------------------- string gb_gpu_name(const int id, const int max_nbors) { string name=GBMF[0].gpu.name(id)+", "+ gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+ gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+ gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+ gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id), max_nbors))+" Atoms"; return name; } // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma, const double upsilon, const double mu, double **shape, double **well, double **cutsq, double **sigma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int max_nbors, const int thread, const int gpu_id) { assert(thread inline void _gb_gpu_atom(PairGPUAtom &atom, double **host_x, double **host_quat, const int *host_type, const bool rebuild, cudaStream_t &stream) { atom.time_atom.start(); atom.reset_write_buffer(); // Rows 1-3 of dev_x are position; rows 4-7 are quaternion atom.add_atom_data(host_x[0],3); atom.add_atom_data(host_x[0]+1,3); atom.add_atom_data(host_x[0]+2,3); atom.add_atom_data(host_quat[0],4); atom.add_atom_data(host_quat[0]+1,4); atom.add_atom_data(host_quat[0]+2,4); atom.add_atom_data(host_quat[0]+3,4); int csize=7; // If a rebuild occured, copy type data if (rebuild) { atom.add_atom_data(host_type); csize++; } atom.copy_atom_data(csize,stream); atom.time_atom.stop(); } void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type, const bool rebuild, const int thread) { _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild, GBMF[thread].pair_stream); } // --------------------------------------------------------------------------- // Signal that we need to transfer a new neighbor list // --------------------------------------------------------------------------- template int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal, const int inum, int *ilist, const int *numj, const int *type, bool &success) { if (nall>gbm.max_atoms) { success=false; return 0; } success=true; gbm.nbor.time_nbor.start(); gbm.atom.nall(nall); gbm.atom.inum(inum); if (gbm.multiple_forms) { int p=0, acc=0; for (int i=0; i void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) { gbm.nbor.time_nbor.add_to_total(); // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed gbm.nbor.time_nbor.start(); gbm.nbor.add(num_ij,gbm.pair_stream); gbm.nbor.time_nbor.stop(); } void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) { _gb_gpu_nbors(GBMF[thread],num_ij,eflag); } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques for all ij interactions // --------------------------------------------------------------------------- template void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag, const bool rebuild) { // Compute the block size and grid size to keep all cores busy const int BX=BLOCK_1D; int GX=static_cast(ceil(static_cast(gbm.atom.inum())/BX)); if (gbm.multiple_forms) { gbm.time_kernel.start(); if (gbm.last_ellipse>0) { // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ static_cast(BX))); pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); kernel_gayberne<<>> (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, gbm.atom.nall()); gbm.time_gayberne.stop(); if (gbm.last_ellipse==gbm.atom.inum()) { gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); gbm.time_gayberne2.stop(); gbm.time_pair.start(); gbm.time_pair.stop(); return; } // ------------ SPHERE_ELLIPSE --------------- gbm.time_kernel2.start(); GX=static_cast(ceil(static_cast(gbm.atom.inum()- gbm.last_ellipse)/BX)); pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE, ELLIPSE_SPHERE); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); kernel_sphere_gb<<>> (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); gbm.time_gayberne2.stop(); } else { gbm.atom.ans.zero(); gbm.time_kernel.stop(); gbm.time_gayberne.start(); gbm.time_gayberne.stop(); gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); gbm.time_gayberne2.stop(); } // ------------ LJ --------------- gbm.time_pair.start(); if (gbm.last_ellipse<<>> (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); else kernel_lj<<>> (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); } gbm.time_pair.stop(); } else { gbm.time_kernel.start(); pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); kernel_gayberne<<>> (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(), eflag, vflag, gbm.atom.inum(), gbm.atom.nall()); gbm.time_gayberne.stop(); } } void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, const int thread) { _gb_gpu_gayberne(GBMF[thread],eflag,vflag,rebuild); } // --------------------------------------------------------------------------- // Get energies, forces, and torques to host // --------------------------------------------------------------------------- template double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist, const bool eflag, const bool vflag, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial) { double evdw; gbm.atom.time_answer.start(); gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream); gbm.atom.time_atom.add_to_total(); gbm.nbor.time_nbor.add_to_total(); gbm.time_kernel.add_to_total(); gbm.time_gayberne.add_to_total(); if (gbm.multiple_forms) { gbm.time_kernel2.add_to_total(); gbm.time_gayberne2.add_to_total(); gbm.time_pair.add_to_total(); } // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); gbm.atom.add_forces(ilist,f); gbm.atom.add_torques(ilist,tor,gbm.last_ellipse); gbm.atom.time_answer.stop(); gbm.atom.time_answer.add_to_total(); return evdw; } double gb_gpu_forces(double **f, double **tor, const int *ilist, const bool eflag, const bool vflag, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial, const int thread) { return _gb_gpu_forces (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom, vflag_atom,eatom,vatom,virial); } void gb_gpu_time(const int i) { cout.precision(4); cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds() << " s.\n" << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds() << " s.\n" << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+ GBMF[i].time_kernel2.total_seconds() << " s.\n" << "Force calc: " << GBMF[i].time_gayberne.total_seconds()+ GBMF[i].time_gayberne2.total_seconds()<< " s.\n"; if (GBMF[i].multiple_forms) cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n"; cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds() << " s.\n"; } int gb_gpu_num_devices() { return GBMF[0].gpu.num_devices(); } double gb_gpu_bytes() { return GBMF[0].host_memory_usage(); }