/***************************************************************************
                                  lj_gpu.cu
                             -------------------
                               W. Michael Brown

  Lennard-Jones potential GPU calculation

 __________________________________________________________________________
    This file is part of the LAMMPS GPU Library
 __________________________________________________________________________

    begin                : Tue Aug 4 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : wmbrown@sandia.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   ----------------------------------------------------------------------- */

// NOTE(review): the copy of this file that was reviewed had lost its line
// breaks and every angle-bracketed token (system header names, template
// parameter lists, template arguments, static_cast target types and the
// kernel execution configuration). Those pieces were reconstructed from how
// the names are used below; each reconstructed spot carries its own
// NOTE(review). Confirm them against the upstream file before merging.

// NOTE(review): the two system header names were missing. <iostream> (cout)
// and <cmath> (ceil) are required by this file; <cassert> is presumably what
// the headers included below expect - confirm. <string> is used directly.
#include <cassert>
#include <cmath>
#include <iostream>
#include <string>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "pair_gpu_texture.h"
#include "lj_gpu_memory.cu"
#include "lj_gpu_kernel.h"

using namespace std;

// Single file-scope instance holding all host/device state used below.
// NOTE(review): template arguments reconstructed; PRECISION / ACC_PRECISION
// are presumably provided by the project headers above - confirm.
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
// Shorthand used inside templates that have numtyp/acctyp in scope.
// NOTE(review): template arguments reconstructed - confirm.
#define LJMT LJ_GPU_Memory<numtyp,acctyp>

// ---------------------------------------------------------------------------
// Convert something to a string
// - Streams `in` into an ostringstream with precision 2 and returns the text
// ---------------------------------------------------------------------------
#include <sstream>

template <class t>
inline string lj_gpu_toa(const t& in) {
  ostringstream o;
  o.precision(2);
  o << in;
  return o.str();
}

// ---------------------------------------------------------------------------
// Return string with GPU info
// - id: device index passed to the LJMF.gpu queries
// - max_nbors: forwarded to LJMF.get_max_atoms together with the device's
//   byte count
// ---------------------------------------------------------------------------
string lj_gpu_name(const int id, const int max_nbors) {
  string name=LJMF.gpu.name(id)+", "+
              lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
              lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
              lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+
              lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id),
                                            max_nbors))+" Atoms";
  return name;
}

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// - ij_size is an output: it is set to IJ_SIZE before being passed on
// - Returns 0 when no device is available, otherwise whatever LJMF.init
//   returns
// ---------------------------------------------------------------------------
int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,
                  double **sigma, double **epsilon, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
                  double **offset, double *special_lj, const int max_nbors,
                  const int gpu_id) {
  if (LJMF.gpu.num_devices()==0)
    return 0;

  ij_size=IJ_SIZE;
  return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
                   host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
}

// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void lj_gpu_clear() {
  LJMF.clear();
}

// ---------------------------------------------------------------------------
// copy atom positions and optionally types to device
// - The whole sequence is bracketed by atom.time_atom start/stop
// ---------------------------------------------------------------------------
// NOTE(review): the template header and the template arguments of the
// PairGPUAtom parameter type were missing. The helper is written generically
// so the arguments are deduced from LJMF.atom at the call site; narrow it
// back to the concrete PairGPUAtom type if upstream differs.
template <class AtomTyp>
inline void _lj_gpu_atom(AtomTyp &atom, double **host_x,
                         const int *host_type, const bool rebuild,
                         cudaStream_t &stream) {
  atom.time_atom.start();
  atom.reset_write_buffer();

  // First row of dev_x is x position, second is y, third is z
  // (the second argument is presumably the host stride between consecutive
  //  atoms - confirm against PairGPUAtom::add_atom_data)
  atom.add_atom_data(host_x[0],3);
  atom.add_atom_data(host_x[0]+1,3);
  atom.add_atom_data(host_x[0]+2,3);

  int csize=3;

  // If a rebuild occurred, copy type data
  if (rebuild) {
    atom.add_atom_data(host_type);
    csize++;
  }

  atom.copy_atom_data(csize,stream);
  atom.time_atom.stop();
}

void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) {
  _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream);
}

// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// - Returns false, without touching any state, when nall exceeds
//   ljm.max_atoms; returns true otherwise
// ---------------------------------------------------------------------------
// NOTE(review): template header reconstructed from the LJMTyp parameter.
template <class LJMTyp>
bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum,
                         int *ilist, const int *numj) {
  if (nall>ljm.max_atoms)
    return false;

  ljm.nbor.time_nbor.start();

  ljm.atom.nall(nall);
  ljm.atom.inum(inum);
  ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream);

  ljm.nbor.time_nbor.stop();
  return true;
}

bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
                        const int *numj) {
  return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj);
}

// ---------------------------------------------------------------------------
// Hand a set of num_ij ij interactions to the neighbor storage on
// ljm.pair_stream. The interval timed by the previous start/stop pair is
// folded into the running total before the timer is restarted.
// ---------------------------------------------------------------------------
// NOTE(review): template header reconstructed from the LJMTyp parameter.
template <class LJMTyp>
void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
  ljm.nbor.time_nbor.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed

  ljm.nbor.time_nbor.start();
  ljm.nbor.add(num_ij,ljm.pair_stream);
  ljm.nbor.time_nbor.stop();
}

void lj_gpu_nbors(const int num_ij) {
  _lj_gpu_nbors(LJMF,num_ij);
}

// ---------------------------------------------------------------------------
// Calculate energies and forces for all ij interactions
// - Launches kernel_lj_fast when ljm.shared_types is set, kernel_lj otherwise
// - rebuild is accepted but not used here
// ---------------------------------------------------------------------------
// NOTE(review): the template header, the static_cast target types, the
// kernel template arguments and the execution configuration were missing.
// GX/BX are the only grid/block candidates in scope and every copy in this
// file is issued on ljm.pair_stream, hence <<<GX,BX,0,ljm.pair_stream>>>.
// Confirm the kernel template arguments against lj_gpu_kernel.h.
template <class numtyp, class acctyp>
void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag,
             const bool rebuild) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=BLOCK_1D;
  const int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));

  ljm.time_pair.start();

  // A grid with zero blocks is an invalid launch configuration, so skip the
  // launch when there is nothing to compute. The timer is still started and
  // stopped so that the later add_to_total() sees a matched pair.
  if (GX>0) {
    if (ljm.shared_types)
      kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
           (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
            ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
            ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
            vflag, ljm.atom.inum(), ljm.atom.nall());
    else
      kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
           (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
            ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
            ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
            vflag, ljm.atom.inum(), ljm.atom.nall());
  }

  ljm.time_pair.stop();
}

void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
  _lj_gpu(LJMF,eflag,vflag,rebuild);
}

// ---------------------------------------------------------------------------
// Get energies and forces to host
// - Starts the answer copy on ljm.pair_stream, folds the atom, neighbor and
//   pair timers into their totals, then accumulates energy/virial and forces
// - Returns the value produced by ljm.atom.energy_virial
// ---------------------------------------------------------------------------
// NOTE(review): template header reconstructed to match the LJMT macro.
template <class numtyp, class acctyp>
double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
                      const bool eflag, const bool vflag,
                      const bool eflag_atom, const bool vflag_atom,
                      double *eatom, double **vatom, double *virial) {
  ljm.atom.time_answer.start();
  ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream);

  ljm.atom.time_atom.add_to_total();
  ljm.nbor.time_nbor.add_to_total();
  ljm.time_pair.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed

  const double evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,
                                           vatom,virial);
  ljm.atom.add_forces(ilist,f);
  ljm.atom.time_answer.stop();
  ljm.atom.time_answer.add_to_total();
  return evdw;
}

double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
                     const bool vflag, const bool eflag_atom,
                     const bool vflag_atom, double *eatom, double **vatom,
                     double *virial) {
  // Template arguments are deduced from LJMF.
  return _lj_gpu_forces(LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,
                        vatom,virial);
}

// ---------------------------------------------------------------------------
// Print the accumulated timer totals to stdout
// - cout's precision is set to 4 for the report and restored afterwards so
//   the caller's stream formatting is left untouched
// ---------------------------------------------------------------------------
void lj_gpu_time() {
  const streamsize old_precision=cout.precision(4);
  cout << "Atom copy:     " << LJMF.atom.time_atom.total_seconds() << " s.\n";
  cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
  cout << "LJ calc:       " << LJMF.time_pair.total_seconds() << " s.\n";
  cout << "Answer copy:   " << LJMF.atom.time_answer.total_seconds() << " s.\n";
  cout.precision(old_precision);
}

// Number of devices reported by LJMF.gpu
int lj_gpu_num_devices() {
  return LJMF.gpu.num_devices();
}

// Value reported by LJMF.host_memory_usage()
double lj_gpu_bytes() {
  return LJMF.host_memory_usage();
}