/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include "pair_gpu_atom.h"

#define PairGPUAtomT PairGPUAtom<numtyp,acctyp>

#ifdef WINDLL
#include <windows.h>
typedef bool (*__win_sort_alloc)(const int max_atoms);
typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin,
                           int *particle_begin);
__win_sort_alloc _win_sort_alloc;
__win_sort _win_sort;
#endif

template <class numtyp, class acctyp>
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
                              _vflag(false),_inum(0),_ilist(NULL) {
#ifndef USE_OPENCL
  // Configure a CUDPP radix sort on unsigned keys with attached values
  sort_config.op = CUDPP_ADD;
  sort_config.datatype = CUDPP_UINT;
  sort_config.algorithm = CUDPP_SORT_RADIX;
  sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;

#ifdef WINDLL
  // On Windows, the CUDPP sort entry points are resolved at run time
  HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
  if (hinstLib == NULL) {
    printf("\nUnable to load gpu.dll\n");
    exit(1);
  }
  _win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc");
  _win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort");
#endif

#endif
}

template <class numtyp, class acctyp>
int PairGPUAtomT::bytes_per_atom() const {
  int id_space=0;
  if (_gpu_nbor)
    id_space=2;
  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
  if (_rot)
    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
  if (_charge)
    bytes+=sizeof(numtyp);
  return bytes;
}

template <class numtyp, class acctyp>
bool PairGPUAtomT::alloc(const int max_atoms) {
  bool success=true;

  // 4 answer fields per atom for force; 4 more for torque with rotation
  int ans_elements=4;
  if (_rot)
    ans_elements+=4;

  // Ignore host/device transfers?
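  // On a CPU "device" the allocations below are set up as views that
  // alias the host buffers, so position/answer data never crosses a bus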
  bool cpuview=false;
  if (dev->device_type()==UCL_CPU)
    cpuview=true;

  // Allocate storage for CUDPP sort
#ifndef USE_OPENCL
#ifdef WINDLL
  _win_sort_alloc(max_atoms);
#else
  if (_gpu_nbor) {
    CUDPPResult result = cudppPlan(&sort_plan, sort_config, max_atoms, 1, 0);
    if (CUDPP_SUCCESS != result)
      return false;
  }
#endif
#endif

  // --------------------------   Host allocations
  // Get a host write only buffer
#ifdef GPU_CAST
  success=success && (host_x_cast.alloc(max_atoms*3,*dev,
                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
  success=success && (host_type_cast.alloc(max_atoms,*dev,
                                           UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#else
  success=success && (host_x.alloc(max_atoms*4,*dev,
                                   UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#endif
  success=success && (host_ans.alloc(ans_elements*max_atoms,*dev)==UCL_SUCCESS);
  success=success && (host_engv.alloc(_ev_fields*max_atoms,*dev)==UCL_SUCCESS);
  // Buffer for casting only if different precisions
  if (_charge)
    success=success && (host_q.alloc(max_atoms,*dev,
                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
  // Buffer for casting only if different precisions
  if (_rot)
    success=success && (host_quat.alloc(max_atoms*4,*dev,
                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);

  // ---------------------------  Device allocations
  _gpu_bytes=0;
  if (cpuview) {
#ifdef GPU_CAST
    assert(0==1);
#else
    dev_x.view(host_x);
#endif
    dev_engv.view(host_engv);
    dev_ans.view(host_ans);
    if (_rot)
      dev_quat.view(host_quat);
    if (_charge)
      dev_q.view(host_q);
  } else {
#ifdef GPU_CAST
    success=success && (UCL_SUCCESS==dev_x.alloc(max_atoms*4,*dev));
    success=success && (UCL_SUCCESS==
                        dev_x_cast.alloc(max_atoms*3,*dev,UCL_READ_ONLY));
    success=success && (UCL_SUCCESS==
                        dev_type_cast.alloc(max_atoms,*dev,UCL_READ_ONLY));
    _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
    success=success && (UCL_SUCCESS==
                        dev_x.alloc(max_atoms*4,*dev,UCL_READ_ONLY));
#endif
    success=success && (dev_engv.alloc(_ev_fields*max_atoms,*dev,
                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
    success=success && (dev_ans.alloc(ans_elements*max_atoms,
                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
    if (_charge) {
      success=success && (dev_q.alloc(max_atoms,*dev,
                                      UCL_READ_ONLY)==UCL_SUCCESS);
      _gpu_bytes+=dev_q.row_bytes();
    }
    if (_rot) {
      success=success && (dev_quat.alloc(max_atoms*4,*dev,
                                         UCL_READ_ONLY)==UCL_SUCCESS);
      _gpu_bytes+=dev_quat.row_bytes();
    }
  }
  if (_gpu_nbor) {
    success=success && (dev_cell_id.alloc(max_atoms,*dev)==UCL_SUCCESS);
    success=success && (dev_particle_id.alloc(max_atoms,*dev)==UCL_SUCCESS);
    _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
    if (_bonds) {
      success=success && (dev_tag.alloc(max_atoms,*dev)==UCL_SUCCESS);
      _gpu_bytes+=dev_tag.row_bytes();
    }
  }

  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();

  return success;
}

template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
                        const bool bonds) {
  clear();

  bool success=true;
  _gpu_nbor=gpu_nbor;
  _bonds=bonds;
  _charge=charge;
  _rot=rot;
  _other=_charge || _rot;
  dev=&devi;

  _e_fields=1;
  if (_charge)
    _e_fields++;
  _ev_fields=6+_e_fields;

  // Initialize atom and nbor data
  int max_local=static_cast<int>(static_cast<double>(inum)*1.10);
  if (max_local==0)
    max_local=1000;
  if (nall<=inum)
    _max_atoms=max_local*2;
  else
    _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);

  // Initialize timers for the selected device
  time_pos.init(*dev);
  time_other.init(*dev);
  time_answer.init(*dev);
  time_pos.zero();
  time_other.zero();
  time_answer.zero();
  _time_cast=0.0;

#ifdef GPU_CAST
  compile_kernels(*dev);
#endif

  _allocated=true;
  return success && alloc(_max_atoms);
}
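/* A minimal calling sketch for init() (hypothetical caller: the device
   setup, variable names, and error handling are assumptions, not part of
   this file):

     UCL_Device gpu;
     gpu.set(0);                               // select first accelerator
     PairGPUAtom<PRECISION,ACC_PRECISION> atom;
     // inum local atoms, nall local+ghost atoms, charges on, no
     // quaternions, host-built neighbor lists, no bond data on device
     bool init_ok=atom.init(inum,nall,true,false,gpu,false,false);

   Note that init() pads the requested sizes by ~10% (see max_local above)
   so small per-step fluctuations in atom counts do not force an immediate
   reallocation. */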
template <class numtyp, class acctyp>
void PairGPUAtomT::clear_resize() {
  if (!_allocated)
    return;
  _allocated=false;

  dev_x.clear();
  if (_charge) {
    dev_q.clear();
    host_q.clear();
  }
  if (_rot) {
    dev_quat.clear();
    host_quat.clear();
  }
  dev_ans.clear();
  dev_engv.clear();
#ifndef GPU_CAST
  host_x.clear();
#else
  host_x_cast.clear();
  host_type_cast.clear();
#endif
  host_ans.clear();
  host_engv.clear();
  dev_cell_id.clear();
  dev_particle_id.clear();
  dev_tag.clear();
#ifdef GPU_CAST
  dev_x_cast.clear();
  dev_type_cast.clear();
#endif

#ifndef USE_OPENCL
#ifndef WINDLL
  if (_gpu_nbor)
    cudppDestroyPlan(sort_plan);
#endif
#endif
}

template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
  _gpu_bytes=0;
  if (!_allocated)
    return;

  time_pos.clear();
  time_other.clear();
  time_answer.clear();
  clear_resize();

  _inum=0;
  _eflag=false;
  _vflag=false;

#ifdef GPU_CAST
  if (_compiled) {
    k_cast_x.clear();
    delete atom_program;
    _compiled=false;
  }
#endif
}

template <class numtyp, class acctyp>
double PairGPUAtomT::host_memory_usage() const {
  int atom_bytes=4;
  if (_charge)
    atom_bytes+=1;
  if (_rot)
    atom_bytes+=4;
  int ans_bytes=atom_bytes+_ev_fields;
  return _max_atoms*atom_bytes*sizeof(numtyp)+
         ans_bytes*(_max_atoms)*sizeof(acctyp)+
         sizeof(PairGPUAtom<numtyp,acctyp>);
}

// Copy answers from the device into read buffers on the host
template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
                                const bool ef_atom, const bool vf_atom) {
  time_answer.start();
  _eflag=eflag;
  _vflag=vflag;
  _ef_atom=ef_atom;
  _vf_atom=vf_atom;

  // Only copy the energy/virial fields that were requested
  int csize=_ev_fields;
  if (!eflag)
    csize-=_e_fields;
  if (!vflag)
    csize-=6;

  if (csize>0)
    ucl_copy(host_engv,dev_engv,_inum*csize,true);
  if (_rot)
    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
  else
    ucl_copy(host_ans,dev_ans,_inum*4,true);
  time_answer.stop();
}

template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
                                const bool ef_atom, const bool vf_atom,
                                int *ilist) {
  _ilist=ilist;
  copy_answers(eflag,vflag,ef_atom,vf_atom);
}

template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
                                   double *virial) {
  if (_eflag==false && _vflag==false)
    return 0.0;

  // host_engv is stored field-major: _inum energy values followed by
  // _inum values for each of the 6 virial terms, hence the _inum strides
  double evdwl=0.0;
  if (_gpu_nbor) {
    for (int i=0; i<_inum; i++) {
      acctyp *ap=host_engv.begin()+i;
      if (_eflag) {
        if (_ef_atom) {
          evdwl+=*ap;
          eatom[i]+=*ap*0.5;
          ap+=_inum;
        } else {
          evdwl+=*ap;
          ap+=_inum;
        }
      }
      if (_vflag) {
        if (_vf_atom) {
          for (int j=0; j<6; j++) {
            vatom[i][j]+=*ap*0.5;
            virial[j]+=*ap;
            ap+=_inum;
          }
        } else {
          for (int j=0; j<6; j++) {
            virial[j]+=*ap;
            ap+=_inum;
          }
        }
      }
    }
    for (int j=0; j<6; j++)
      virial[j]*=0.5;
  } else {
    for (int i=0; i<_inum; i++) {
      acctyp *ap=host_engv.begin()+i;
      int ii=_ilist[i];
      if (_eflag) {
        if (_ef_atom) {
          evdwl+=*ap;
          eatom[ii]+=*ap*0.5;
          ap+=_inum;
        } else {
          evdwl+=*ap;
          ap+=_inum;
        }
      }
      if (_vflag) {
        if (_vf_atom) {
          for (int j=0; j<6; j++) {
            vatom[ii][j]+=*ap*0.5;
            virial[j]+=*ap;
            ap+=_inum;
          }
        } else {
          for (int j=0; j<6; j++) {
            virial[j]+=*ap;
            ap+=_inum;
          }
        }
      }
    }
    for (int j=0; j<6; j++)
      virial[j]*=0.5;
  }
  evdwl*=0.5;
  return evdwl;
}

template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
                                   double *virial, double &ecoul) {
  if (_eflag==false && _vflag==false) {
    ecoul=0.0;
    return 0.0;
  }

  if (_charge==false)
    return energy_virial(eatom,vatom,virial);

  // With charges, each atom stores two energy fields: van der Waals
  // followed by Coulombic
  double evdwl=0.0;
  double _ecoul=0.0;
  if (_gpu_nbor) {
    for (int i=0; i<_inum; i++) {
      acctyp *ap=host_engv.begin()+i;
      if (_eflag) {
        if (_ef_atom) {
          evdwl+=*ap;
          eatom[i]+=*ap*0.5;
          ap+=_inum;
          _ecoul+=*ap;
          eatom[i]+=*ap*0.5;
          ap+=_inum;
        } else {
          evdwl+=*ap;
          ap+=_inum;
          _ecoul+=*ap;
          ap+=_inum;
        }
      }
      if (_vflag) {
        if (_vf_atom) {
          for (int j=0; j<6; j++) {
            vatom[i][j]+=*ap*0.5;
            virial[j]+=*ap;
            ap+=_inum;
          }
        } else {
          for (int j=0; j<6; j++) {
            virial[j]+=*ap;
            ap+=_inum;
          }
        }
      }
    }
    for (int j=0; j<6; j++)
      virial[j]*=0.5;
  }
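  // Host-built neighbor lists return answers in list order, so each row
  // is mapped through _ilist before accumulating per-atom quantities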
  else {
    for (int i=0; i<_inum; i++) {
      acctyp *ap=host_engv.begin()+i;
      int ii=_ilist[i];
      if (_eflag) {
        if (_ef_atom) {
          evdwl+=*ap;
          eatom[ii]+=*ap*0.5;
          ap+=_inum;
          _ecoul+=*ap;
          eatom[ii]+=*ap*0.5;
          ap+=_inum;
        } else {
          evdwl+=*ap;
          ap+=_inum;
          _ecoul+=*ap;
          ap+=_inum;
        }
      }
      if (_vflag) {
        if (_vf_atom) {
          for (int j=0; j<6; j++) {
            vatom[ii][j]+=*ap*0.5;
            virial[j]+=*ap;
            ap+=_inum;
          }
        } else {
          for (int j=0; j<6; j++) {
            virial[j]+=*ap;
            ap+=_inum;
          }
        }
      }
    }
    for (int j=0; j<6; j++)
      virial[j]*=0.5;
  }
  evdwl*=0.5;
  ecoul+=_ecoul*0.5;
  return evdwl;
}

template <class numtyp, class acctyp>
void PairGPUAtomT::get_answers(double **f, double **tor) {
  // Answers are packed per atom as 4-vectors; the unused 4th component
  // of each force/torque entry is skipped (ap+=2 after the z component)
  acctyp *ap=host_ans.begin();
  if (_gpu_nbor) {
    for (int i=0; i<_inum; i++) {
      f[i][0]+=*ap;
      ap++;
      f[i][1]+=*ap;
      ap++;
      f[i][2]+=*ap;
      ap+=2;
    }
    if (_rot) {
      for (int i=0; i<_inum; i++) {
        tor[i][0]+=*ap;
        ap++;
        tor[i][1]+=*ap;
        ap++;
        tor[i][2]+=*ap;
        ap+=2;
      }
    }
  } else {
    for (int i=0; i<_inum; i++) {
      int ii=_ilist[i];
      f[ii][0]+=*ap;
      ap++;
      f[ii][1]+=*ap;
      ap++;
      f[ii][2]+=*ap;
      ap+=2;
    }
    if (_rot) {
      for (int i=0; i<_inum; i++) {
        int ii=_ilist[i];
        tor[ii][0]+=*ap;
        ap++;
        tor[ii][1]+=*ap;
        ap++;
        tor[ii][2]+=*ap;
        ap+=2;
      }
    }
  }
}

// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void PairGPUAtomT::sort_neighbor(const int num_atoms) {
#ifndef USE_OPENCL
#ifdef WINDLL
  _win_sort(num_atoms,(unsigned *)dev_cell_id.begin(),
            (int *)dev_particle_id.begin());
#else
  CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
                                 (int *)dev_particle_id.begin(),
                                 8*sizeof(unsigned), num_atoms);
  if (CUDPP_SUCCESS != result) {
    printf("Error in cudppSort\n");
    NVD_GERYON_EXIT;
  }
#endif
#endif
}

#ifdef GPU_CAST
#ifdef USE_OPENCL
#include "pair_gpu_atom_cl.h"
#else
#include "pair_gpu_atom_ptx.h"
#endif

template <class numtyp, class acctyp>
void PairGPUAtomT::compile_kernels(UCL_Device &dev) {
  atom_program=new UCL_Program(dev);
  atom_program->load_string(pair_gpu_atom_kernel,"");
  k_cast_x.set_function(*atom_program,"kernel_cast_x");
  _compiled=true;
}
#endif

template class PairGPUAtom<PRECISION,ACC_PRECISION>;
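/* Typical per-step answer retrieval by a pair style, as a sketch only;
   names such as eflag, vflag, ilist, eatom, vatom, virial, eng_coul, f,
   and torque follow LAMMPS conventions, but the exact call site is an
   assumption:

     atom.copy_answers(eflag,vflag,eflag_atom,vflag_atom,ilist); // async
     // ... overlap remaining host-side work here ...
     eng_vdwl+=atom.energy_virial(eatom,vatom,virial,eng_coul);
     atom.get_answers(f,torque);

   copy_answers() enqueues asynchronous device-to-host copies, so callers
   must ensure those copies have completed before reading the results. */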