/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include "pair_gpu_device.h"
#include "pair_gpu_precision.h"
#include <map>
#include <math.h>

#define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>

template <class numtyp, class acctyp>
PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
                                  _gpu_mode(GPU_FORCE), _first_device(0),
                                  _last_device(0) {
}

template <class numtyp, class acctyp>
PairGPUDeviceT::~PairGPUDevice() {
  clear_device();
}

template <class numtyp, class acctyp>
bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu,
                                 const int gpu_mode, const double p_split) {
  if (_device_init)
    return true;
  _device_init=true;
  _first_device=first_gpu;
  _last_device=last_gpu;
  _gpu_mode=gpu_mode;
  _particle_split=p_split;

  // Get the rank within the world
  MPI_Comm_rank(MPI_COMM_WORLD,&_world_me);
  MPI_Comm_size(MPI_COMM_WORLD,&_world_size);

  // Get the names of all nodes
  int name_length;
  char node_name[MPI_MAX_PROCESSOR_NAME];
  char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
  MPI_Get_processor_name(node_name,&name_length);
  MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
                MPI_MAX_PROCESSOR_NAME,MPI_CHAR,MPI_COMM_WORLD);
  std::string node_string=std::string(node_name);

  // Get the number of procs per node
  std::map<std::string,int> name_map;
  std::map<std::string,int>::iterator np;
  for (int i=0; i<_world_size; i++) {
    std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
    np=name_map.find(i_string);
    if (np==name_map.end())
      name_map[i_string]=1;
    else
      np->second++;
  }
  int procs_per_node=name_map.begin()->second;

  // Assign a unique id to each node
  int split_num=0, split_id=0;
  for (np=name_map.begin(); np!=name_map.end(); ++np) {
    if (np->first==node_string)
      split_id=split_num;
    split_num++;
  }

  // Set up a per node communicator and find rank within
  MPI_Comm node_comm;
  MPI_Comm_split(MPI_COMM_WORLD, split_id, 0, &node_comm);
  int node_rank;
  MPI_Comm_rank(node_comm,&node_rank);

  // set the device ID
  _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                       (last_gpu-first_gpu+1)));
  int my_gpu=node_rank/_procs_per_gpu;

  // Set up a per device communicator
  MPI_Comm_split(node_comm,my_gpu,0,&gpu_comm);
  MPI_Comm_rank(gpu_comm,&_gpu_rank);

  gpu=new UCL_Device();
  if (my_gpu>=gpu->num_devices())
    return false;

  gpu->set(my_gpu);
  return true;
}

template <class numtyp, class acctyp>
bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
                          const int host_nlocal, const int nall,
                          const int maxspecial, const bool gpu_nbor,
                          const int gpu_host, const int max_nbors,
                          const double cell_size, const bool pre_cut) {
  if (!_device_init)
    return false;
  if (_init_count==0) {
    // Initialize atom and nbor data
    if (!atom.init(nlocal,nall,charge,rot,*gpu,gpu_nbor,
                   gpu_nbor && maxspecial>0))
      return false;
    if (!nbor.init(nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor,
                   gpu_host,pre_cut))
      return false;
    nbor.cell_size(cell_size);
  } else {
    if (cell_size>nbor.cell_size())
      nbor.cell_size(cell_size);
  }
  _init_count++;
  return true;
}
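/* ----------------------------------------------------------------------
   Illustrative sketch (not part of the original file): the rank-to-GPU
   assignment computed by init_device() above. The function name is
   hypothetical; the arithmetic mirrors the member code. Compiled out.
------------------------------------------------------------------------- */
#if 0
static int sketch_rank_to_device(const int node_rank,
                                 const int procs_per_node,
                                 const int first_gpu, const int last_gpu) {
  // Pack ceil(ranks/devices) node ranks onto each device, in rank order.
  // Example: 6 ranks sharing GPUs 0-1 gives procs_per_gpu=3, so node
  // ranks 0-2 drive device 0 and ranks 3-5 drive device 1.
  const int num_devices=last_gpu-first_gpu+1;
  const int procs_per_gpu=
    static_cast<int>(ceil(static_cast<double>(procs_per_node)/num_devices));
  return node_rank/procs_per_gpu;
}
#endif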
template <class numtyp, class acctyp>
void PairGPUDeviceT::init_message(FILE *screen, const char *name,
                                  const int first_gpu, const int last_gpu) {
  #ifdef USE_OPENCL
  std::string fs="";
  #else
  std::string fs=toa(gpu->free_gigabytes())+"/";
  #endif

  if (_world_me == 0 && screen) {
    fprintf(screen,"\n-------------------------------------");
    fprintf(screen,"-------------------------------------\n");
    fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
    fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu);
    fprintf(screen,"-------------------------------------");
    fprintf(screen,"-------------------------------------\n");

    for (int i=first_gpu; i<=last_gpu; i++) {
      std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
                        toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
                        " GHZ (";
      if (sizeof(PRECISION)==4) {
        if (sizeof(ACC_PRECISION)==4)
          sname+="Single Precision)";
        else
          sname+="Mixed Precision)";
      } else
        sname+="Double Precision)";
      fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
    }

    fprintf(screen,"-------------------------------------");
    fprintf(screen,"-------------------------------------\n\n");
  }
}

template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
                                  const double max_bytes, FILE *screen) {
  double single[5], times[5];

  single[0]=atom.transfer_time();
  single[1]=nbor.time_nbor.total_seconds();
  single[2]=nbor.time_kernel.total_seconds();
  single[3]=time_pair.total_seconds();
  single[4]=atom.cast_time();

  MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);

  double my_max_bytes=max_bytes;
  double mpi_max_bytes;
  MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
             MPI_COMM_WORLD);
  double max_mb=mpi_max_bytes/(1024.0*1024.0);

  if (world_me()==0)
    if (screen && times[3]>0.0) {
      fprintf(screen,"\n\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
      fprintf(screen,"      GPU Time Info (average): ");
      fprintf(screen,"\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");

      if (procs_per_gpu()==1) {
        fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_world_size);
        fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[4]/_world_size);
        fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_world_size);
        if (nbor.gpu_nbor())
          fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/_world_size);
        else
          fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_world_size);
        fprintf(screen,"Force calc:      %.4f s.\n",times[3]/_world_size);
      }
      fprintf(screen,"Average split:   %.4f.\n",avg_split);
      fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);

      fprintf(screen,"-------------------------------------");
      fprintf(screen,"--------------------------------\n\n");
    }
}

template <class numtyp, class acctyp>
void PairGPUDeviceT::clear() {
  if (_init_count>0) {
    _init_count--;
    if (_init_count==0) {
      atom.clear();
      nbor.clear();
    }
  }
}

template <class numtyp, class acctyp>
void PairGPUDeviceT::clear_device() {
  while (_init_count>0)
    clear();
  if (_device_init) {
    delete gpu;
    _device_init=false;
  }
}

template <class numtyp, class acctyp>
double PairGPUDeviceT::host_memory_usage() const {
  return atom.host_memory_usage()+nbor.host_memory_usage()+
         4*sizeof(numtyp)+sizeof(PairGPUDevice<numtyp,acctyp>);
}

template class PairGPUDevice<PRECISION,ACC_PRECISION>;
PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
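/* ----------------------------------------------------------------------
   C-style wrappers around the global pair_gpu_device instance defined
   above; plain (non-template) entry points for the host code.
------------------------------------------------------------------------- */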
bool lmp_init_device(const int first_gpu, const int last_gpu,
                     const int gpu_mode, const double particle_split) {
  return pair_gpu_device.init_device(first_gpu,last_gpu,gpu_mode,
                                     particle_split);
}

void lmp_clear_device() {
  pair_gpu_device.clear_device();
}

double lmp_gpu_forces(double **f, double **tor, double *eatom,
                      double **vatom, double *virial, double &ecoul) {
  if (pair_gpu_device.init_count()) {
    pair_gpu_device.stop_host_timer();
    pair_gpu_device.gpu->sync();
    double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
    pair_gpu_device.atom.get_answers(f,tor);
    return evdw;
  }
  return 0.0;
}
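/* ----------------------------------------------------------------------
   Usage sketch (illustrative, not part of the original file): one
   plausible call sequence for the wrappers above. The error() handler
   and the f/tor/eatom/vatom/virial arrays are hypothetical. Compiled out.
------------------------------------------------------------------------- */
#if 0
  // One-time setup: share GPUs 0-1 among the ranks on each node and run
  // the full force computation on the device (particle_split=1.0).
  if (!lmp_init_device(0,1,GPU_FORCE,1.0))
    error("no accelerator could be assigned to this MPI rank");

  // ...per-style initialization, then each timestep's force stage...
  double ecoul;
  double evdwl=lmp_gpu_forces(f,tor,eatom,vatom,virial,ecoul);

  // Final teardown before MPI_Finalize().
  lmp_clear_device();
#endif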