diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 26fbecb4ba..73fbf15b1e 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -58,7 +58,8 @@ OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \ $(OBJ_DIR)/cmmc_msm_gpu_memory.o $(OBJ_DIR)/cmmc_msm_gpu.o \ $(CUDPP) -PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ +PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \ + $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \ $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \ $(OBJ_DIR)/pppm_gpu_kernel.ptx $(OBJ_DIR)/pppm_gpu_ptx.h \ @@ -125,8 +126,14 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H) $(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) - $(CUDR) -o $@ -c pair_gpu_device.cpp +$(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu + $(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu + +$(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h + +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h + $(CUDR) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR) $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp $(CUDR) -o $@ -c atomic_gpu_memory.cpp diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 0307d8c023..3e758bb756 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -46,8 +46,8 @@ OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \ $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o -KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \ - $(OBJ_DIR)/pppm_gpu_cl.h \ +KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \ + $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \ $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \ $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \ $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \ @@ -74,8 +74,11 @@ $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h $(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) - $(OCL) -o $@ -c pair_gpu_device.cpp +$(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h + +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h + $(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR) $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp $(OCL) -o $@ -c atomic_gpu_memory.cpp diff --git a/lib/gpu/lj_cut_gpu.cpp b/lib/gpu/lj_cut_gpu.cpp index 4082a94f64..5e20eacfe4 100644 --- a/lib/gpu/lj_cut_gpu.cpp +++ b/lib/gpu/lj_cut_gpu.cpp @@ -89,6 +89,8 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, } if (message) fprintf(screen,"\n"); + LJLMF.device->estimate_gpu_overhead(); + return true; } diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index 579ad9aea8..526c146f37 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -81,6 +81,12 @@ class PairGPUAtom { bool add_fields(const bool charge, const bool rot, const bool gpu_nbor, const bool bonds); + /// Returns true if GPU is using charges + bool charge() { return _charge; } + + /// Returns true if GPU is using quaternions + bool quat() { return _rot; } + /// Only free matrices of length inum or nall for resizing void clear_resize(); diff --git a/lib/gpu/pair_gpu_balance.h b/lib/gpu/pair_gpu_balance.h index abdebbb654..f5bc1ab261 100644 --- a/lib/gpu/pair_gpu_balance.h +++ b/lib/gpu/pair_gpu_balance.h @@ -53,7 +53,7 @@ class PairGPUBalance { if (gpu_split>0) host_nlocal=static_cast(ceil((1.0-gpu_split)*nlocal)); else - host_nlocal=static_cast(ceil(0.1*nlocal)); + host_nlocal=static_cast(ceil(0.05*nlocal)); } return host_nlocal; } @@ -130,7 +130,7 @@ void PairGPUBalanceT::init(PairGPUDevice *gpu, if (split<0.0) { _load_balance=true; - _desired_split=0.9; + _desired_split=0.90; } else { _load_balance=false; _desired_split=split; @@ -162,31 +162,27 @@ int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) { template void PairGPUBalanceT::balance(const double cpu_time) { if (_measure_this_step) { + _measure_this_step=false; + double gpu_time=_device_time.seconds(); + + double max_gpu_time; + MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX, + _device->gpu_comm()); + if (_inum_full==_inum) { _desired_split=1.0; return; } - _measure_this_step=false; - double gpu_time=_device_time.seconds(); + double cpu_time_per_atom=cpu_time/(_inum_full-_inum); + double cpu_other_time=_device->host_time()-cpu_time; + int host_inum=static_cast((max_gpu_time-cpu_other_time)/ + cpu_time_per_atom); - double cpu_gpu_time[3], max_times[3]; - cpu_gpu_time[0]=cpu_time/(_inum_full-_inum); - cpu_gpu_time[1]=gpu_time/_inum; - cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full; - - MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX, - _device->gpu_comm()); - double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]); - split*=_HD_BALANCE_GAP; - - if (split>1.0) - split=1.0; - if (_avg_count<10) - _desired_split=(_desired_split*_avg_count+split)/(_avg_count+1); - else - _desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+ - _HD_BALANCE_WEIGHT*split; + double split=static_cast(_inum_full-host_inum)/_inum_full; + _desired_split=split*_HD_BALANCE_GAP; + if (_desired_split<0.0) + _desired_split=0.0; if (!_gpu_nbor) { if (_desired_split<_max_split) @@ -194,6 +190,7 @@ void PairGPUBalanceT::balance(const double cpu_time) { else _actual_split=_max_split; } +//std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl; } _avg_split+=_desired_split; _avg_count++; diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index 87775291f5..bbfd9376c9 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -23,12 +23,19 @@ #include #endif +#ifdef USE_OPENCL +#include "pair_gpu_dev_cl.h" +#else +#include "pair_gpu_dev_ptx.h" +#endif + +#define BLOCK_1D 64 #define PairGPUDeviceT PairGPUDevice template PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false), _gpu_mode(GPU_FORCE), _first_device(0), - _last_device(0) { + _last_device(0), _compiled(false) { } template @@ -113,6 +120,11 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, return false; gpu->set(my_gpu); + + _block_size=BLOCK_1D; + if (static_cast(_block_size)>gpu->group_size()) + _block_size=gpu->group_size(); + return true; } @@ -139,6 +151,7 @@ bool PairGPUDeviceT::init(PairGPUAns &ans, const bool charge, // Initialize atom and nbor data if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0)) return false; + compile_kernels(); } else atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial); @@ -149,6 +162,7 @@ bool PairGPUDeviceT::init(PairGPUAns &ans, const bool charge, *gpu,gpu_nbor,gpu_host,pre_cut)) return false; nbor->cell_size(cell_size); + _gpu_overhead=0.0; _init_count++; return true; @@ -164,6 +178,7 @@ bool PairGPUDeviceT::init(PairGPUAns &ans, const int nlocal, // Initialize atom and nbor data if (!atom.init(nall,true,false,*gpu,false,false)) return false; + compile_kernels(); } else atom.add_fields(true,false,false,false); @@ -214,6 +229,71 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, } } +template +void PairGPUDeviceT::estimate_gpu_overhead() { + UCL_H_Vec h_sample_x(1,*gpu), h_sample_q(1,*gpu); + UCL_H_Vec h_sample_quat(1,*gpu), h_sample_ans(1,*gpu); + UCL_D_Vec d_sample_x(1,*gpu), d_sample_q(1,*gpu); + UCL_D_Vec d_sample_quat(1,*gpu), d_sample_ans(1,*gpu); + UCL_Timer x_timer(*gpu), q_timer(*gpu); + UCL_Timer quat_timer(*gpu), ans_timer(*gpu), kernel_timer(*gpu); + UCL_Timer over_timer(*gpu); + + h_sample_x[0]=1; + h_sample_q[0]=1; + h_sample_quat[0]=1; + + _gpu_overhead=0.0; + + for (int i=0; i<10; i++) { + gpu->sync(); + gpu_barrier(); + over_timer.start(); + gpu->sync(); + gpu_barrier(); +/* + x_timer.start(); + ucl_copy(d_sample_x,h_sample_x,true); + x_timer.stop(); + + if (atom.charge()) { + q_timer.start(); + ucl_copy(d_sample_q,h_sample_q,true); + q_timer.stop(); + } + + if (atom.quat()) { + quat_timer.start(); + ucl_copy(d_sample_quat,h_sample_quat,true); + quat_timer.stop(); + } +*/ + kernel_timer.start(); + zero(d_sample_ans,1); + kernel_timer.stop(); + + ans_timer.start(); + ucl_copy(h_sample_ans,d_sample_ans,true); + ans_timer.stop(); + + over_timer.stop(); +/* x_timer.add_to_total(); + if (atom.charge()) + q_timer.add_to_total(); + if (atom.quat()) + quat_timer.add_to_total(); +*/ kernel_timer.add_to_total(); + ans_timer.add_to_total(); + double time, mpi_time; + time = over_timer.seconds(); + MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm()); + _gpu_overhead+=mpi_time; + } + if (world_me()==0) + std::cout << "Estimated overhead per timestep: " << _gpu_overhead/10.0 + << " seconds.\n"; +} + template void PairGPUDeviceT::output_times(UCL_Timer &time_pair, PairGPUAns &ans, @@ -326,6 +406,11 @@ void PairGPUDeviceT::clear() { if (_init_count==0) { atom.clear(); _nbor_shared.clear(); + if (_compiled) { + k_zero.clear(); + delete dev_program; + _compiled=false; + } } } } @@ -340,6 +425,18 @@ void PairGPUDeviceT::clear_device() { } } +template +void PairGPUDeviceT::compile_kernels() { + if (_compiled) + return; + + std::string flags="-cl-mad-enable"; + dev_program=new UCL_Program(*gpu); + dev_program->load_string(pair_gpu_dev_kernel,flags.c_str()); + k_zero.set_function(*dev_program,"kernel_zero"); + _compiled=true; +} + template double PairGPUDeviceT::host_memory_usage() const { return atom.host_memory_usage()+4*sizeof(numtyp)+ diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index 9dbe2f50f6..7195d539d9 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -67,6 +67,9 @@ class PairGPUDevice { void init_message(FILE *screen, const char *name, const int first_gpu, const int last_gpu); + /// Esimate the overhead from GPU calls from multiple procs + void estimate_gpu_overhead(); + /// Output a message with timing information void output_times(UCL_Timer &time_pair, PairGPUAns &ans, PairGPUNbor &nbor, const double avg_split, @@ -157,6 +160,15 @@ class PairGPUDevice { /// Return the initialization count for the device inline int init_count() const { return _init_count; } + // -------------------- SHARED DEVICE ROUTINES -------------------- + // Perform asynchronous zero of integer array + void zero(UCL_D_Vec &mem, const int numel) { + int num_blocks=static_cast(ceil(static_cast(numel)/ + _block_size)); + k_zero.set_size(num_blocks,_block_size); + k_zero.run(&mem.begin(),&numel); + } + // -------------------------- DEVICE DATA ------------------------- /// Geryon Device @@ -184,6 +196,13 @@ class PairGPUDevice { int _gpu_mode, _first_device, _last_device, _nthreads; double _particle_split; double _cpu_full; + double _gpu_overhead; + + int _block_size; + UCL_Program *dev_program; + UCL_Kernel k_zero; + bool _compiled; + void compile_kernels(); template inline std::string toa(const t& in) {