/***************************************************************************
                                 device.cpp
                             -------------------
                            W. Michael Brown (ORNL)

  Class for management of the device where the computations are performed

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/

#include "lal_device.h"
#include "lal_precision.h"
#include <map>
#include <cmath>
#include <cctype>
#include <cstring>
#include <cstdlib>
#include <algorithm>
#if (LAL_USE_OMP == 1)
#include <omp.h>
#endif

#if defined(USE_OPENCL)
#include "device_cl.h"
#ifdef LAL_OCL_EXTRA_ARGS
#define LAL_DM_STRINGIFY(x) #x
#define LAL_PRE_STRINGIFY(x) LAL_DM_STRINGIFY(x)
#endif
#elif defined(USE_CUDART)
const char *device=0;
#else
#include "device_cubin.h"
#endif

namespace LAMMPS_AL {
#define DeviceT Device<numtyp, acctyp>

template <class numtyp, class acctyp>
DeviceT::Device() : _init_count(0), _device_init(false),
                    _gpu_mode(GPU_FORCE), _first_device(0),
                    _last_device(0), _platform_id(-1),
                    _compiled(false) {
}

template <class numtyp, class acctyp>
DeviceT::~Device() {
  clear_device();
}

template <class numtyp, class acctyp>
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                         const int first_gpu_id, const int gpu_mode,
                         const double p_split, const int nthreads,
                         const int t_per_atom, const double user_cell_size,
                         char *ocl_args, const int ocl_platform,
                         char *device_type_flags, const int block_pair) {
  _nthreads=nthreads;
#if (LAL_USE_OMP == 1)
  omp_set_num_threads(nthreads);
#endif
  _threads_per_atom=t_per_atom;
  _threads_per_charge=t_per_atom;
  _threads_per_three=t_per_atom;

  if (_device_init)
    return 0;
  _device_init=true;
  _comm_world=replica; //world;
  _comm_replica=replica;
  int ndevices=ngpu;
  _first_device=first_gpu_id;
  _gpu_mode=gpu_mode;
  _particle_split=p_split;
  _user_cell_size=user_cell_size;
  _block_pair=block_pair;

  // support selecting OpenCL platform id with "package platform" keyword
  if (ocl_platform >= 0) _platform_id = ocl_platform;

  gpu=new UCL_Device();

  // ---------------------- OpenCL Compiler Args -------------------------
  std::string extra_args="";
  if (ocl_args) extra_args+=":"+std::string(ocl_args);
  #ifdef LAL_OCL_EXTRA_ARGS
  extra_args+=":" LAL_PRE_STRINGIFY(LAL_OCL_EXTRA_ARGS);
  #endif
  for (int i=0; i<extra_args.length(); i++)
    if (extra_args[i]==':') extra_args[i]=' ';

  // Get the rank/size within the world
  MPI_Comm_rank(_comm_world,&_world_me);
  MPI_Comm_size(_comm_world,&_world_size);
  // Get the rank/size within the replica
  MPI_Comm_rank(_comm_replica,&_replica_me);
  MPI_Comm_size(_comm_replica,&_replica_size);

  // Get the names of all nodes
  int name_length;
  char node_name[MPI_MAX_PROCESSOR_NAME];
  char *node_names = new char[MPI_MAX_PROCESSOR_NAME*_world_size];
  MPI_Get_processor_name(node_name,&name_length);
  MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names[0],
                MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
  std::string node_string=std::string(node_name);

  // Get the number of procs per node
  std::map<std::string,int> name_map;
  std::map<std::string,int>::iterator np;
  for (int i=0; i<_world_size; i++) {
    std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
    np=name_map.find(i_string);
    if (np==name_map.end())
      name_map[i_string]=1;
    else
      np->second++;
  }
  int procs_per_node=name_map.begin()->second;
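  // Example: with 8 world ranks spread over hosts {a,a,a,a,b,b,b,b},
  // name_map becomes {a:4, b:4} and procs_per_node=4; each host is then
  // assigned its own node communicator below, with node ranks 0..3.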

  // Assign a unique id to each node
  int split_num=0, split_id=0;
  for (np=name_map.begin(); np!=name_map.end(); ++np) {
    if (np->first==node_string)
      split_id=split_num;
    split_num++;
  }
  delete[] node_names;

  // Set up a per node communicator and find rank within
  MPI_Comm node_comm;
  MPI_Comm_split(_comm_world, split_id, 0, &node_comm);
  int node_rank;
  MPI_Comm_rank(node_comm,&node_rank);

  // ------------------- Device selection parameters ----------------------
  if (ndevices > procs_per_node)
    ndevices = procs_per_node;

  // --------------------- OCL Platform Selection -----------------------
  // Setup OpenCL platform and parameters based on platform
  // and device type specifications
  std::string ocl_vstring="";
  if (device_type_flags != nullptr)
    ocl_vstring=device_type_flags;

  // Setup the OpenCL platform. If there are multiple platforms and no
  // user-specified platform, try to match a platform from the config
  // with any user-specified device type, giving preference to platforms
  // with GPUs. Under these conditions, priority goes to the platform
  // whose device has the highest compute unit count.
  int pres;
  enum UCL_DEVICE_TYPE type=UCL_GPU;
  #ifndef USE_OPENCL
  pres=gpu->set_platform(0);
  #else
  if (_platform_id>=0)
    pres=gpu->set_platform(_platform_id);
  else {
    std::string vendor="";
    if (device_type_flags!=nullptr) {
      if (ocl_vstring=="intelgpu")
        vendor="intel";
      else if (ocl_vstring=="intelcpu") {
        vendor="intel";
        type=UCL_CPU;
      } else if (ocl_vstring=="nvidiagpu")
        vendor="nvidia";
      else if (ocl_vstring=="amdgpu")
        vendor="amd";
      else if (ocl_vstring=="applegpu")
        vendor="apple";
    }
    pres=gpu->auto_set_platform(type,vendor,ndevices,_first_device);
  }
  #endif
  if (pres != UCL_SUCCESS)
    return -12;

  // ------------------------ Device Selection ---------------------------
  if (_first_device > -1 && _first_device >= gpu->num_devices())
    return -2;
  if (ndevices > gpu->num_devices())
    return -2;
  if (_first_device + ndevices > gpu->num_devices())
    return -2;
  if (gpu->num_devices()==0)
    return -2;

  // Fully specified deviceIDs
  if (_first_device > -1 && ndevices > 0)
    _last_device = _first_device + ndevices - 1;

  // Find deviceID with most CUs (priority given to the accelerator type)
  if (_first_device < 0) {
    int best_device = 0;
    int best_cus = gpu->cus(0);
    bool type_match = (gpu->device_type(0) == type);
    for (int i = 1; i < gpu->num_devices(); i++) {
      if (type_match==true && gpu->device_type(i)!=type) continue;
      if (type_match == false && gpu->device_type(i) == type) {
        type_match = true;
        best_cus = gpu->cus(i);
        best_device = i;
      }
      if (gpu->cus(i) > best_cus) {
        best_cus = gpu->cus(i);
        best_device = i;
      }
    }
    _first_device = _last_device = best_device;
    type = gpu->device_type(_first_device);

    if (ndevices > 0) {
      // Expand range to meet specified number of devices
      while (_last_device - _first_device < ndevices - 1) {
        if (_last_device + 1 == gpu->num_devices())
          _first_device--;
        else if (_first_device == 0)
          _last_device++;
        else {
          if (gpu->device_type(_last_device+1)==type &&
              gpu->device_type(_first_device-1)!=type)
            _last_device++;
          else if (gpu->device_type(_last_device+1)!=type &&
                   gpu->device_type(_first_device-1)==type)
            _first_device--;
          else if (gpu->cus(_last_device+1) > gpu->cus(_first_device-1))
            _last_device++;
          else
            _first_device--;
        }
      }
    }
  }

  // If ngpus not specified, expand range to include matching devices
  if (ndevices == 0) {
    for (int i = _first_device; i < gpu->num_devices(); i++) {
      if (gpu->device_type(i)==gpu->device_type(_first_device) &&
          gpu->cus(i)==gpu->cus(_first_device))
        _last_device = i;
      else
        break;
    }
    ndevices = _last_device - _first_device + 1;
    if (ndevices > procs_per_node) {
      ndevices = procs_per_node;
      _last_device=_first_device + ndevices - 1;
    }
  }

  // ------------------------ MPI Device ID Setup -----------------------
  // set the device ID
  _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                       ndevices));
  int my_gpu=node_rank/_procs_per_gpu+_first_device;
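  // Example: with procs_per_node=6 and ndevices=4, _procs_per_gpu=
  // ceil(6/4)=2, so node ranks 0-1 map to the first selected device,
  // 2-3 to the second, and 4-5 to the third; the fourth device stays
  // idle on that node.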

  // Time on the device only if 1 proc per gpu
  _time_device=true;
#if 0
  // XXX: the following setting triggers a memory leak with OpenCL and MPI.
  //      Setting _time_device=true for all processes doesn't seem to be a
  //      problem either (no segfault, no (large) memory leak), thus
  //      keeping this disabled for now. May need to review later.
  //      2018-07-23
  if (_procs_per_gpu>1) _time_device=false;
#endif

  // Set up a per device communicator
  MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
  MPI_Comm_rank(_comm_gpu,&_gpu_rank);

#if !defined(CUDA_PROXY) && !defined(CUDA_MPS_SUPPORT)
  if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
    return -7;
#endif

  // --------------- Device Configuration and Setup -------------------------
  if (gpu->set(my_gpu)!=UCL_SUCCESS)
    return -6;

#if !defined(USE_OPENCL) && !defined(USE_HIP)
  if (gpu->arch()<7.0) {
    gpu->push_command_queue();
    gpu->set_command_queue(1);
  }
#endif

  _long_range_precompute=0;

  // If OpenCL parameters were not specified by the user, try to auto
  // detect the best option from the platform config
#ifdef USE_OPENCL
  if (device_type_flags==nullptr) {
    std::string pname = gpu->platform_name();
    for (int i=0; i<pname.length(); i++)
      if (pname[i]>='a') pname[i]=toupper(pname[i]);
    if (pname.find("NVIDIA")!=std::string::npos)
      ocl_vstring="nvidiagpu";
    else if (pname.find("INTEL")!=std::string::npos) {
      if (gpu->device_type()==UCL_GPU)
        ocl_vstring="intelgpu";
      else if (gpu->device_type()==UCL_CPU)
        ocl_vstring="intelcpu";
    } else if (pname.find("AMD")!=std::string::npos) {
      if (gpu->device_type()==UCL_GPU)
        ocl_vstring="amdgpu";
    } else if (pname.find("APPLE")!=std::string::npos) {
      if (gpu->device_type()==UCL_GPU)
        ocl_vstring="applegpu";
    }
  }
#endif

  if (set_ocl_params(ocl_vstring, extra_args)!=0)
    return -11;

  int flag=0;
  for (int i=0; i<_procs_per_gpu; i++) {
    if (_gpu_rank==i)
      flag=compile_kernels();
    gpu_barrier();
  }

  // Setup auto bin size calculation for calls from atom::sort
  // - This is repeated in neighbor init with additional info
  if (_user_cell_size<0.0) {
    #ifndef LAL_USE_OLD_NEIGHBOR
    _neighbor_shared.setup_auto_cell_size(true,0,_simd_size);
    #else
    _neighbor_shared.setup_auto_cell_size(false,0,_simd_size);
    #endif
  } else
    _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,_simd_size);

  return flag;
}
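// The device configuration string parsed by set_ocl_params() below is a
// comma-separated list: a config name followed by 18 values,
//   name,CONFIG_ID,SIMD_SIZE,MEM_THREADS,SHUFFLE_AVAIL,FAST_MATH,
//   THREADS_PER_ATOM,THREADS_PER_CHARGE,THREADS_PER_THREE,BLOCK_PAIR,
//   BLOCK_BIO_PAIR,BLOCK_ELLIPSE,PPPM_BLOCK_1D,BLOCK_NBOR_BUILD,
//   BLOCK_CELL_2D,BLOCK_CELL_ID,MAX_SHARED_TYPES,MAX_BIO_SHARED_TYPES,
//   PPPM_MAX_SPLINE
// which are forwarded to the kernel compilation as -D defines of the
// same names.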
template <class numtyp, class acctyp>
int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) {
#ifdef USE_OPENCL
  #include "lal_pre_ocl_config.h"

  if (s_config=="" || s_config=="none")
    s_config="generic";

  // Match a named configuration from the pre-built OpenCL config table
  int config_index=-1;
  for (int i=0; i<nconfigs; i++)
    if (s_config==std::string(ocl_config_names[i]))
      config_index=i;
  if (config_index != -1)
    s_config=ocl_config_strings[config_index];

  // Tokenize the comma-separated parameter list
  _ocl_config_name="CUSTOM";
  int token_count=0;
  std::string params[18];
  char ocl_config[2048];
  strncpy(ocl_config,s_config.c_str(),2047);
  char *pch = strtok(ocl_config,",");
  _ocl_config_name=pch;
  pch = strtok(nullptr,",");
  if (pch == nullptr) return -11;
  while (pch != nullptr) {
    if (token_count==18)
      return -11;
    params[token_count]=pch;
    token_count++;
    pch = strtok(nullptr,",");
  }

  _ocl_compile_string="-cl-mad-enable ";
  if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
  _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
    std::string(OCL_PRECISION_COMPILE);
  if (gpu->has_subgroup_support())
    _ocl_compile_string+=" -DUSE_OPENCL_SUBGROUPS";
  #ifdef LAL_USE_OLD_NEIGHBOR
  _ocl_compile_string+=" -DLAL_USE_OLD_NEIGHBOR";
  #endif

  _ocl_compile_string += " -DCONFIG_ID="+params[0]+
                         " -DSIMD_SIZE="+params[1]+
                         " -DMEM_THREADS="+params[2];
  if (gpu->has_shuffle_support()==false)
    _ocl_compile_string+=" -DSHUFFLE_AVAIL=0";
  else
    _ocl_compile_string+=" -DSHUFFLE_AVAIL="+params[3];
  _ocl_compile_string += " -DFAST_MATH="+params[4]+
                         " -DTHREADS_PER_ATOM="+params[5]+
                         " -DTHREADS_PER_CHARGE="+params[6]+
                         " -DTHREADS_PER_THREE="+params[7]+
                         " -DBLOCK_PAIR="+params[8]+
                         " -DBLOCK_BIO_PAIR="+params[9]+
                         " -DBLOCK_ELLIPSE="+params[10]+
                         " -DPPPM_BLOCK_1D="+params[11]+
                         " -DBLOCK_NBOR_BUILD="+params[12]+
                         " -DBLOCK_CELL_2D="+params[13]+
                         " -DBLOCK_CELL_ID="+params[14]+
                         " -DMAX_SHARED_TYPES="+params[15]+
                         " -DMAX_BIO_SHARED_TYPES="+params[16]+
                         " -DPPPM_MAX_SPLINE="+params[17];
  _ocl_compile_string += extra_args;
#endif
  return 0;
}

template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
                  const bool rot, const int nlocal, const int nall,
                  const int maxspecial, const bool vel) {
  if (!_device_init)
    return -1;
  if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
    return -5;

  // Counts of data transfers for timing overhead estimates
  _data_in_estimate=0;
  _data_out_estimate=1;

  // Initial number of local particles
  int ef_nlocal=nlocal;
  if (_particle_split<1.0 && _particle_split>0.0)
    ef_nlocal=static_cast<int>(_particle_split*nlocal);

  int gpu_nbor=0;
  if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH)
    gpu_nbor=1;
  else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
    gpu_nbor=2;
  #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
  if (gpu_nbor==1)
    gpu_nbor=2;
  #endif
  #ifndef LAL_USE_OLD_NEIGHBOR
  if (gpu_nbor==1)
    gpu_nbor=2;
  #endif

  if (_init_count==0) {
    // Initialize atom and nbor data
    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,
                   gpu_nbor>0 && maxspecial>0,vel))
      return -3;

    _data_in_estimate++;
    if (charge)
      _data_in_estimate++;
    if (rot)
      _data_in_estimate++;
    if (vel)
      _data_in_estimate++;
  } else {
    if (atom.charge()==false && charge)
      _data_in_estimate++;
    if (atom.quaternion()==false && rot)
      _data_in_estimate++;
    if (atom.velocity()==false && vel)
      _data_in_estimate++;
    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
      return -3;
  }

  if (!ans.init(ef_nlocal,charge,rot,*gpu))
    return -3;

  _init_count++;
  return 0;
}

template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
                  const int nall) {
  if (!_device_init)
    return -1;
  if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
    return -5;

  if (_init_count==0) {
    // Initialize atom and nbor data
    if (!atom.init(nall,true,false,*gpu,false,false))
      return -3;
  } else
    if (!atom.add_fields(true,false,false,false))
      return -3;

  if (!ans.init(nlocal,true,false,*gpu))
    return -3;

  _init_count++;
  return 0;
}

template <class numtyp, class acctyp>
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
                       const int host_nlocal, const int nall,
                       const int maxspecial, const int gpu_host,
                       const int max_nbors, const double cutoff,
                       const bool pre_cut, const int threads_per_atom,
                       const bool ilist_map) {
  int ef_nlocal=nlocal;
  if (_particle_split<1.0 && _particle_split>0.0)
    ef_nlocal=static_cast<int>(_particle_split*nlocal);

  int gpu_nbor=0;
  if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH)
    gpu_nbor=1;
  else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
    gpu_nbor=2;
  #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
  if (gpu_nbor==1)
    gpu_nbor=2;
  #endif
  #ifndef LAL_USE_OLD_NEIGHBOR
  if (gpu_nbor==1)
    gpu_nbor=2;
  #endif

  if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                  *gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d,
                  _block_cell_id,_block_nbor_build,threads_per_atom,
                  _simd_size,_time_device,compile_string(),ilist_map))
    return -3;
  if (_user_cell_size<0.0) {
    #ifndef LAL_USE_OLD_NEIGHBOR
    _neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
    #else
    _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
    #endif
  } else
    _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,
                                          nbor->simd_size());
  nbor->set_cutoff(cutoff);

  return 0;
}
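// Note on the gpu_nbor values computed in init() and init_nbor() above:
// 0 = neighbor lists built on the host, 1 = full neighbor list build on
// the device, 2 = hybrid build (cell binning on the host, list build on
// the device). When CUDPP/HIP device sorting is unavailable, or when the
// current (non-LAL_USE_OLD_NEIGHBOR) neighbor code is used, a requested
// full device build (1) is demoted to the hybrid path (2).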
template <class numtyp, class acctyp>
void DeviceT::set_single_precompute
                       (PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) {
  _long_range_precompute=1;
  pppm_single=pppm;
}

template <class numtyp, class acctyp>
void DeviceT::set_double_precompute
                       (PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm) {
  _long_range_precompute=2;
  pppm_double=pppm;
}

template <class numtyp, class acctyp>
void DeviceT::init_message(FILE *screen, const char *name,
                           const int first_gpu, const int last_gpu) {
  #if defined(USE_OPENCL)
  std::string fs="";
  #elif defined(USE_CUDART)
  std::string fs="";
  #else
  std::string fs=toa(gpu->free_gigabytes())+"/";
  #endif

  if (_replica_me == 0 && screen) {
    fprintf(screen,"\n-------------------------------------");
    fprintf(screen,"-------------------------------------\n");
    fprintf(screen,"- Using acceleration for %s:\n",name);
    fprintf(screen,"-  with %d proc(s) per device.\n",_procs_per_gpu);
#if (LAL_USE_OMP == 1)
    fprintf(screen,"-  with %d thread(s) per proc.\n",_nthreads);
#endif
#ifdef USE_OPENCL
    fprintf(screen,"-  with OpenCL Parameters for: %s (%d)\n",
            _ocl_config_name.c_str(),_config_id);
#endif
    if (shuffle_avail())
      fprintf(screen,"-  Horizontal vector operations: ENABLED\n");
    else
      fprintf(screen,"-  Horizontal vector operations: DISABLED\n");
    if (gpu->shared_memory(first_gpu))
      fprintf(screen,"-  Shared memory system: Yes\n");
    else
      fprintf(screen,"-  Shared memory system: No\n");
    fprintf(screen,"-------------------------------------");
    fprintf(screen,"-------------------------------------\n");

    int last=last_gpu+1;
    if (last>gpu->num_devices())
      last=gpu->num_devices();
    for (int i=first_gpu; i<last; i++) {
      std::string sname;
      if (i==first_gpu)
        sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+
              toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
              " GHZ (";
      else
        sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+
              toa(gpu->clock_rate(i))+" GHZ (";
      if (sizeof(PRECISION)==4) {
        if (sizeof(ACC_PRECISION)==4)
          sname+="Single Precision)";
        else
          sname+="Mixed Precision)";
      } else
        sname+="Double Precision)";
      fprintf(screen,"Device %d: %s\n",i,sname.c_str());
    }
    fprintf(screen,"-------------------------------------");
    fprintf(screen,"-------------------------------------\n\n");
  }
}

template <class numtyp, class acctyp>
void DeviceT::estimate_gpu_overhead(const int kernel_calls,
                                    double &gpu_overhead,
                                    double &gpu_driver_overhead) {
  UCL_H_Vec<int> *host_data_in=nullptr, *host_data_out=nullptr;
  UCL_D_Vec<int> *dev_data_in=nullptr, *dev_data_out=nullptr,
                 *kernel_data=nullptr;
  UCL_Timer *timers_in=nullptr, *timers_out=nullptr, *timers_kernel=nullptr;
  UCL_Timer over_timer(*gpu);

  if (_data_in_estimate>0) {
    host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
    dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
    timers_in=new UCL_Timer[_data_in_estimate];
  }

  if (_data_out_estimate>0) {
    host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
    dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
    timers_out=new UCL_Timer[_data_out_estimate];
  }

  if (kernel_calls>0) {
    kernel_data=new UCL_D_Vec<int>[kernel_calls];
    timers_kernel=new UCL_Timer[kernel_calls];
  }

  for (int i=0; i<_data_in_estimate; i++) {
    host_data_in[i].alloc(1,*gpu);
    dev_data_in[i].alloc(1,*gpu);
    timers_in[i].init(*gpu);
  }

  for (int i=0; i<_data_out_estimate; i++) {
    host_data_out[i].alloc(1,*gpu);
    dev_data_out[i].alloc(1,*gpu);
    timers_out[i].init(*gpu);
  }

  for (int i=0; i<kernel_calls; i++) {
    kernel_data[i].alloc(1,*gpu);
    timers_kernel[i].init(*gpu);
  }

  gpu_overhead=0.0;
  gpu_driver_overhead=0.0;

  // Time 11 rounds of minimal transfers and kernel launches; the first
  // round serves as a warm-up and is not accumulated
  for (int z=0; z<11; z++) {
    gpu->sync();
    gpu_barrier();
    over_timer.start();
    gpu->sync();
    gpu_barrier();

    double driver_time=MPI_Wtime();
    for (int i=0; i<_data_in_estimate; i++) {
      timers_in[i].start();
      ucl_copy(dev_data_in[i],host_data_in[i],true);
      timers_in[i].stop();
    }

    const int numel=1;
    for (int i=0; i<kernel_calls; i++) {
      timers_kernel[i].start();
      zero(kernel_data[i],numel);
      timers_kernel[i].stop();
    }

    for (int i=0; i<_data_out_estimate; i++) {
      timers_out[i].start();
      ucl_copy(host_data_out[i],dev_data_out[i],true);
      timers_out[i].stop();
    }
    over_timer.stop();

    double time=over_timer.seconds();
    driver_time=MPI_Wtime()-driver_time;

    if (time_device()) {
      for (int i=0; i<_data_in_estimate; i++)
        timers_in[i].add_to_total();
      for (int i=0; i<kernel_calls; i++)
        timers_kernel[i].add_to_total();
      for (int i=0; i<_data_out_estimate; i++)
        timers_out[i].add_to_total();
    }

    double mpi_time, mpi_driver_time;
    MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
    MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,
                  gpu_comm());
    if (z>0) {
      gpu_overhead+=mpi_time;
      gpu_driver_overhead+=mpi_driver_time;
    }
  }
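  // Only the last 10 of the 11 passes were accumulated above, so dividing
  // by 10 yields the average per-call overhead estimates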
  gpu_overhead/=10.0;
  gpu_driver_overhead/=10.0;

  if (_data_in_estimate>0) {
    delete [] host_data_in;
    delete [] dev_data_in;
    delete [] timers_in;
  }
  if (_data_out_estimate>0) {
    delete [] host_data_out;
    delete [] dev_data_out;
    delete [] timers_out;
  }
  if (kernel_calls>0) {
    delete [] kernel_data;
    delete [] timers_kernel;
  }
}

template <class numtyp, class acctyp>
void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
                           Neighbor &nbor, const double avg_split,
                           const double max_bytes, const double gpu_overhead,
                           const double driver_overhead,
                           const int threads_per_atom, FILE *screen) {
  double single[9], times[9];
  int post_final=0;

  single[0]=atom.transfer_time()+ans.transfer_time();
  single[1]=nbor.time_nbor.total_seconds()+nbor.time_hybrid1.total_seconds()+
            nbor.time_hybrid2.total_seconds();
  single[2]=nbor.time_kernel.total_seconds();
  single[3]=time_pair.total_seconds();
  single[4]=atom.cast_time()+ans.cast_time();
  single[5]=gpu_overhead;
  single[6]=driver_overhead;
  single[7]=ans.cpu_idle_time();
  single[8]=nbor.bin_time();

  // Skip the report if MPI has already been shut down
  MPI_Finalized(&post_final);
  if (post_final) return;

  MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,_comm_replica);

  double my_max_bytes=max_bytes+atom.max_gpu_bytes();
  double mpi_max_bytes;
  MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
             _comm_replica);
  double max_mb=mpi_max_bytes/(1024.0*1024.0);

#ifdef USE_OPENCL
  // Workaround for timing issue on Intel OpenCL
  if (times[3] > 80e6) times[3]=0.0;
#endif

  if (replica_me()==0)
    if (screen && times[6]>0.0) {
      fprintf(screen,"\n\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
      fprintf(screen,"      Device Time Info (average): ");
      fprintf(screen,"\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");

      if (time_device() && times[3]>0) {
        fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
        fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_replica_size);
        if (nbor.gpu_nbor()>0)
          fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/_replica_size);
        else
          fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
        fprintf(screen,"Force calc:      %.4f s.\n",times[3]/_replica_size);
      }
      if (times[5]>0)
        fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
      fprintf(screen,"Average split:   %.4f.\n",avg_split);
      fprintf(screen,"Threads / atom:  %d.\n",threads_per_atom);
      fprintf(screen,"Vector width:    %d.\n",simd_size());
      fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
      if (nbor.gpu_nbor()==2)
        fprintf(screen,"CPU Neighbor:    %.4f s.\n",times[8]/_replica_size);
      fprintf(screen,"CPU Cast/Pack:   %.4f s.\n",times[4]/_replica_size);
      fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
      fprintf(screen,"CPU Idle_Time:   %.4f s.\n",times[7]/_replica_size);
      fprintf(screen,"-------------------------------------");
      fprintf(screen,"--------------------------------\n\n");
    }
}
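// Timing report for long-range (PPPM) runs; mirrors output_times(), with
// device slots for the charge map, charge spreading (rho) and force
// interpolation kernels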
fprintf(screen,"Total interp: %.4f s.\n", (times[1]+times[4])/_replica_size); fprintf(screen,"Force copy: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Total: %.4f s.\n", (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ _replica_size); } fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Data Cast: %.4f s.\n",times[8]/_replica_size); fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); } } template void DeviceT::clear() { if (_init_count>0) { _long_range_precompute=0; _init_count--; if (_init_count==0) { atom.clear(); _neighbor_shared.clear(); } } } template void DeviceT::clear_device() { while (_init_count>0) clear(); if (_compiled) { k_zero.clear(); k_info.clear(); delete dev_program; _compiled=false; } if (_device_init) { delete gpu; _device_init=false; } } template int DeviceT::compile_kernels() { int flag=0; if (_compiled) return flag; dev_program=new UCL_Program(*gpu); int success=dev_program->load_string(device,compile_string().c_str(), nullptr,stderr); if (success!=UCL_SUCCESS) return -6; k_zero.set_function(*dev_program,"kernel_zero"); k_info.set_function(*dev_program,"kernel_info"); _compiled=true; UCL_Vector gpu_lib_data(19,*gpu,UCL_NOT_PINNED); k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); _ptx_arch=static_cast(gpu_lib_data[0])/100.0; #if !(defined(USE_OPENCL) || defined(USE_HIP)) if (_ptx_arch>gpu->arch() || floor(_ptx_arch)arch())) return -4; #endif _config_id=gpu_lib_data[1]; if (sizeof(numtyp)==sizeof(float)) _simd_size=std::max(gpu_lib_data[2],gpu->preferred_fp32_width()); else _simd_size=std::max(gpu_lib_data[2],gpu->preferred_fp64_width()); _num_mem_threads=gpu_lib_data[3]; _shuffle_avail=gpu_lib_data[4]; _fast_math=gpu_lib_data[5]; if (_threads_per_atom<1) _threads_per_atom=gpu_lib_data[6]; if (_threads_per_charge<1) _threads_per_charge=gpu_lib_data[7]; if (_threads_per_three<1) _threads_per_three=gpu_lib_data[8]; if (_block_pair == -1) { _block_pair=gpu_lib_data[9]; _block_bio_pair=gpu_lib_data[10]; _block_ellipse=gpu_lib_data[11]; } else { _block_bio_pair=_block_pair; _block_ellipse=_block_pair; } _pppm_block=gpu_lib_data[12]; _block_nbor_build=gpu_lib_data[13]; _block_cell_2d=gpu_lib_data[14]; _block_cell_id=gpu_lib_data[15]; _max_shared_types=gpu_lib_data[16]; _max_bio_shared_types=gpu_lib_data[17]; _pppm_max_spline=gpu_lib_data[18]; if (static_cast(_block_pair)>gpu->group_size_dim(0) || static_cast(_block_bio_pair)>gpu->group_size_dim(0) || static_cast(_block_ellipse)>gpu->group_size_dim(0) || static_cast(_pppm_block)>gpu->group_size_dim(0) || static_cast(_block_nbor_build)>gpu->group_size_dim(0) || static_cast(_block_cell_2d)>gpu->group_size_dim(0) || static_cast(_block_cell_2d)>gpu->group_size_dim(1) || static_cast(_block_cell_id)>gpu->group_size_dim(0) || static_cast(_max_shared_types*_max_shared_types* sizeof(numtyp)*17 > gpu->slm_size()) || static_cast(_max_bio_shared_types*2*sizeof(numtyp) > gpu->slm_size())) return -13; if (_block_pair % _simd_size != 0 || _block_bio_pair % _simd_size != 0 || _block_ellipse % _simd_size != 0 || _pppm_block % _simd_size != 0 || _block_nbor_build % _simd_size != 0 || _block_pair < _max_shared_types * _max_shared_types || _block_bio_pair * 2 < _max_bio_shared_types || _pppm_block < _pppm_max_spline * _pppm_max_spline) return -11; if (_threads_per_atom>_simd_size) 
  if (_threads_per_atom>_simd_size)
    _threads_per_atom=_simd_size;
  if (_simd_size%_threads_per_atom!=0)
    _threads_per_atom=1;
  if (_threads_per_atom & (_threads_per_atom - 1))
    _threads_per_atom=1;
  if (_threads_per_charge>_simd_size)
    _threads_per_charge=_simd_size;
  if (_simd_size%_threads_per_charge!=0)
    _threads_per_charge=1;
  if (_threads_per_charge & (_threads_per_charge - 1))
    _threads_per_charge=1;
  if (_threads_per_three>_simd_size)
    _threads_per_three=_simd_size;
  if (_simd_size%_threads_per_three!=0)
    _threads_per_three=1;
  if (_threads_per_three & (_threads_per_three - 1))
    _threads_per_three=1;

  return flag;
}

template <class numtyp, class acctyp>
double DeviceT::host_memory_usage() const {
  return atom.host_memory_usage()+4*sizeof(numtyp)+
         sizeof(Device<numtyp,acctyp>);
}

template class Device<PRECISION,ACC_PRECISION>;
Device<PRECISION,ACC_PRECISION> global_device;
}

using namespace LAMMPS_AL;

int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                    const int first_gpu_id, const int gpu_mode,
                    const double particle_split, const int nthreads,
                    const int t_per_atom, const double user_cell_size,
                    char *opencl_config, const int ocl_platform,
                    char *device_type_flags, const int block_pair) {
  return global_device.init_device(world,replica,ngpu,first_gpu_id,gpu_mode,
                                   particle_split,nthreads,t_per_atom,
                                   user_cell_size,opencl_config,ocl_platform,
                                   device_type_flags,block_pair);
}

void lmp_clear_device() {
  global_device.clear_device();
}

double lmp_gpu_forces(double **f, double **tor, double *eatom,
                      double **vatom, double *virial, double &ecoul,
                      int &error_flag) {
  return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul,error_flag);
}

double lmp_gpu_update_bin_size(const double subx, const double suby,
                               const double subz, const int nlocal,
                               const double cut) {
  return global_device._neighbor_shared.update_cell_size(subx,suby,subz,
                                                         nlocal,cut);
}

bool lmp_gpu_config(const std::string &category, const std::string &setting) {
  if (category == "api") {
#if defined(USE_OPENCL)
    return setting == "opencl";
#elif defined(USE_HIP)
    return setting == "hip";
#elif defined(USE_CUDA) || defined(USE_CUDART)
    return setting == "cuda";
#endif
    return false;
  }
  if (category == "precision") {
    if (setting == "single") {
#if defined(_SINGLE_SINGLE)
      return true;
#else
      return false;
#endif
    } else if (setting == "mixed") {
#if defined(_SINGLE_DOUBLE)
      return true;
#else
      return false;
#endif
    } else if (setting == "double") {
#if defined(_DOUBLE_DOUBLE)
      return true;
#else
      return false;
#endif
    } else return false;
  }
  return false;
}
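// Usage example (illustrative): host code can probe the build at run time,
// e.g.
//   bool ok = lmp_gpu_config("api","opencl") &&
//             lmp_gpu_config("precision","mixed");
// which holds only for an OpenCL build compiled with mixed precision
// (single-precision forces with double-precision accumulators,
// i.e. -D_SINGLE_DOUBLE).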