diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index 9045420425..84fbddd4e9 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -56,7 +56,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name) { + const char *k_name, const int disable_fast_math) { screen=_screen; int gpu_nbor=0; @@ -83,7 +83,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name,disable_fast_math); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -321,14 +321,20 @@ double BaseChargeT::host_memory_usage_atomic() const { template void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { + const char *kname, + const int disable_fast_math) { if (_compiled) return; std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + std::string device_compile_string; + if (disable_fast_math) + device_compile_string = device->compile_string_nofast(); + else + device_compile_string = device->compile_string(); + std::string oclstring = device_compile_string+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); @@ -336,7 +342,7 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, q_tex.get_texture(*pair_program,"q_tex"); #if defined(LAL_OCL_EV_JIT) - oclstring = device->compile_string()+" -DEVFLAG=0"; + oclstring = device_compile_string+" -DEVFLAG=0"; if (pair_program_noev) delete 
pair_program_noev; pair_program_noev=new UCL_Program(dev); pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index 6b8761092a..307c5c079f 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -44,6 +44,7 @@ class BaseCharge { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation + * \param disable_fast_math override any fast math opts for kernel JIT * * Returns: * - 0 if successful @@ -54,7 +55,8 @@ class BaseCharge { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const void *pair_program, const char *k_name, + const int disable_fast_math = 0); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -198,7 +200,8 @@ class BaseCharge { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *k, const int disable_fast_math); virtual int loop(const int eflag, const int vflag) = 0; }; diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index e6caebbab8..9aac866353 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -57,7 +57,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho const double alf, const double e_shift, const double f_shift) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,born_coul_wolf,"k_born_coul_wolf"); + _screen,born_coul_wolf,"k_born_coul_wolf",1); if (success!=0) return success; diff --git 
a/lib/gpu/lal_born_coul_wolf_cs.cpp b/lib/gpu/lal_born_coul_wolf_cs.cpp
index 8deceeb1f4..abd4da439a 100644
--- a/lib/gpu/lal_born_coul_wolf_cs.cpp
+++ b/lib/gpu/lal_born_coul_wolf_cs.cpp
@@ -42,7 +42,7 @@ int BornCoulWolfCST::init(const int ntypes, double **host_cutsq, double **host_r
                           const double alf, const double e_shift, const double f_shift) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,born_coul_wolf_cs,"k_born_coul_wolf_cs");
+                            _screen,born_coul_wolf_cs,"k_born_coul_wolf_cs",1);
   if (success!=0)
     return success;
 
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index e2b5b9cdb5..0ff7125089 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -420,6 +420,40 @@ int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) {
   return 0;
 }
 
+/// Return the OpenCL JIT option string with fast-math optimizations disabled
+/** Works on a copy of _ocl_compile_string; the stored string is not modified.
+  * - The first standalone "-cl-fast-relaxed-math" option is removed, whether
+  *   or not it is followed by a space (it may be the last option).
+  * - The value of the first "-DFAST_MATH=" define is forced to 0. **/
+template
+std::string DeviceT::compile_string_nofast() {
+  const std::string fast_flag = "-cl-fast-relaxed-math";
+  const std::string fast_define = "-DFAST_MATH=";
+  std::string no_fast = _ocl_compile_string;
+
+  size_t p = no_fast.find(fast_flag);
+  if (p != std::string::npos) {
+    const size_t end = p + fast_flag.size();
+    // Only erase a complete option, never the prefix of a longer token
+    if (end == no_fast.size())
+      no_fast.erase(p);
+    else if (no_fast[end] == ' ')
+      no_fast.erase(p, fast_flag.size() + 1);
+  }
+
+  p = no_fast.find(fast_define);
+  if (p != std::string::npos) {
+    const size_t v = p + fast_define.size();
+    // Writing to no_fast[size()] is undefined behavior, and overwriting a
+    // separating space would fuse two options: insert the digit instead
+    if (v == no_fast.size() || no_fast[v] == ' ')
+      no_fast.insert(v, 1, '0');
+    else
+      no_fast[v] = '0';
+  }
+  return no_fast;
+}
+
 template
 int DeviceT::init(Answer &ans, const bool charge,
                   const bool rot, const int nlocal,
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index 1db6ae3127..933a3508b5 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -312,6 +312,7 @@ class Device {
   }
 
   inline std::string compile_string() { return _ocl_compile_string; }
+  std::string compile_string_nofast();
   inline std::string ocl_config_name() { return _ocl_config_name; }
 
   template