Override any OpenCL fast-math JIT settings for born/coul/wolf/gpu and born/coul/wolf/cs/gpu to resolve numerical deviations seen with some OpenCL implementations.
This commit is contained in:
@ -56,7 +56,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
|||||||
const int max_nbors, const int maxspecial,
|
const int max_nbors, const int maxspecial,
|
||||||
const double cell_size, const double gpu_split,
|
const double cell_size, const double gpu_split,
|
||||||
FILE *_screen, const void *pair_program,
|
FILE *_screen, const void *pair_program,
|
||||||
const char *k_name) {
|
const char *k_name, const int disable_fast_math) {
|
||||||
screen=_screen;
|
screen=_screen;
|
||||||
|
|
||||||
int gpu_nbor=0;
|
int gpu_nbor=0;
|
||||||
@ -83,7 +83,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
|||||||
|
|
||||||
_block_size=device->pair_block_size();
|
_block_size=device->pair_block_size();
|
||||||
_block_bio_size=device->block_bio_pair();
|
_block_bio_size=device->block_bio_pair();
|
||||||
compile_kernels(*ucl_device,pair_program,k_name);
|
compile_kernels(*ucl_device,pair_program,k_name,disable_fast_math);
|
||||||
|
|
||||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||||
nbor->packing(true);
|
nbor->packing(true);
|
||||||
@ -321,14 +321,20 @@ double BaseChargeT::host_memory_usage_atomic() const {
|
|||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||||
const char *kname) {
|
const char *kname,
|
||||||
|
const int disable_fast_math) {
|
||||||
if (_compiled)
|
if (_compiled)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
std::string s_fast=std::string(kname)+"_fast";
|
std::string s_fast=std::string(kname)+"_fast";
|
||||||
if (pair_program) delete pair_program;
|
if (pair_program) delete pair_program;
|
||||||
pair_program=new UCL_Program(dev);
|
pair_program=new UCL_Program(dev);
|
||||||
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
std::string device_compile_string;
|
||||||
|
if (disable_fast_math)
|
||||||
|
device_compile_string = device->compile_string_nofast();
|
||||||
|
else
|
||||||
|
device_compile_string = device->compile_string();
|
||||||
|
std::string oclstring = device_compile_string+" -DEVFLAG=1";
|
||||||
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||||
k_pair_fast.set_function(*pair_program,s_fast.c_str());
|
k_pair_fast.set_function(*pair_program,s_fast.c_str());
|
||||||
k_pair.set_function(*pair_program,kname);
|
k_pair.set_function(*pair_program,kname);
|
||||||
@ -336,7 +342,7 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
|||||||
q_tex.get_texture(*pair_program,"q_tex");
|
q_tex.get_texture(*pair_program,"q_tex");
|
||||||
|
|
||||||
#if defined(LAL_OCL_EV_JIT)
|
#if defined(LAL_OCL_EV_JIT)
|
||||||
oclstring = device->compile_string()+" -DEVFLAG=0";
|
oclstring = device_compile_string+" -DEVFLAG=0";
|
||||||
if (pair_program_noev) delete pair_program_noev;
|
if (pair_program_noev) delete pair_program_noev;
|
||||||
pair_program_noev=new UCL_Program(dev);
|
pair_program_noev=new UCL_Program(dev);
|
||||||
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||||
|
|||||||
@ -44,6 +44,7 @@ class BaseCharge {
|
|||||||
* \param cell_size cutoff + skin
|
* \param cell_size cutoff + skin
|
||||||
* \param gpu_split fraction of particles handled by device
|
* \param gpu_split fraction of particles handled by device
|
||||||
* \param k_name name for the kernel for force calculation
|
* \param k_name name for the kernel for force calculation
|
||||||
|
* \param disable_fast_math override any fast math opts for kernel JIT
|
||||||
*
|
*
|
||||||
* Returns:
|
* Returns:
|
||||||
* - 0 if successful
|
* - 0 if successful
|
||||||
@ -54,7 +55,8 @@ class BaseCharge {
|
|||||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||||
const int maxspecial, const double cell_size,
|
const int maxspecial, const double cell_size,
|
||||||
const double gpu_split, FILE *screen,
|
const double gpu_split, FILE *screen,
|
||||||
const void *pair_program, const char *k_name);
|
const void *pair_program, const char *k_name,
|
||||||
|
const int disable_fast_math = 0);
|
||||||
|
|
||||||
/// Estimate the overhead for GPU context changes and CPU driver
|
/// Estimate the overhead for GPU context changes and CPU driver
|
||||||
void estimate_gpu_overhead(const int add_kernels=0);
|
void estimate_gpu_overhead(const int add_kernels=0);
|
||||||
@ -198,7 +200,8 @@ class BaseCharge {
|
|||||||
double _gpu_overhead, _driver_overhead;
|
double _gpu_overhead, _driver_overhead;
|
||||||
UCL_D_Vec<int> *_nbor_data;
|
UCL_D_Vec<int> *_nbor_data;
|
||||||
|
|
||||||
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
|
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||||
|
const char *k, const int disable_fast_math);
|
||||||
|
|
||||||
virtual int loop(const int eflag, const int vflag) = 0;
|
virtual int loop(const int eflag, const int vflag) = 0;
|
||||||
};
|
};
|
||||||
|
|||||||
@ -57,7 +57,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
|
|||||||
const double alf, const double e_shift, const double f_shift) {
|
const double alf, const double e_shift, const double f_shift) {
|
||||||
int success;
|
int success;
|
||||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
_screen,born_coul_wolf,"k_born_coul_wolf");
|
_screen,born_coul_wolf,"k_born_coul_wolf",1);
|
||||||
if (success!=0)
|
if (success!=0)
|
||||||
return success;
|
return success;
|
||||||
|
|
||||||
|
|||||||
@ -42,7 +42,7 @@ int BornCoulWolfCST::init(const int ntypes, double **host_cutsq, double **host_r
|
|||||||
const double alf, const double e_shift, const double f_shift) {
|
const double alf, const double e_shift, const double f_shift) {
|
||||||
int success;
|
int success;
|
||||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
_screen,born_coul_wolf_cs,"k_born_coul_wolf_cs");
|
_screen,born_coul_wolf_cs,"k_born_coul_wolf_cs",1);
|
||||||
if (success!=0)
|
if (success!=0)
|
||||||
return success;
|
return success;
|
||||||
|
|
||||||
|
|||||||
@ -420,6 +420,16 @@ int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
std::string DeviceT::compile_string_nofast() {
|
||||||
|
std::string no_fast = _ocl_compile_string;
|
||||||
|
size_t p = no_fast.find("-cl-fast-relaxed-math ");
|
||||||
|
if (p != std::string::npos) no_fast.erase(p,22);
|
||||||
|
p = no_fast.find("-DFAST_MATH=");
|
||||||
|
if (p != std::string::npos) no_fast[p + 12]='0';
|
||||||
|
return no_fast;
|
||||||
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||||
const bool rot, const int nlocal,
|
const bool rot, const int nlocal,
|
||||||
|
|||||||
@ -312,6 +312,7 @@ class Device {
|
|||||||
}
|
}
|
||||||
|
|
||||||
inline std::string compile_string() { return _ocl_compile_string; }
|
inline std::string compile_string() { return _ocl_compile_string; }
|
||||||
|
std::string compile_string_nofast();
|
||||||
inline std::string ocl_config_name() { return _ocl_config_name; }
|
inline std::string ocl_config_name() { return _ocl_config_name; }
|
||||||
|
|
||||||
template <class t>
|
template <class t>
|
||||||
|
|||||||
Reference in New Issue
Block a user