Fixing race condition with OpenMP for GPU styles using torque (missed in regression tests due to the first fix) Documenting GPU package option for setting the number of threads (consistent with USER-INTEL and USER-OMP).
580 lines
20 KiB
C++
580 lines
20 KiB
C++
/***************************************************************************
|
|
base_ellipsoid.cpp
|
|
-------------------
|
|
W. Michael Brown (ORNL)
|
|
|
|
Base class for acceleration of ellipsoid potentials
|
|
|
|
__________________________________________________________________________
|
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
|
__________________________________________________________________________
|
|
|
|
begin : Thu May 5 2011
|
|
email : brownw@ornl.gov
|
|
***************************************************************************/
|
|
|
|
#include "lal_base_ellipsoid.h"
|
|
#include <cstdlib>
|
|
namespace LAMMPS_AL {
|
|
|
|
#if defined(USE_OPENCL)
|
|
#include "ellipsoid_nbor_cl.h"
|
|
#elif defined(USE_CUDART)
|
|
const char *ellipsoid_nbor=0;
|
|
#else
|
|
#include "ellipsoid_nbor_cubin.h"
|
|
#endif
|
|
|
|
#define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
|
|
extern Device<PRECISION,ACC_PRECISION> global_device;
|
|
|
|
template <class numtyp, class acctyp>
|
|
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0),
|
|
host_olist_size(0) {
|
|
device=&global_device;
|
|
ans=new Answer<numtyp,acctyp>();
|
|
nbor=new Neighbor();
|
|
nbor_program=nullptr;
|
|
ellipsoid_program=nullptr;
|
|
lj_program=nullptr;
|
|
ucl_device=nullptr;
|
|
#if defined(LAL_OCL_EV_JIT)
|
|
ellipsoid_program_noev=nullptr;
|
|
lj_program_noev=nullptr;
|
|
#endif
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
BaseEllipsoidT::~BaseEllipsoid() {
|
|
delete ans;
|
|
delete nbor;
|
|
k_nbor_fast.clear();
|
|
k_nbor.clear();
|
|
k_ellipsoid.clear();
|
|
k_ellipsoid_sphere.clear();
|
|
k_sphere_ellipsoid.clear();
|
|
k_lj_fast.clear();
|
|
k_lj.clear();
|
|
if (nbor_program) delete nbor_program;
|
|
if (ellipsoid_program) delete ellipsoid_program;
|
|
if (lj_program) delete lj_program;
|
|
#if defined(LAL_OCL_EV_JIT)
|
|
k_ellipsoid_noev.clear();
|
|
k_ellipsoid_sphere_noev.clear();
|
|
k_sphere_ellipsoid_noev.clear();
|
|
k_lj_fast.clear();
|
|
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
|
|
if (lj_program_noev) delete lj_program_noev;
|
|
#endif
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
int BaseEllipsoidT::bytes_per_atom(const int max_nbors) const {
|
|
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
|
|
nbor->bytes_per_atom(max_nbors);
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
int BaseEllipsoidT::init_base(const int nlocal, const int nall,
|
|
const int max_nbors, const int maxspecial,
|
|
const double cell_size, const double gpu_split,
|
|
FILE *_screen, const int ntypes, int **h_form,
|
|
const void *ellipsoid_program,
|
|
const void *lj_program, const char *k_name,
|
|
const bool ellip_sphere) {
|
|
screen=_screen;
|
|
_ellipsoid_sphere=ellip_sphere;
|
|
|
|
int gpu_nbor=0;
|
|
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
|
|
gpu_nbor=1;
|
|
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
|
gpu_nbor=2;
|
|
|
|
int _gpu_host=0;
|
|
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
|
if (host_nlocal>0)
|
|
_gpu_host=1;
|
|
|
|
_threads_per_atom=device->threads_per_atom();
|
|
|
|
int success=device->init(*ans,false,true,nlocal,nall,maxspecial);
|
|
if (success!=0)
|
|
return success;
|
|
|
|
if (ucl_device!=device->gpu) _compiled=false;
|
|
|
|
ucl_device=device->gpu;
|
|
atom=&device->atom;
|
|
|
|
_block_size=device->block_ellipse();
|
|
compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);
|
|
|
|
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
|
max_nbors,cell_size,true,1);
|
|
if (success!=0)
|
|
return success;
|
|
|
|
// Initialize host-device load balancer
|
|
hd_balancer.init(device,gpu_nbor,gpu_split);
|
|
|
|
// Initialize timers for the selected GPU
|
|
time_lj.init(*ucl_device);
|
|
time_nbor1.init(*ucl_device);
|
|
time_ellipsoid.init(*ucl_device);
|
|
time_nbor2.init(*ucl_device);
|
|
time_ellipsoid2.init(*ucl_device);
|
|
time_nbor3.init(*ucl_device);
|
|
time_ellipsoid3.init(*ucl_device);
|
|
time_lj.zero();
|
|
time_nbor1.zero();
|
|
time_ellipsoid.zero();
|
|
time_nbor2.zero();
|
|
time_ellipsoid2.zero();
|
|
time_nbor3.zero();
|
|
time_ellipsoid3.zero();
|
|
|
|
// See if we want fast GB-sphere or sphere-sphere calculations
|
|
_host_form=h_form;
|
|
_multiple_forms=false;
|
|
for (int i=1; i<ntypes; i++)
|
|
for (int j=i; j<ntypes; j++)
|
|
if (_host_form[i][j]!=ELLIPSE_ELLIPSE)
|
|
_multiple_forms=true;
|
|
if (_multiple_forms && host_nlocal>0)
|
|
return -8;
|
|
if (_multiple_forms && gpu_nbor!=0)
|
|
return -9;
|
|
|
|
if (_multiple_forms) {
|
|
ans->force.zero();
|
|
host_olist_size = nbor->max_atoms();
|
|
host_olist = new int[nbor->max_atoms()];
|
|
}
|
|
|
|
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
|
|
|
neigh_tex.bind_float(atom->x,4);
|
|
pos_tex.bind_float(atom->x,4);
|
|
quat_tex.bind_float(atom->quat,4);
|
|
lj_pos_tex.bind_float(atom->x,4);
|
|
lj_quat_tex.bind_float(atom->quat,4);
|
|
|
|
return 0;
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
void BaseEllipsoidT::estimate_gpu_overhead() {
|
|
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
void BaseEllipsoidT::clear_base() {
|
|
// Output any timing information
|
|
output_times();
|
|
if (host_olist_size) {
|
|
host_olist_size = 0;
|
|
delete []host_olist;
|
|
}
|
|
|
|
time_nbor1.clear();
|
|
time_ellipsoid.clear();
|
|
time_nbor2.clear();
|
|
time_ellipsoid2.clear();
|
|
time_nbor3.clear();
|
|
time_ellipsoid3.clear();
|
|
time_lj.clear();
|
|
hd_balancer.clear();
|
|
|
|
nbor->clear();
|
|
ans->clear();
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
void BaseEllipsoidT::output_times() {
|
|
// Output any timing information
|
|
acc_timers();
|
|
double single[10], times[10];
|
|
|
|
single[0]=atom->transfer_time()+ans->transfer_time();
|
|
single[1]=nbor->time_nbor.total_seconds()+nbor->time_hybrid1.total_seconds()+
|
|
nbor->time_hybrid2.total_seconds();
|
|
single[2]=time_nbor1.total_seconds()+time_nbor2.total_seconds()+
|
|
time_nbor3.total_seconds()+nbor->time_nbor.total_seconds();
|
|
single[3]=time_ellipsoid.total_seconds()+time_ellipsoid2.total_seconds()+
|
|
time_ellipsoid3.total_seconds();
|
|
if (_multiple_forms)
|
|
single[4]=time_lj.total_seconds();
|
|
else
|
|
single[4]=0;
|
|
single[5]=atom->cast_time()+ans->cast_time();
|
|
single[6]=_gpu_overhead;
|
|
single[7]=_driver_overhead;
|
|
single[8]=ans->cpu_idle_time();
|
|
single[9]=nbor->bin_time();
|
|
|
|
MPI_Reduce(single,times,10,MPI_DOUBLE,MPI_SUM,0,device->replica());
|
|
double avg_split=hd_balancer.all_avg_split();
|
|
|
|
_max_bytes+=atom->max_gpu_bytes();
|
|
double mpi_max_bytes;
|
|
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
|
|
device->replica());
|
|
double max_mb=mpi_max_bytes/(1024*1024);
|
|
|
|
#ifdef USE_OPENCL
|
|
// Workaround for timing issue on Intel OpenCL
|
|
if (times[3] > 80e6) times[3]=0.0;
|
|
#endif
|
|
|
|
if (device->replica_me()==0)
|
|
if (screen && times[7]>0.0) {
|
|
int replica_size=device->replica_size();
|
|
|
|
fprintf(screen,"\n\n-------------------------------------");
|
|
fprintf(screen,"--------------------------------\n");
|
|
fprintf(screen," Device Time Info (average): ");
|
|
fprintf(screen,"\n-------------------------------------");
|
|
fprintf(screen,"--------------------------------\n");
|
|
|
|
if (device->procs_per_gpu()==1 && times[3]>0) {
|
|
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
|
|
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
|
|
if (nbor->gpu_nbor()>0)
|
|
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
|
|
else
|
|
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
|
|
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
|
|
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
|
|
}
|
|
if (times[6]>0)
|
|
fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
|
|
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
|
fprintf(screen,"Lanes / atom: %d.\n",_threads_per_atom);
|
|
fprintf(screen,"Vector width: %d.\n", device->simd_size());
|
|
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
|
if (nbor->gpu_nbor()==2)
|
|
fprintf(screen,"CPU Neighbor: %.4f s.\n",times[9]/replica_size);
|
|
fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[5]/replica_size);
|
|
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
|
|
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
|
|
fprintf(screen,"-------------------------------------");
|
|
fprintf(screen,"--------------------------------\n\n");
|
|
}
|
|
_max_bytes=0.0;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Pack neighbors to limit thread divergence for lj-lj and ellipse
|
|
// ---------------------------------------------------------------------------
|
|
template<class numtyp, class acctyp>
|
|
void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
|
|
const int inum, const int form_low,
|
|
const int form_high, const bool shared_types,
|
|
int ntypes) {
|
|
int stride=nbor->nbor_pitch();
|
|
if (shared_types) {
|
|
k_nbor_fast.set_size(GX,BX);
|
|
k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start,
|
|
&inum, &nbor->dev_packed, &form_low, &form_high,
|
|
&_threads_per_atom);
|
|
} else {
|
|
k_nbor.set_size(GX,BX);
|
|
k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride,
|
|
&start, &inum, &nbor->dev_packed, &form_low, &form_high,
|
|
&_threads_per_atom);
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Copy neighbor list from host
|
|
// ---------------------------------------------------------------------------
|
|
template <class numtyp, class acctyp>
|
|
void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
|
|
const int osize, int *ilist,
|
|
int *numj, int *type, int **firstneigh,
|
|
bool &success) {
|
|
success=true;
|
|
|
|
int mn=nbor->max_nbor_loop(osize,numj,ilist);
|
|
resize_atom(nall,success);
|
|
resize_local(inum,0,mn,osize,success);
|
|
if (!success)
|
|
return;
|
|
|
|
if (_multiple_forms) {
|
|
int p=0;
|
|
for (int i=0; i<osize; i++) {
|
|
int itype=type[ilist[i]];
|
|
if (_host_form[itype][itype]==ELLIPSE_ELLIPSE) {
|
|
host_olist[p]=ilist[i];
|
|
p++;
|
|
}
|
|
}
|
|
_max_last_ellipse=p;
|
|
_last_ellipse=std::min(inum,_max_last_ellipse);
|
|
for (int i=0; i<osize; i++) {
|
|
int itype=type[ilist[i]];
|
|
if (_host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
|
|
host_olist[p]=ilist[i];
|
|
p++;
|
|
}
|
|
}
|
|
nbor->get_host(inum,host_olist,numj,firstneigh,block_size());
|
|
nbor->copy_unpacked(inum,mn);
|
|
return;
|
|
}
|
|
_last_ellipse=inum;
|
|
_max_last_ellipse=inum;
|
|
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
|
|
nbor->copy_unpacked(inum,mn);
|
|
|
|
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
|
if (bytes>_max_an_bytes)
|
|
_max_an_bytes=bytes;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Build neighbor list on device
|
|
// ---------------------------------------------------------------------------
|
|
template <class numtyp, class acctyp>
|
|
inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
|
|
const int nall, double **host_x,
|
|
int *host_type, double *sublo,
|
|
double *subhi, tagint *tag,
|
|
int **nspecial, tagint **special,
|
|
bool &success) {
|
|
success=true;
|
|
resize_atom(nall,success);
|
|
resize_local(inum,host_inum,nbor->max_nbors(),0,success);
|
|
if (!success)
|
|
return;
|
|
atom->cast_copy_x(host_x,host_type);
|
|
|
|
int mn;
|
|
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
|
|
tag, nspecial, special, success, mn, ans->error_flag);
|
|
nbor->copy_unpacked(inum,mn);
|
|
_last_ellipse=inum;
|
|
_max_last_ellipse=inum;
|
|
|
|
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
|
if (bytes>_max_an_bytes)
|
|
_max_an_bytes=bytes;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
|
// ---------------------------------------------------------------------------
|
|
template <class numtyp, class acctyp>
|
|
int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
|
|
const int nall, double **host_x, int *host_type,
|
|
int *ilist, int *numj, int **firstneigh,
|
|
const bool eflag_in, const bool vflag_in,
|
|
const bool eatom, const bool vatom,
|
|
int &host_start, const double cpu_time,
|
|
bool &success, double **host_quat) {
|
|
acc_timers();
|
|
int eflag, vflag;
|
|
if (eflag_in) eflag=2;
|
|
else eflag=0;
|
|
if (vflag_in) vflag=2;
|
|
else vflag=0;
|
|
|
|
set_kernel(eflag,vflag);
|
|
if (inum_full==0) {
|
|
host_start=0;
|
|
zero_timers();
|
|
return nullptr;
|
|
}
|
|
|
|
int ago=hd_balancer.ago_first(f_ago);
|
|
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
|
ans->inum(inum);
|
|
_last_ellipse=std::min(inum,_max_last_ellipse);
|
|
host_start=inum;
|
|
|
|
if (ago==0) {
|
|
reset_nbors(nall, inum, inum_full, ilist, numj, host_type, firstneigh,
|
|
success);
|
|
if (!success)
|
|
return nullptr;
|
|
}
|
|
int *list;
|
|
if (_multiple_forms)
|
|
list=host_olist;
|
|
else
|
|
list=ilist;
|
|
|
|
atom->cast_x_data(host_x,host_type);
|
|
atom->cast_quat_data(host_quat[0]);
|
|
hd_balancer.start_timer();
|
|
atom->add_x_data(host_x,host_type);
|
|
atom->add_quat_data();
|
|
|
|
loop(eflag,vflag);
|
|
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,list,inum);
|
|
device->add_ans_object(ans);
|
|
hd_balancer.stop_timer();
|
|
return list;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Reneighbor on GPU if necessary and then compute forces, virials, energies
|
|
// ---------------------------------------------------------------------------
|
|
template <class numtyp, class acctyp>
|
|
int** BaseEllipsoidT::compute(const int ago, const int inum_full,
|
|
const int nall, double **host_x, int *host_type,
|
|
double *sublo, double *subhi, tagint *tag,
|
|
int **nspecial, tagint **special,
|
|
const bool eflag_in, const bool vflag_in,
|
|
const bool eatom, const bool vatom,
|
|
int &host_start, int **ilist, int **jnum,
|
|
const double cpu_time, bool &success,
|
|
double **host_quat) {
|
|
acc_timers();
|
|
int eflag, vflag;
|
|
if (eflag_in) eflag=2;
|
|
else eflag=0;
|
|
if (vflag_in) vflag=2;
|
|
else vflag=0;
|
|
|
|
set_kernel(eflag,vflag);
|
|
if (inum_full==0) {
|
|
host_start=0;
|
|
zero_timers();
|
|
return nullptr;
|
|
}
|
|
|
|
hd_balancer.balance(cpu_time);
|
|
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
|
ans->inum(inum);
|
|
_last_ellipse=std::min(inum,_max_last_ellipse);
|
|
host_start=inum;
|
|
|
|
// Build neighbor list on GPU if necessary
|
|
if (ago==0) {
|
|
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
|
sublo, subhi, tag, nspecial, special, success);
|
|
if (!success)
|
|
return nullptr;
|
|
atom->cast_quat_data(host_quat[0]);
|
|
hd_balancer.start_timer();
|
|
} else {
|
|
atom->cast_x_data(host_x,host_type);
|
|
atom->cast_quat_data(host_quat[0]);
|
|
hd_balancer.start_timer();
|
|
atom->add_x_data(host_x,host_type);
|
|
}
|
|
|
|
atom->add_quat_data();
|
|
*ilist=nbor->host_ilist.begin();
|
|
*jnum=nbor->host_acc.begin();
|
|
|
|
loop(eflag,vflag);
|
|
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum);
|
|
device->add_ans_object(ans);
|
|
hd_balancer.stop_timer();
|
|
|
|
return nbor->host_jlist.begin()-host_start;
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
double BaseEllipsoidT::host_memory_usage_base() const {
|
|
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
|
4*sizeof(numtyp)+sizeof(BaseEllipsoid<numtyp,acctyp>);
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
|
|
const void *ellipsoid_string,
|
|
const void *lj_string,
|
|
const char *kname, const bool e_s) {
|
|
if (_compiled)
|
|
return;
|
|
|
|
std::string kns=kname;
|
|
std::string s_sphere_ellipsoid=kns+"_sphere_ellipsoid";
|
|
std::string s_ellipsoid_sphere=kns+"_ellipsoid_sphere";
|
|
std::string s_lj=kns+"_lj";
|
|
std::string s_lj_fast=kns+"_lj_fast";
|
|
|
|
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
|
|
|
if (nbor_program) delete nbor_program;
|
|
nbor_program=new UCL_Program(dev);
|
|
nbor_program->load_string(ellipsoid_nbor,oclstring.c_str(),nullptr,screen);
|
|
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
|
|
k_nbor.set_function(*nbor_program,"kernel_nbor");
|
|
neigh_tex.get_texture(*nbor_program,"pos_tex");
|
|
|
|
if (ellipsoid_program) delete ellipsoid_program;
|
|
ellipsoid_program=new UCL_Program(dev);
|
|
ellipsoid_program->load_string(ellipsoid_string,oclstring.c_str(),
|
|
nullptr,screen);
|
|
k_ellipsoid.set_function(*ellipsoid_program,kname);
|
|
pos_tex.get_texture(*ellipsoid_program,"pos_tex");
|
|
quat_tex.get_texture(*ellipsoid_program,"quat_tex");
|
|
|
|
if (lj_program) delete lj_program;
|
|
lj_program=new UCL_Program(dev);
|
|
lj_program->load_string(lj_string,oclstring.c_str(),nullptr,screen);
|
|
k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str());
|
|
k_lj_fast.set_function(*lj_program,s_lj_fast.c_str());
|
|
k_lj.set_function(*lj_program,s_lj.c_str());
|
|
if (e_s)
|
|
k_ellipsoid_sphere.set_function(*lj_program,s_ellipsoid_sphere.c_str());
|
|
lj_pos_tex.get_texture(*lj_program,"pos_tex");
|
|
lj_quat_tex.get_texture(*lj_program,"quat_tex");
|
|
|
|
#if defined(LAL_OCL_EV_JIT)
|
|
oclstring = device->compile_string()+" -DEVFLAG=0";
|
|
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
|
|
ellipsoid_program_noev=new UCL_Program(dev);
|
|
ellipsoid_program_noev->load_string(ellipsoid_string,oclstring.c_str(),
|
|
nullptr,screen);
|
|
k_ellipsoid_noev.set_function(*ellipsoid_program_noev,kname);
|
|
|
|
if (lj_program_noev) delete lj_program_noev;
|
|
lj_program_noev=new UCL_Program(dev);
|
|
lj_program_noev->load_string(lj_string,oclstring.c_str(),nullptr,screen);
|
|
k_sphere_ellipsoid_noev.set_function(*lj_program_noev,
|
|
s_sphere_ellipsoid.c_str());
|
|
k_lj_fast_noev.set_function(*lj_program_noev,s_lj_fast.c_str());
|
|
if (e_s)
|
|
k_ellipsoid_sphere_noev.set_function(*lj_program_noev,
|
|
s_ellipsoid_sphere.c_str());
|
|
#else
|
|
k_elps_sel = &k_ellipsoid;
|
|
k_elps_sphere_sel = &k_ellipsoid_sphere;
|
|
k_sphere_elps_sel = &k_sphere_ellipsoid;
|
|
k_lj_sel = &k_lj_fast;
|
|
#endif
|
|
|
|
_compiled=true;
|
|
|
|
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
|
if (dev.cl_device_version() >= 210) {
|
|
size_t mx_subgroup_sz = k_lj_fast.max_subgroup_size(_block_size);
|
|
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid.max_subgroup_size(_block_size));
|
|
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid.max_subgroup_size(_block_size));
|
|
if (e_s)
|
|
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere.max_subgroup_size(_block_size));
|
|
#if defined(LAL_OCL_EV_JIT)
|
|
mx_subgroup_sz = std::min(mx_subgroup_sz, k_lj_fast_noev.max_subgroup_size(_block_size));
|
|
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_noev.max_subgroup_size(_block_size));
|
|
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid_noev.max_subgroup_size(_block_size));
|
|
if (e_s)
|
|
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere_noev.max_subgroup_size(_block_size));
|
|
#endif
|
|
if (_threads_per_atom > mx_subgroup_sz)
|
|
_threads_per_atom = mx_subgroup_sz;
|
|
device->set_simd_size(mx_subgroup_sz);
|
|
}
|
|
#endif
|
|
|
|
}
|
|
|
|
template class BaseEllipsoid<PRECISION,ACC_PRECISION>;
|
|
}
|