Files
lammps/lib/gpu/lal_base_ellipsoid.cpp
Michael Brown 45c782308c Fixing issue from recent GPU package update with OMP_NUM_THREADS env being overridden in GPU library.
Fixing race condition with OpenMP for GPU styles using torque (missed in regression tests due to the first fix)
Documenting GPU package option for setting the number of threads (consistent with USER-INTEL and USER-OMP).
2021-02-18 21:08:18 -08:00

580 lines
20 KiB
C++

/***************************************************************************
base_ellipsoid.cpp
-------------------
W. Michael Brown (ORNL)
Base class for acceleration of ellipsoid potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Thu May 5 2011
email : brownw@ornl.gov
***************************************************************************/
#include "lal_base_ellipsoid.h"
#include <cstdlib>
namespace LAMMPS_AL {
#if defined(USE_OPENCL)
#include "ellipsoid_nbor_cl.h"
#elif defined(USE_CUDART)
const char *ellipsoid_nbor=0;
#else
#include "ellipsoid_nbor_cubin.h"
#endif
#define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0),
host_olist_size(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
nbor_program=nullptr;
ellipsoid_program=nullptr;
lj_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
ellipsoid_program_noev=nullptr;
lj_program_noev=nullptr;
#endif
}
template <class numtyp, class acctyp>
BaseEllipsoidT::~BaseEllipsoid() {
delete ans;
delete nbor;
k_nbor_fast.clear();
k_nbor.clear();
k_ellipsoid.clear();
k_ellipsoid_sphere.clear();
k_sphere_ellipsoid.clear();
k_lj_fast.clear();
k_lj.clear();
if (nbor_program) delete nbor_program;
if (ellipsoid_program) delete ellipsoid_program;
if (lj_program) delete lj_program;
#if defined(LAL_OCL_EV_JIT)
k_ellipsoid_noev.clear();
k_ellipsoid_sphere_noev.clear();
k_sphere_ellipsoid_noev.clear();
k_lj_fast.clear();
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
if (lj_program_noev) delete lj_program_noev;
#endif
}
template <class numtyp, class acctyp>
int BaseEllipsoidT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseEllipsoidT::init_base(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const int ntypes, int **h_form,
const void *ellipsoid_program,
const void *lj_program, const char *k_name,
const bool ellip_sphere) {
screen=_screen;
_ellipsoid_sphere=ellip_sphere;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,nall,maxspecial);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_lj.init(*ucl_device);
time_nbor1.init(*ucl_device);
time_ellipsoid.init(*ucl_device);
time_nbor2.init(*ucl_device);
time_ellipsoid2.init(*ucl_device);
time_nbor3.init(*ucl_device);
time_ellipsoid3.init(*ucl_device);
time_lj.zero();
time_nbor1.zero();
time_ellipsoid.zero();
time_nbor2.zero();
time_ellipsoid2.zero();
time_nbor3.zero();
time_ellipsoid3.zero();
// See if we want fast GB-sphere or sphere-sphere calculations
_host_form=h_form;
_multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (_host_form[i][j]!=ELLIPSE_ELLIPSE)
_multiple_forms=true;
if (_multiple_forms && host_nlocal>0)
return -8;
if (_multiple_forms && gpu_nbor!=0)
return -9;
if (_multiple_forms) {
ans->force.zero();
host_olist_size = nbor->max_atoms();
host_olist = new int[nbor->max_atoms()];
}
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
neigh_tex.bind_float(atom->x,4);
pos_tex.bind_float(atom->x,4);
quat_tex.bind_float(atom->quat,4);
lj_pos_tex.bind_float(atom->x,4);
lj_quat_tex.bind_float(atom->quat,4);
return 0;
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::clear_base() {
// Output any timing information
output_times();
if (host_olist_size) {
host_olist_size = 0;
delete []host_olist;
}
time_nbor1.clear();
time_ellipsoid.clear();
time_nbor2.clear();
time_ellipsoid2.clear();
time_nbor3.clear();
time_ellipsoid3.clear();
time_lj.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::output_times() {
// Output any timing information
acc_timers();
double single[10], times[10];
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds()+nbor->time_hybrid1.total_seconds()+
nbor->time_hybrid2.total_seconds();
single[2]=time_nbor1.total_seconds()+time_nbor2.total_seconds()+
time_nbor3.total_seconds()+nbor->time_nbor.total_seconds();
single[3]=time_ellipsoid.total_seconds()+time_ellipsoid2.total_seconds()+
time_ellipsoid3.total_seconds();
if (_multiple_forms)
single[4]=time_lj.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time()+ans->cast_time();
single[6]=_gpu_overhead;
single[7]=_driver_overhead;
single[8]=ans->cpu_idle_time();
single[9]=nbor->bin_time();
MPI_Reduce(single,times,10,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
double max_mb=mpi_max_bytes/(1024*1024);
#ifdef USE_OPENCL
// Workaround for timing issue on Intel OpenCL
if (times[3] > 80e6) times[3]=0.0;
#endif
if (device->replica_me()==0)
if (screen && times[7]>0.0) {
int replica_size=device->replica_size();
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," Device Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1 && times[3]>0) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
if (nbor->gpu_nbor()>0)
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
if (times[6]>0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Lanes / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Vector width: %d.\n", device->simd_size());
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
if (nbor->gpu_nbor()==2)
fprintf(screen,"CPU Neighbor: %.4f s.\n",times[9]/replica_size);
fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
_max_bytes=0.0;
}
// ---------------------------------------------------------------------------
// Pack neighbors to limit thread divergence for lj-lj and ellipse
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
const int inum, const int form_low,
const int form_high, const bool shared_types,
int ntypes) {
int stride=nbor->nbor_pitch();
if (shared_types) {
k_nbor_fast.set_size(GX,BX);
k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start,
&inum, &nbor->dev_packed, &form_low, &form_high,
&_threads_per_atom);
} else {
k_nbor.set_size(GX,BX);
k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride,
&start, &inum, &nbor->dev_packed, &form_low, &form_high,
&_threads_per_atom);
}
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
const int osize, int *ilist,
int *numj, int *type, int **firstneigh,
bool &success) {
success=true;
int mn=nbor->max_nbor_loop(osize,numj,ilist);
resize_atom(nall,success);
resize_local(inum,0,mn,osize,success);
if (!success)
return;
if (_multiple_forms) {
int p=0;
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (_host_form[itype][itype]==ELLIPSE_ELLIPSE) {
host_olist[p]=ilist[i];
p++;
}
}
_max_last_ellipse=p;
_last_ellipse=std::min(inum,_max_last_ellipse);
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (_host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
host_olist[p]=ilist[i];
p++;
}
}
nbor->get_host(inum,host_olist,numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
return;
}
_last_ellipse=inum;
_max_last_ellipse=inum;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
bool &success) {
success=true;
resize_atom(nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),0,success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
nbor->copy_unpacked(inum,mn);
_last_ellipse=inum;
_max_last_ellipse=inum;
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
acc_timers();
int eflag, vflag;
if (eflag_in) eflag=2;
else eflag=0;
if (vflag_in) vflag=2;
else vflag=0;
set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
zero_timers();
return nullptr;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, inum_full, ilist, numj, host_type, firstneigh,
success);
if (!success)
return nullptr;
}
int *list;
if (_multiple_forms)
list=host_olist;
else
list=ilist;
atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_quat_data();
loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,list,inum);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return list;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseEllipsoidT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_quat) {
acc_timers();
int eflag, vflag;
if (eflag_in) eflag=2;
else eflag=0;
if (vflag_in) vflag=2;
else vflag=0;
set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
zero_timers();
return nullptr;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return nullptr;
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_quat_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseEllipsoidT::host_memory_usage_base() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseEllipsoid<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
const void *ellipsoid_string,
const void *lj_string,
const char *kname, const bool e_s) {
if (_compiled)
return;
std::string kns=kname;
std::string s_sphere_ellipsoid=kns+"_sphere_ellipsoid";
std::string s_ellipsoid_sphere=kns+"_ellipsoid_sphere";
std::string s_lj=kns+"_lj";
std::string s_lj_fast=kns+"_lj_fast";
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
if (nbor_program) delete nbor_program;
nbor_program=new UCL_Program(dev);
nbor_program->load_string(ellipsoid_nbor,oclstring.c_str(),nullptr,screen);
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
k_nbor.set_function(*nbor_program,"kernel_nbor");
neigh_tex.get_texture(*nbor_program,"pos_tex");
if (ellipsoid_program) delete ellipsoid_program;
ellipsoid_program=new UCL_Program(dev);
ellipsoid_program->load_string(ellipsoid_string,oclstring.c_str(),
nullptr,screen);
k_ellipsoid.set_function(*ellipsoid_program,kname);
pos_tex.get_texture(*ellipsoid_program,"pos_tex");
quat_tex.get_texture(*ellipsoid_program,"quat_tex");
if (lj_program) delete lj_program;
lj_program=new UCL_Program(dev);
lj_program->load_string(lj_string,oclstring.c_str(),nullptr,screen);
k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str());
k_lj_fast.set_function(*lj_program,s_lj_fast.c_str());
k_lj.set_function(*lj_program,s_lj.c_str());
if (e_s)
k_ellipsoid_sphere.set_function(*lj_program,s_ellipsoid_sphere.c_str());
lj_pos_tex.get_texture(*lj_program,"pos_tex");
lj_quat_tex.get_texture(*lj_program,"quat_tex");
#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
ellipsoid_program_noev=new UCL_Program(dev);
ellipsoid_program_noev->load_string(ellipsoid_string,oclstring.c_str(),
nullptr,screen);
k_ellipsoid_noev.set_function(*ellipsoid_program_noev,kname);
if (lj_program_noev) delete lj_program_noev;
lj_program_noev=new UCL_Program(dev);
lj_program_noev->load_string(lj_string,oclstring.c_str(),nullptr,screen);
k_sphere_ellipsoid_noev.set_function(*lj_program_noev,
s_sphere_ellipsoid.c_str());
k_lj_fast_noev.set_function(*lj_program_noev,s_lj_fast.c_str());
if (e_s)
k_ellipsoid_sphere_noev.set_function(*lj_program_noev,
s_ellipsoid_sphere.c_str());
#else
k_elps_sel = &k_ellipsoid;
k_elps_sphere_sel = &k_ellipsoid_sphere;
k_sphere_elps_sel = &k_sphere_ellipsoid;
k_lj_sel = &k_lj_fast;
#endif
_compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_lj_fast.max_subgroup_size(_block_size);
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid.max_subgroup_size(_block_size));
if (e_s)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere.max_subgroup_size(_block_size));
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_lj_fast_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid_noev.max_subgroup_size(_block_size));
if (e_s)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
}
template class BaseEllipsoid<PRECISION,ACC_PRECISION>;
}