Merge branch 'amoeba' into amoeba-gpu

This commit is contained in:
Trung Nguyen
2022-04-22 16:10:24 -05:00
4933 changed files with 388163 additions and 506968 deletions

View File

@@ -101,7 +101,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
// Get the names of all nodes
int name_length;
char node_name[MPI_MAX_PROCESSOR_NAME];
char *node_names = new char[MPI_MAX_PROCESSOR_NAME*_world_size];
auto node_names = new char[MPI_MAX_PROCESSOR_NAME*_world_size];
MPI_Get_processor_name(node_name,&name_length);
MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names[0],
MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
@@ -198,12 +198,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
// Find deviceID with most CUs (priority given to the accelerator type)
if (_first_device < 0) {
int best_device = 0;
int best_cus = gpu->cus(0);
unsigned best_cus = gpu->cus(0);
bool type_match = (gpu->device_type(0) == type);
for (int i = 1; i < gpu->num_devices(); i++) {
if (type_match==true && gpu->device_type(i)!=type)
if (type_match && gpu->device_type(i)!=type)
continue;
if (type_match == false && gpu->device_type(i) == type) {
if (type_match && gpu->device_type(i) == type) {
type_match = true;
best_cus = gpu->cus(i);
best_device = i;
@@ -280,7 +280,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
MPI_Comm_rank(_comm_gpu,&_gpu_rank);
#if !defined(CUDA_PROXY) && !defined(CUDA_MPS_SUPPORT)
if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
if (_procs_per_gpu>1 && !gpu->sharing_supported(my_gpu))
return -7;
#endif
@@ -333,6 +333,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
gpu_barrier();
}
// check if double precision support is available
#if defined(_SINGLE_DOUBLE) || defined(_DOUBLE_DOUBLE)
if (!gpu->double_precision())
return -16;
#endif
// Setup auto bin size calculation for calls from atom::sort
// - This is repeated in neighbor init with additional info
if (_user_cell_size<0.0) {
@@ -348,7 +354,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
}
template <class numtyp, class acctyp>
int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) {
int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args) {
#ifdef USE_OPENCL
#include "lal_pre_ocl_config.h"
@@ -368,7 +374,7 @@ int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) {
int token_count=0;
std::string params[18];
char ocl_config[2048];
strcpy(ocl_config,s_config.c_str());
strncpy(ocl_config,s_config.c_str(),2047);
char *pch = strtok(ocl_config,",");
_ocl_config_name=pch;
pch = strtok(nullptr,",");
@@ -394,7 +400,7 @@ int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) {
_ocl_compile_string += " -DCONFIG_ID="+params[0]+
" -DSIMD_SIZE="+params[1]+
" -DMEM_THREADS="+params[2];
if (gpu->has_shuffle_support()==false)
if (!gpu->has_shuffle_support())
_ocl_compile_string+=" -DSHUFFLE_AVAIL=0";
else
_ocl_compile_string+=" -DSHUFFLE_AVAIL="+params[3];
@@ -420,6 +426,16 @@ int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) {
return 0;
}
template <class numtyp, class acctyp>
std::string DeviceT::compile_string_nofast() {
  // Return a copy of the OpenCL compile string with fast-math disabled:
  // the "-cl-fast-relaxed-math" compiler flag is stripped and any
  // "-DFAST_MATH=<x>" define has its value forced to '0'.
  //
  // The erase/overwrite lengths are derived from the search strings instead
  // of the hard-coded 22 and 12 of the original, so the literal and its
  // length cannot silently drift apart.
  const std::string fast_flag("-cl-fast-relaxed-math ");  // trailing space intentional
  const std::string fast_define("-DFAST_MATH=");
  std::string no_fast = _ocl_compile_string;
  std::size_t p = no_fast.find(fast_flag);
  if (p != std::string::npos) no_fast.erase(p, fast_flag.length());
  p = no_fast.find(fast_define);
  // Overwrites the single character following '='; assumes a one-character
  // value (e.g. "-DFAST_MATH=1") -- TODO confirm against where the compile
  // string is assembled.
  if (p != std::string::npos) no_fast[p + fast_define.length()] = '0';
  return no_fast;
}
template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
@@ -427,7 +443,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool vel, const int extra_fields) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision())
return -5;
// Counts of data transfers for timing overhead estimates
@@ -467,11 +483,11 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_data_in_estimate++;
} else {
if (atom.charge()==false && charge)
if (!atom.charge() && charge)
_data_in_estimate++;
if (atom.quaternion()==false && rot)
if (!atom.quaternion() && rot)
_data_in_estimate++;
if (atom.velocity()==false && vel)
if (!atom.velocity() && vel)
_data_in_estimate++;
if (atom.using_extra()==false && extra_fields>0)
_data_in_estimate++;
@@ -491,7 +507,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
const int nall) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision())
return -5;
if (_init_count==0) {
@@ -541,14 +557,9 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
return -3;
if (_user_cell_size<0.0) {
#ifndef LAL_USE_OLD_NEIGHBOR
_neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
#else
_neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
#endif
} else
_neighbor_shared.setup_auto_cell_size(false,_user_cell_size,
nbor->simd_size());
_neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
nbor->set_cutoff(cutoff);
return 0;
@@ -782,28 +793,30 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
#ifdef USE_OPENCL
// Workaround for timing issue on Intel OpenCL
if (times[0] > 80e6) times[0]=0.0;
if (times[3] > 80e6) times[3]=0.0;
if (times[5] > 80e6) times[5]=0.0;
#endif
if (replica_me()==0)
if (screen && times[6]>0.0) {
if (screen && (times[6] > 0.0)) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," Device Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device() && times[3]>0) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
if (time_device() && (times[3] > 0.0)) {
if (times[0] > 0.0)
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
if (nbor.gpu_nbor()>0)
if (nbor.gpu_nbor() > 0.0)
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
}
if (times[5]>0)
if (times[5] > 0.0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom);
@@ -977,18 +990,16 @@ int DeviceT::compile_kernels() {
_max_bio_shared_types=gpu_lib_data[17];
_pppm_max_spline=gpu_lib_data[18];
if (static_cast<size_t>(_block_pair)>gpu->group_size_dim(0) ||
static_cast<size_t>(_block_bio_pair)>gpu->group_size_dim(0) ||
static_cast<size_t>(_block_ellipse)>gpu->group_size_dim(0) ||
static_cast<size_t>(_pppm_block)>gpu->group_size_dim(0) ||
static_cast<size_t>(_block_nbor_build)>gpu->group_size_dim(0) ||
static_cast<size_t>(_block_cell_2d)>gpu->group_size_dim(0) ||
static_cast<size_t>(_block_cell_2d)>gpu->group_size_dim(1) ||
static_cast<size_t>(_block_cell_id)>gpu->group_size_dim(0) ||
static_cast<size_t>(_max_shared_types*_max_shared_types*
sizeof(numtyp)*17 > gpu->slm_size()) ||
static_cast<size_t>(_max_bio_shared_types*2*sizeof(numtyp) >
gpu->slm_size()))
if (static_cast<size_t>(_block_pair) > gpu->group_size_dim(0) ||
static_cast<size_t>(_block_bio_pair) > gpu->group_size_dim(0) ||
static_cast<size_t>(_block_ellipse) > gpu->group_size_dim(0) ||
static_cast<size_t>(_pppm_block) > gpu->group_size_dim(0) ||
static_cast<size_t>(_block_nbor_build) > gpu->group_size_dim(0) ||
static_cast<size_t>(_block_cell_2d) > gpu->group_size_dim(0) ||
static_cast<size_t>(_block_cell_2d) > gpu->group_size_dim(1) ||
static_cast<size_t>(_block_cell_id) > gpu->group_size_dim(0) ||
static_cast<size_t>(_max_shared_types*_max_shared_types*sizeof(numtyp)*17 > gpu->slm_size()) ||
static_cast<size_t>(_max_bio_shared_types*2*sizeof(numtyp) > gpu->slm_size()))
return -13;
if (_block_pair % _simd_size != 0 || _block_bio_pair % _simd_size != 0 ||
@@ -1033,10 +1044,18 @@ Device<PRECISION,ACC_PRECISION> global_device;
using namespace LAMMPS_AL;
bool lmp_has_gpu_device()
// check if a suitable GPU is present.
// for mixed and double precision GPU library compilation
// also the GPU needs to support double precision.
bool lmp_has_compatible_gpu_device()
{
UCL_Device gpu;
return (gpu.num_platforms() > 0);
bool compatible_gpu = gpu.num_platforms() > 0;
#if defined(_SINGLE_DOUBLE) || defined(_DOUBLE_DOUBLE)
if (compatible_gpu && !gpu.double_precision(0))
compatible_gpu = false;
#endif
return compatible_gpu;
}
std::string lmp_gpu_device_info()
@@ -1064,9 +1083,8 @@ void lmp_clear_device() {
global_device.clear_device();
}
double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul,
int &error_flag) {
double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom,
double *virial, double &ecoul, int &error_flag) {
return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul,error_flag);
}