Feb2021 GPU Package Update - GPU Package Files
@@ -39,22 +39,23 @@ class Device {
   /// Initialize the device for use by this process
   /** Sets up a per-device MPI communicator for load balancing and initializes
-   * the device (>=first_gpu and <=last_gpu) that this proc will be using
+   * the device (ngpu starting at first_gpu_id) that this proc will be using
    * Returns:
    * - 0 if successful
    * - -2 if GPU not found
    * - -4 if GPU library not compiled for GPU
    * - -6 if GPU could not be initialized for use
    * - -7 if accelerator sharing is not currently allowed on system
-   * - -11 if vendor_string has the wrong number of parameters **/
-  int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
-                  const int last_gpu, const int gpu_mode,
+   * - -11 if config_string has the wrong number of parameters **/
+  int init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
+                  const int first_gpu_id, const int gpu_mode,
                   const double particle_split, const int nthreads,
-                  const int t_per_atom, const double cell_size,
-                  char *vendor_string, const int block_pair);
+                  const int t_per_atom, const double user_cell_size,
+                  char *config_string, const int ocl_platform,
+                  char *device_type_flags, const int block_pair);

   /// Initialize the device for Atom storage
   /** \param charge True if charges need to be stored
    * \param rot True if quaternions need to be stored
    * \param nlocal Total number of local particles to allocate memory for
    * \param nall Total number of local+ghost particles
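As an aside, here is a minimal caller sketch for the new init_device() declaration above. The header name, namespace, template arguments, and every argument value are assumptions for illustration only; just the parameter order and the documented return codes come from the declaration in this hunk.

// Hypothetical usage sketch (not part of the commit): drives the new
// init_device() parameter list shown above. Header name, namespace,
// template arguments, and all values are assumptions for illustration.
#include <mpi.h>
#include <cstdio>
#include "lal_device.h"   // assumed header providing Device

int setup_device_sketch(LAMMPS_AL::Device<float,double> &dev,
                        MPI_Comm world, MPI_Comm replica) {
  char config[] = "";        // config_string: empty placeholder
  char type_flags[] = "";    // device_type_flags: empty placeholder
  int err = dev.init_device(world, replica, /*ngpu=*/1, /*first_gpu_id=*/0,
                            /*gpu_mode=*/0, /*particle_split=*/1.0,
                            /*nthreads=*/1, /*t_per_atom=*/0,
                            /*user_cell_size=*/-1.0, config,
                            /*ocl_platform=*/-1, type_flags,
                            /*block_pair=*/-1);
  if (err == -2) fprintf(stderr, "GPU not found\n");
  else if (err == -11) fprintf(stderr, "config_string has the wrong number of parameters\n");
  return err;   // 0 on success, negative codes as documented above
}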
@@ -94,10 +95,11 @@ class Device {
    *        1 if gpu_nbor is true, and host needs a half nbor list,
    *        2 if gpu_nbor is true, and host needs a full nbor list
    * \param max_nbors Initial number of rows in the neighbor matrix
-   * \param cell_size cutoff+skin
+   * \param cutoff cutoff+skin
    * \param pre_cut True if cutoff test will be performed in separate kernel
    *                than the force kernel
    * \param threads_per_atom value to be used by the neighbor list only
+   * \param ilist_map true if ilist mapping data structures used (3-body)
    *
    * Returns:
    * - 0 if successful
@@ -108,8 +110,9 @@ class Device {
   int init_nbor(Neighbor *nbor, const int nlocal,
                 const int host_nlocal, const int nall,
                 const int maxspecial, const int gpu_host,
-                const int max_nbors, const double cell_size,
-                const bool pre_cut, const int threads_per_atom);
+                const int max_nbors, const double cutoff,
+                const bool pre_cut, const int threads_per_atom,
+                const bool ilist_map = false);

   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name,
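For illustration, a hedged sketch of how the trailing default argument keeps existing two-body call sites unchanged while letting 3-body styles opt in. The wrapper, its values, and the namespace are assumptions; only the init_nbor() parameter list comes from the declaration above.

// Hypothetical wrapper (not part of the commit); all values are placeholders.
#include "lal_device.h"   // assumed header providing Device and Neighbor

template <class numtyp, class acctyp>
int build_nbor_sketch(LAMMPS_AL::Device<numtyp,acctyp> &dev,
                      LAMMPS_AL::Neighbor *nbor, int nlocal, int nall,
                      double cutoff, int t_per_atom, bool three_body) {
  if (!three_body)
    // Existing ten-argument calls still compile: ilist_map defaults to false.
    return dev.init_nbor(nbor, nlocal, /*host_nlocal=*/0, nall,
                         /*maxspecial=*/0, /*gpu_host=*/0, /*max_nbors=*/300,
                         cutoff, /*pre_cut=*/false, t_per_atom);
  // 3-body styles request the ilist mapping data structures explicitly.
  return dev.init_nbor(nbor, nlocal, 0, nall, 0, 0, 300,
                       cutoff, false, t_per_atom, /*ilist_map=*/true);
}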
@@ -161,13 +164,16 @@ class Device {

   /// Add "answers" (force,energies,etc.) into LAMMPS structures
   inline double fix_gpu(double **f, double **tor, double *eatom,
-                        double **vatom, double *virial, double &ecoul) {
+                        double **vatom, double *virial, double &ecoul,
+                        int &error_flag) {
+    error_flag=0;
     atom.data_unavail();
     if (ans_queue.empty()==false) {
       stop_host_timer();
       double evdw=0.0;
       while (ans_queue.empty()==false) {
-        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
+        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul,
+                                             error_flag);
         ans_queue.pop();
       }
       return evdw;
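A small hedged sketch of how host-side code might consume the new error_flag reference; the helper name, template parameters, and header name are assumptions, and only fix_gpu()'s parameter list comes from the code above.

// Hypothetical helper (not part of the commit): forwards to fix_gpu() and
// turns the new error_flag output into a boolean result.
#include "lal_device.h"   // assumed header providing Device

template <class numtyp, class acctyp>
bool collect_answers_sketch(LAMMPS_AL::Device<numtyp,acctyp> &dev,
                            double **f, double **tor, double *eatom,
                            double **vatom, double *virial, double &ecoul,
                            double &evdwl) {
  int error_flag;                    // set to 0 inside fix_gpu(), per the diff
  evdwl = dev.fix_gpu(f, tor, eatom, vatom, virial, ecoul, error_flag);
  return error_flag == 0;            // false signals an accelerator-side error
}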
@@ -228,45 +234,49 @@ class Device {
   /// True if device is being timed
   inline bool time_device() const { return _time_device; }

+  /// Accelerator device configuration id
+  inline int config_id() const { return _config_id; }
+  /// Number of threads executing concurrently on same multiproc
+  inline int simd_size() const { return _simd_size; }
+  /// Return the number of threads accessing memory simultaneously
+  inline int num_mem_threads() const { return _num_mem_threads; }
+  /// 1 if horizontal vector operations enabled, 0 otherwise
+  inline int shuffle_avail() const { return _shuffle_avail; }
+  /// For OpenCL, 0 if fast-math options disabled, 1 enabled
+  inline int fast_math() const { return _fast_math; }
+
   /// Return the number of threads per atom for pair styles
   inline int threads_per_atom() const { return _threads_per_atom; }
   /// Return the number of threads per atom for pair styles using charge
   inline int threads_per_charge() const { return _threads_per_charge; }
+  /// Return the number of threads per atom for 3-body pair styles
+  inline int threads_per_three() const { return _threads_per_three; }

   /// Return the min of the pair block size or the device max block size
   inline int pair_block_size() const { return _block_pair; }
-  /// Return the maximum number of atom types that can be used with shared mem
-  inline int max_shared_types() const { return _max_shared_types; }
-  /// Return the maximum order for PPPM splines
-  inline int pppm_max_spline() const { return _pppm_max_spline; }
-  /// Return the block size for PPPM kernels
-  inline int pppm_block() const { return _pppm_block; }
-  /// Return the block size for neighbor binning
-  inline int block_cell_2d() const { return _block_cell_2d; }
-  /// Return the block size for atom mapping for neighbor builds
-  inline int block_cell_id() const { return _block_cell_id; }
-  /// Return the block size for neighbor build kernel
-  inline int block_nbor_build() const { return _block_nbor_build; }
   /// Return the block size for "bio" pair styles
   inline int block_bio_pair() const { return _block_bio_pair; }
   /// Return the block size for "ellipse" pair styles
   inline int block_ellipse() const { return _block_ellipse; }
+  /// Return the block size for PPPM kernels
+  inline int pppm_block() const { return _pppm_block; }
+  /// Return the block size for neighbor build kernel
+  inline int block_nbor_build() const { return _block_nbor_build; }
+  /// Return the block size for neighbor binning
+  inline int block_cell_2d() const { return _block_cell_2d; }
+  /// Return the block size for atom mapping for neighbor builds
+  inline int block_cell_id() const { return _block_cell_id; }
+
+  /// Return the maximum number of atom types that can be used with shared mem
+  inline int max_shared_types() const { return _max_shared_types; }
   /// Return the maximum number of atom types for shared mem with "bio" styles
   inline int max_bio_shared_types() const { return _max_bio_shared_types; }
+  /// Return the maximum order for PPPM splines
+  inline int pppm_max_spline() const { return _pppm_max_spline; }

   /// Architecture gpu code compiled for (returns 0 for OpenCL)
   inline double ptx_arch() const { return _ptx_arch; }
-  /// Number of threads executing concurrently on same multiproc
-  inline int warp_size() const { return _warp_size; }

   // -------------------- SHARED DEVICE ROUTINES --------------------
   // Perform asynchronous zero of integer array
   void zero(UCL_D_Vec<int> &mem, const int numel) {
     int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
                                          _block_pair));
     k_zero.set_size(num_blocks,_block_pair);
     k_zero.run(&mem,&numel);
   }
+  inline void set_simd_size(int simd_sz) { _simd_size = simd_sz; }

   // -------------------------- DEVICE DATA -------------------------
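Side note on the zero() helper retained above: its launch-grid size is a plain ceiling division of numel by the pair block size. An equivalent integer-only form (illustrative only, not how the file computes it) is:

// Illustrative integer ceiling division, equivalent to
// ceil(static_cast<double>(numel)/_block_pair) for positive inputs.
inline int num_blocks_for(int numel, int block_size) {
  return (numel + block_size - 1) / block_size;
}
// e.g. numel = 1000, block_size = 128  ->  8 blocks covering 1024 work items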
@@ -304,6 +314,15 @@ class Device {
   }

   inline std::string compile_string() { return _ocl_compile_string; }
+  inline std::string ocl_config_name() { return _ocl_config_name; }
+
+  template <class t>
+  inline std::string toa(const t& in) {
+    std::ostringstream o;
+    o.precision(2);
+    o << in;
+    return o.str();
+  }

  private:
   std::queue<Answer<numtyp,acctyp> *> ans_queue;
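The toa() helper, now exposed in the public section above (and dropped from the private section in a later hunk), is a stream-based number-to-string conversion with two significant digits. A self-contained copy behaves as follows; the example values are my own:

// Standalone copy of the toa() pattern above, for illustration only.
#include <iostream>
#include <sstream>
#include <string>

template <class t>
inline std::string toa(const t& in) {
  std::ostringstream o;
  o.precision(2);   // default float format: two significant digits
  o << in;
  return o.str();
}

int main() {
  std::cout << toa(3.14159) << " " << toa(1536.0) << "\n";   // "3.1 1.5e+03"
  return 0;
}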
@@ -316,13 +335,13 @@ class Device {
   double _particle_split;
   double _cpu_full;
   double _ptx_arch;
-  double _cell_size; // -1 if the cutoff is used
+  double _user_cell_size; // -1 if the cutoff is used

-  int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
-  int _pppm_max_spline, _pppm_block;
-  int _block_pair, _block_ellipse, _max_shared_types;
-  int _block_cell_2d, _block_cell_id, _block_nbor_build;
-  int _block_bio_pair, _max_bio_shared_types;
+  int _config_id, _simd_size, _num_mem_threads, _shuffle_avail, _fast_math;
+  int _threads_per_atom, _threads_per_charge, _threads_per_three;
+  int _block_pair, _block_bio_pair, _block_ellipse;
+  int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id;
+  int _max_shared_types, _max_bio_shared_types, _pppm_max_spline;

   UCL_Program *dev_program;
   UCL_Kernel k_zero, k_info;
@@ -331,17 +350,8 @@ class Device {

   int _data_in_estimate, _data_out_estimate;

-  std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string;
-  int set_ocl_params(char *);
-
-  template <class t>
-  inline std::string toa(const t& in) {
-    std::ostringstream o;
-    o.precision(2);
-    o << in;
-    return o.str();
-  }
-
+  std::string _ocl_config_name, _ocl_config_string, _ocl_compile_string;
+  int set_ocl_params(std::string, std::string);
 };

 }