Using a (signed) int would overflow at 2 GB; switching to unsigned risks hiding overflows rather than fixing them; and long long is not as portable as double-precision floating point. Hence host_memory_usage() in the header below returns its byte count as a double.
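A minimal standalone sketch of that trade-off (illustrative values only, not code from LAMMPS):

#include <cstdio>
#include <limits>

int main() {
  // A signed 32-bit int tops out at 2^31 - 1 bytes, just under 2 GiB,
  // so a byte counter of this type overflows on large allocations.
  std::printf("int max:          %d bytes\n", std::numeric_limits<int>::max());

  // A double represents integer byte counts exactly up to 2^53 (~9 PiB),
  // has the same width and range on every platform, and a little rounding
  // in a memory-usage report is harmless.
  double bytes = 9007199254740992.0;  // 2^53, still exactly representable
  std::printf("double max exact: %.0f bytes\n", bytes);
  return 0;
}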
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef PAIR_GPU_DEVICE_H
#define PAIR_GPU_DEVICE_H

#include "pair_gpu_atom.h"
#include "pair_gpu_nbor.h"
#include "mpi.h"
#include <sstream>
#include "stdio.h"
#include <string>

template <class numtyp, class acctyp>
class PairGPUDevice {
 public:
  PairGPUDevice();
  ~PairGPUDevice();

  /// Initialize the device for use by this process
  /** Sets up a per-device MPI communicator for load balancing and initializes
    * the device (>=first_gpu and <=last_gpu) that this proc will be using **/
  bool init_device(const int first_gpu, const int last_gpu,
                   const int gpu_mode, const double particle_split);

  /// Initialize the device for Atom and Neighbor storage
  /** \param charge True if charges need to be stored
    * \param rot True if quaternions need to be stored
    * \param nlocal Total number of local particles to allocate memory for
    * \param host_nlocal Initial number of host particles to allocate memory for
    * \param nall Total number of local+ghost particles
    * \param maxspecial Maximum number of special bonded atoms per atom
    * \param gpu_nbor True if neighboring is performed on device
    * \param gpu_host 0 if host will not perform force calculations,
    *                 1 if gpu_nbor is true and host needs a half nbor list,
    *                 2 if gpu_nbor is true and host needs a full nbor list
    * \param max_nbors Initial number of rows in the neighbor matrix
    * \param cell_size cutoff+skin
    * \param pre_cut True if cutoff test will be performed in a separate kernel
    *                from the force kernel **/
  bool init(const bool charge, const bool rot, const int nlocal,
            const int host_nlocal, const int nall, const int maxspecial,
            const bool gpu_nbor, const int gpu_host, const int max_nbors,
            const double cell_size, const bool pre_cut);

  /// Output a message for pair_style acceleration with device stats
  void init_message(FILE *screen, const char *name,
                    const int first_gpu, const int last_gpu);

  /// Output a message with timing information
  void output_times(UCL_Timer &time_pair, const double avg_split,
                    const double max_bytes, FILE *screen);

  /// Clear all memory on host and device associated with atom and nbor data
  void clear();

  /// Clear all memory on host and device
  void clear_device();

  /// Start timer on host
  inline void start_host_timer() { _cpu_full=MPI_Wtime(); }

  /// Stop timer on host
  inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }

  /// Return host time
  inline double host_time() { return _cpu_full; }

  /// Return host memory usage in bytes
  double host_memory_usage() const;

  /// Return the number of procs sharing a device (size of device communicator)
  inline int procs_per_gpu() const { return _procs_per_gpu; }
  /// Return my rank in the device communicator
  inline int gpu_rank() const { return _gpu_rank; }
  /// My rank within all processes
  inline int world_me() const { return _world_me; }
  /// Total number of processes
  inline int world_size() const { return _world_size; }
  /// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
  inline int gpu_mode() const { return _gpu_mode; }
  /// Index of first device used by a node
  inline int first_device() const { return _first_device; }
  /// Index of last device used by a node
  inline int last_device() const { return _last_device; }
  /// Particle split defined in fix
  inline double particle_split() const { return _particle_split; }
  /// Return the initialization count for the device
  inline int init_count() const { return _init_count; }

  // -------------------------- DEVICE DATA -------------------------

  /// Geryon Device
  UCL_Device *gpu;
  /// Device communicator
  MPI_Comm gpu_comm;

  enum{GPU_FORCE, GPU_NEIGH};

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  PairGPUAtom<numtyp,acctyp> atom;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor Data
  PairGPUNbor nbor;

 private:
  int _init_count;
  bool _device_init;
  int _procs_per_gpu, _gpu_rank, _world_me, _world_size;
  int _gpu_mode, _first_device, _last_device;
  double _particle_split;
  double _cpu_full;

  template <class t>
  inline std::string toa(const t& in) {
    std::ostringstream o;
    o.precision(2);
    o << in;
    return o.str();
  }

};

#endif
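For context, a hypothetical caller would drive the class roughly as follows; the control flow is inferred from the declarations above, and every literal (device indices, atom counts, the "lj/cut" name) is illustrative rather than taken from LAMMPS:

#include "pair_gpu_device.h"

typedef PairGPUDevice<float,double> Device;

void example(FILE *screen) {
  Device device;

  // Bind this MPI rank to device 0 and neighbor on the GPU,
  // with all particles handled on the device (split = 1.0).
  if (!device.init_device(0, 0, Device::GPU_NEIGH, 1.0))
    return;                                  // no usable device

  // Allocate storage: charges but no quaternions, 1000 local atoms,
  // none kept on the host, 1500 local+ghost atoms, no special bonds,
  // device neighboring with no host list, 300 neighbor rows,
  // cell size of cutoff+skin, cutoff test inside the force kernel.
  if (!device.init(true, false, 1000, 0, 1500, 0, true, 0, 300, 3.0, false))
    return;

  device.init_message(screen, "lj/cut", 0, 0);

  device.start_host_timer();
  // ... host-side work for the timestep ...
  device.stop_host_timer();
  fprintf(screen, "host time: %g s, host memory: %g bytes\n",
          device.host_time(), device.host_memory_usage());

  device.clear();         // free atom/neighbor storage
  device.clear_device();  // release the device entirely
}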