diff --git a/doc/src/package.rst b/doc/src/package.rst
index aea4ba657f..842fc8bc1c 100644
--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@@ -32,10 +32,12 @@ Syntax
         size = bin size for neighbor list construction (distance units)
       *split* = fraction
         fraction = fraction of atoms assigned to GPU (default = 1.0)
-      *tpa* value = Nthreads
-        Nthreads = # of GPU vector lanes used per atom
+      *tpa* value = Nlanes
+        Nlanes = # of GPU vector lanes (CUDA threads) used per atom
       *blocksize* value = size
         size = thread block size for pair force computation
+      *omp* value = Nthreads
+        Nthreads = number of OpenMP threads to use on CPU (default = 0)
       *platform* value = id
         id = For OpenCL, platform ID for the GPU or accelerator
       *gpuID* values = id
@@ -101,7 +103,7 @@ Syntax
          off = use device acceleration (e.g. GPU) for all available styles in the KOKKOS package (default)
          on = use device acceleration only for pair styles (and host acceleration for others)
   *omp* args = Nthreads keyword value ...
-    Nthread = # of OpenMP threads to associate with each MPI process
+    Nthreads = # of OpenMP threads to associate with each MPI process
     zero or more keyword/value pairs may be appended
     keywords = *neigh*
       *neigh* value = *yes* or *no*
@@ -116,7 +118,7 @@ Examples
    package gpu 0
    package gpu 1 split 0.75
    package gpu 2 split -1.0
-   package gpu 0 device_type intelgpu
+   package gpu 0 omp 2 device_type intelgpu
    package kokkos neigh half comm device
    package omp 0 neigh no
    package omp 4
@@ -266,10 +268,10 @@ with MPI.
 
 The *tpa* keyword sets the number of GPU vector lanes per atom used to
 perform force calculations.  With a default value of 1, the number of
-threads will be chosen based on the pair style, however, the value can
+lanes will be chosen based on the pair style; however, the value can
 be set explicitly with this keyword to fine-tune performance.  For
 large cutoffs or with a small number of particles per GPU, increasing
-the value can improve performance. The number of threads per atom must
+the value can improve performance. The number of lanes per atom must
 be a power of 2 and currently cannot be greater than the SIMD width
 for the GPU / accelerator.  In the case it exceeds the SIMD width, it
 will automatically be decreased to meet the restriction.
@@ -282,6 +284,14 @@ individual GPU cores, but reduces the total number of thread blocks,
 thus may lead to load imbalance.  On modern hardware, the sensitivity
 to the blocksize is typically low.
 
+The *Nthreads* value for the *omp* keyword sets the number of OpenMP
+threads allocated for each MPI task. This setting controls OpenMP
+parallelism only for routines run on the CPUs. For more details on
+setting the number of OpenMP threads, see the discussion of the
+*Nthreads* setting on this doc page for the "package omp" command.
+The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL,
+and USER-OMP packages.
+
 The *platform* keyword is only used with OpenCL to specify the ID for
 an OpenCL platform. See the output from ocl_get_devices in the
 lib/gpu directory. In LAMMPS only one platform can be active at a
@@ -336,44 +346,13 @@ built with co-processor support.
 
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
-The *omp* keyword determines the number of OpenMP threads allocated
-for each MPI task when any portion of the interactions computed by a
-USER-INTEL pair style are run on the CPU.  This can be the case even
-if LAMMPS was built with co-processor support; see the *balance*
-keyword discussion below.  If you are running with less MPI tasks/node
-than there are CPUs, it can be advantageous to use OpenMP threading on
-the CPUs.
-
-.. note::
-
-   The *omp* keyword has nothing to do with co-processor threads on
-   the Xeon Phi; see the *tpc* and *tptask* keywords below for a
-   discussion of co-processor threads.
-
-The *Nthread* value for the *omp* keyword sets the number of OpenMP
-threads allocated for each MPI task. Setting *Nthread* = 0 (the
-default) instructs LAMMPS to use whatever value is the default for the
-given OpenMP environment. This is usually determined via the
-*OMP_NUM_THREADS* environment variable or the compiler runtime, which
-is usually a value of 1.
-
-For more details, including examples of how to set the OMP_NUM_THREADS
-environment variable, see the discussion of the *Nthreads* setting on
-this doc page for the "package omp" command. Nthreads is a required
-argument for the USER-OMP package. Its meaning is exactly the same
-for the USER-INTEL package.
-
-.. note::
-
-   If you build LAMMPS with both the USER-INTEL and USER-OMP
-   packages, be aware that both packages allow setting of the *Nthreads*
-   value via their package commands, but there is only a single global
-   *Nthreads* value used by OpenMP. Thus if both package commands are
-   invoked, you should insure the two values are consistent. If they are
-   not, the last one invoked will take precedence, for both packages.
-   Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel"
-   command, followed by a "package omp" command, both with a setting of
-   *Nthreads* = 0.
+The *Nthreads* value for the *omp* keyword sets the number of OpenMP
+threads allocated for each MPI task. This setting controls OpenMP
+parallelism only for routines run on the CPUs. For more details on
+setting the number of OpenMP threads, see the discussion of the
+*Nthreads* setting on this doc page for the "package omp" command.
+The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL,
+and USER-OMP packages.
 
 The *mode* keyword determines the precision mode to use for computing
 pair style forces, either on the CPU or on the co-processor,
@@ -579,7 +558,7 @@ result in better performance for certain configurations and system
 sizes.
 
 The *omp* style invokes settings associated with the use of the
 USER-OMP package.
 
-The *Nthread* argument sets the number of OpenMP threads allocated for
+The *Nthreads* argument sets the number of OpenMP threads allocated for
 each MPI task.  For example, if your system has nodes with dual
 quad-core processors, it has a total of 8 cores per node.  You could
 use two MPI tasks per node (e.g. using the -ppn option of the mpirun
@@ -588,7 +567,7 @@ This would use all 8 cores on each node.  Note that the product of MPI
 tasks \* threads/task should not exceed the physical number of cores
 (on a node), otherwise performance will suffer.
 
-Setting *Nthread* = 0 instructs LAMMPS to use whatever value is the
+Setting *Nthreads* = 0 instructs LAMMPS to use whatever value is the
 default for the given OpenMP environment.  This is usually determined
 via the *OMP_NUM_THREADS* environment variable or the compiler
 runtime.  Note that in most cases the default for OpenMP capable
@@ -619,6 +598,18 @@ input.  Not all features of LAMMPS support OpenMP threading via the
 USER-OMP package and the parallel efficiency can be very different,
 too.
 
+.. note::
+
+   If you build LAMMPS with the GPU, USER-INTEL, and / or USER-OMP
+   packages, be aware these packages all allow setting of the *Nthreads*
+   value via their package commands, but there is only a single global
+   *Nthreads* value used by OpenMP. Thus if multiple package commands are
+   invoked, you should ensure the values are consistent. If they are
+   not, the last one invoked will take precedence, for all packages.
+   Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel" command, followed by a
+   "package omp" command, both with a setting of *Nthreads* = 0. Likewise
+   for a hybrid suffix for gpu and omp.
+
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
@@ -665,7 +656,7 @@ Default
 
 For the GPU package, the default is Ngpu = 0 and the option defaults
 are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
-to Ngpu-1, tpa = 1, and platform=-1. These settings are made
+to Ngpu-1, tpa = 1, omp = 0, and platform=-1. These settings are made
 automatically if the "-sf gpu" :doc:`command-line switch ` is used.  If it is not used, you must invoke the package gpu
 command in your input script or via the "-pk gpu" :doc:`command-line switch `.
diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp
index e2478a64e5..4a68466d05 100644
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@@ -331,11 +331,11 @@ void AnswerT::get_answers(double **f, double **tor) {
   }
   if (_rot) {
     vec3d *torp=reinterpret_cast<vec3d *>(&(tor[0][0]));
-    forcep=reinterpret_cast<vec4d_t *>(&(force[_inum*4]));
+    vec4d_t *torquep=reinterpret_cast<vec4d_t *>(&(force[_inum*4]));
     for (int i=ifrom; i0)
       fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
     fprintf(screen,"Average split: %.4f.\n",avg_split);
-    fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
+    fprintf(screen,"Lanes / atom: %d.\n",_threads_per_atom);
     fprintf(screen,"Vector width: %d.\n", device->simd_size());
     fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
     if (nbor->gpu_nbor()==2)
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 5ba9185e6f..a65c3d8810 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -53,14 +53,10 @@ DeviceT::~Device() {
 template <class numtyp, class acctyp>
 int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                          const int first_gpu_id, const int gpu_mode,
-                         const double p_split, const int nthreads,
-                         const int t_per_atom, const double user_cell_size,
-                         char *ocl_args, const int ocl_platform,
-                         char *device_type_flags, const int block_pair) {
-  _nthreads=nthreads;
-  #if (LAL_USE_OMP == 1)
-  omp_set_num_threads(nthreads);
-  #endif
+                         const double p_split, const int t_per_atom,
+                         const double user_cell_size, char *ocl_args,
+                         const int ocl_platform, char *device_type_flags,
+                         const int block_pair) {
   _threads_per_atom=t_per_atom;
   _threads_per_charge=t_per_atom;
   _threads_per_three=t_per_atom;
@@ -583,7 +579,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
     fprintf(screen,"- Using acceleration for %s:\n",name);
     fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
   #if (LAL_USE_OMP == 1)
-    fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
+    fprintf(screen,"- with %d thread(s) per proc.\n", omp_get_max_threads());
   #endif
 #ifdef USE_OPENCL
     fprintf(screen,"- with OpenCL Parameters for: %s (%d)\n",
@@ -803,7 +799,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans,
     if (times[5]>0)
       fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
     fprintf(screen,"Average split: %.4f.\n",avg_split);
-    fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
+    fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom);
     fprintf(screen,"Vector width: %d.\n", simd_size());
     fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
     if (nbor.gpu_nbor()==2)
@@ -1031,13 +1027,13 @@ Device global_device;
 using namespace LAMMPS_AL;
 
 int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                     const int first_gpu_id, const int gpu_mode,
-                    const double particle_split, const int nthreads,
-                    const int t_per_atom, const double user_cell_size,
-                    char *opencl_config, const int ocl_platform,
-                    char *device_type_flags, const int block_pair) {
+                    const double particle_split, const int t_per_atom,
+                    const double user_cell_size, char *opencl_config,
+                    const int ocl_platform, char *device_type_flags,
+                    const int block_pair) {
   return global_device.init_device(world,replica,ngpu,first_gpu_id,gpu_mode,
-                                   particle_split,nthreads,t_per_atom,
-                                   user_cell_size,opencl_config,ocl_platform,
+                                   particle_split,t_per_atom,user_cell_size,
+                                   opencl_config,ocl_platform,
                                    device_type_flags,block_pair);
 }
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index bd5b81558c..1db6ae3127 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -49,10 +49,10 @@ class Device {
    * - -11 if config_string has the wrong number of parameters **/
   int init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                   const int first_gpu_id, const int gpu_mode,
-                  const double particle_split, const int nthreads,
-                  const int t_per_atom, const double user_cell_size,
-                  char *config_string, const int ocl_platform,
-                  char *device_type_flags, const int block_pair);
+                  const double particle_split, const int t_per_atom,
+                  const double user_cell_size, char *config_string,
+                  const int ocl_platform, char *device_type_flags,
+                  const int block_pair);
 
   /// Initialize the device for Atom storage
   /** \param charge True if charges need to be stored
@@ -201,8 +201,6 @@ class Device {
   /// Return the number of procs sharing a device (size of device communicator)
   inline int procs_per_gpu() const { return _procs_per_gpu; }
-  /// Return the number of threads per proc
-  inline int num_threads() const { return _nthreads; }
   /// My rank within all processes
   inline int world_me() const { return _world_me; }
   /// Total number of processes
@@ -331,7 +329,7 @@ class Device {
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
       _replica_size;
-  int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
+  int _gpu_mode, _first_device, _last_device, _platform_id;
   double _particle_split;
   double _cpu_full;
   double _ptx_arch;
diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp
index efbaa6e1f8..8297c338a5 100644
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@@ -32,16 +32,18 @@
 #include "citeme.h"
 #include "error.h"
 
+#if (LAL_USE_OMP == 1)
+#include <omp.h>
+#endif
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};
 
-extern int lmp_init_device(MPI_Comm world, MPI_Comm replica,
-                           const int ngpu, const int first_gpu_id,
-                           const int gpu_mode, const double particle_split,
-                           const int nthreads, const int t_per_atom,
+extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
+                           const int first_gpu_id, const int gpu_mode,
+                           const double particle_split, const int t_per_atom,
                            const double cell_size, char *opencl_args,
                            const int ocl_platform, char *device_type_flags,
                            const int block_pair);
@@ -123,7
+125,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : _gpu_mode = GPU_NEIGH; _particle_split = 1.0; - int nthreads = 1; + int nthreads = 0; int newtonflag = 0; int threads_per_atom = -1; double binsize = 0.0; @@ -167,10 +169,10 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); threads_per_atom = utils::inumeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; - } else if (strcmp(arg[iarg],"nthreads") == 0) { + } else if (strcmp(arg[iarg],"omp") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); nthreads = utils::inumeric(FLERR,arg[iarg+1],false,lmp); - if (nthreads < 1) error->all(FLERR,"Illegal fix GPU command"); + if (nthreads < 0) error->all(FLERR,"Illegal fix GPU command"); iarg += 2; } else if (strcmp(arg[iarg],"platform") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); @@ -200,6 +202,11 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : #if (LAL_USE_OMP == 0) if (nthreads > 1) error->all(FLERR,"No OpenMP support compiled in"); + #else + if (nthreads > 0) { + omp_set_num_threads(nthreads); + comm->nthreads = nthreads; + } #endif // set newton pair flag @@ -227,9 +234,9 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (binsize == 0.0) binsize = -1.0; _binsize = binsize; int gpu_flag = lmp_init_device(universe->uworld, world, ngpu, first_gpu_id, - _gpu_mode, _particle_split, nthreads, - threads_per_atom, binsize, opencl_args, - ocl_platform, device_type_flags, block_pair); + _gpu_mode, _particle_split, threads_per_atom, + binsize, opencl_args, ocl_platform, + device_type_flags, block_pair); GPU_EXTRA::check_flag(gpu_flag,error,world); }
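Usage note (editor's sketch, not part of the patch): with these changes applied, the new *omp* keyword should be accepted alongside the other GPU package options, either in the input script or through the corresponding "-pk gpu" command-line switch. A hypothetical input fragment:

.. code-block:: LAMMPS

   # request 2 GPUs per node and 4 OpenMP threads per MPI task;
   # omp = 0 (the default) keeps the thread count chosen by the OpenMP
   # runtime, typically via the OMP_NUM_THREADS environment variable
   package gpu 2 omp 4 split -1.0

The same settings can be selected without editing the input script via "-sf gpu -pk gpu 2 omp 4" on the command line, in the same way the existing GPU package keywords are documented above.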