From c9c83405ab900fe1f5eaae7c320f7f402e149801 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 13:59:16 +0000 Subject: [PATCH 01/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12447 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/QEQ/fix_qeq.cpp | 2 +- src/QEQ/fix_qeq_dynamic.cpp | 6 ++---- src/QEQ/fix_qeq_point.cpp | 1 - src/QEQ/fix_qeq_slater.cpp | 11 ++--------- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/src/QEQ/fix_qeq.cpp b/src/QEQ/fix_qeq.cpp index b66d31b03d..970e7d7d47 100644 --- a/src/QEQ/fix_qeq.cpp +++ b/src/QEQ/fix_qeq.cpp @@ -504,7 +504,7 @@ int FixQEq::CG( double *b, double *x ) void FixQEq::sparse_matvec( sparse_matrix *A, double *x, double *b ) { int i, j, itr_j; - int nn, NN, ii; + int nn, NN; int *ilist; nn = atom->nlocal; diff --git a/src/QEQ/fix_qeq_dynamic.cpp b/src/QEQ/fix_qeq_dynamic.cpp index 4f7cb0d994..f483d3d32f 100644 --- a/src/QEQ/fix_qeq_dynamic.cpp +++ b/src/QEQ/fix_qeq_dynamic.cpp @@ -162,7 +162,7 @@ void FixQEqDynamic::pre_force(int vflag) if (iloop == maxiter) { char str[128]; sprintf(str,"Charges did not converge at step "BIGINT_FORMAT - ": %lg",enegchk,update->ntimestep); + ": %lg",update->ntimestep,enegchk); error->warning(FLERR,str); } } @@ -173,13 +173,11 @@ void FixQEqDynamic::pre_force(int vflag) double FixQEqDynamic::compute_eneg() { - int i, j, ii, jj, inum, jnum, itype, jtype, flag; + int i, j, ii, jj, inum, jnum, itype; int *ilist, *jlist, *numneigh, **firstneigh; double eneg, enegtot; double r, rsq, delr[3], rinv; - int nlocal = atom->nlocal; - int *tag = atom->tag; int *type = atom->type; int *mask = atom->mask; double *q = atom->q; diff --git a/src/QEQ/fix_qeq_point.cpp b/src/QEQ/fix_qeq_point.cpp index 9dae0655c5..1e4ce1d41f 100644 --- a/src/QEQ/fix_qeq_point.cpp +++ b/src/QEQ/fix_qeq_point.cpp @@ -122,7 +122,6 @@ void FixQEqPoint::compute_H() double **x, SMALL = 0.0001; double dx, dy, dz, r_sqr, r; - int *type = atom->type; tagint *tag = atom->tag; x = atom->x; int *mask = atom->mask; diff --git a/src/QEQ/fix_qeq_slater.cpp b/src/QEQ/fix_qeq_slater.cpp index d13669f2d1..88e4b00175 100644 --- a/src/QEQ/fix_qeq_slater.cpp +++ b/src/QEQ/fix_qeq_slater.cpp @@ -136,25 +136,20 @@ void FixQEqSlater::init_matvec() void FixQEqSlater::compute_H() { - int i, j, ii, jj, inum, jnum, itype, jtype, flag; + int i, j, ii, jj, inum, jnum, itype, jtype; int *ilist, *jlist, *numneigh, **firstneigh; double r, rsq, delr[3]; double zei, zej, zj, zjtmp; - double SMALL = 0.0001; - int *tag = atom->tag; int *type = atom->type; double **x = atom->x; - double *q = atom->q; inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; - int nlocal = atom->nlocal; - m_fill = 0; for (ii = 0; ii < inum; ii++) { @@ -207,7 +202,6 @@ double FixQEqSlater::calculate_H(double zei, double zej, double zj, double r, double &zjtmp) { double rinv = 1.0/r; - double rinv2 = rinv*rinv; double exp2zir = exp(-2.0*zei*r); double zei2 = zei*zei; @@ -261,7 +255,6 @@ double FixQEqSlater::calculate_H_wolf(double zei, double zej, double zj, double r, double &zjtmp) { double rinv = 1.0/r; - double rinv2 = rinv*rinv; double exp2zir = exp(-2.0*zei*r); double zei2 = zei*zei; @@ -398,7 +391,7 @@ int FixQEqSlater::CG( double *b, double *x ) void FixQEqSlater::sparse_matvec( sparse_matrix *A, double *x, double *b ) { int i, j, itr_j; - int nn, NN, ii; + int nn, NN; int *ilist; nn = atom->nlocal; From e898a2c91b339c0b99cd0ea9a95f6733054420dc Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 14:20:12 +0000 Subject: 
[PATCH 02/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12448 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/SNAP/pair_snap.cpp | 16 ++++++++-------- src/SNAP/sna.cpp | 11 ++++++----- src/SNAP/sna.h | 9 ++------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp index 1c56bba07c..5bad7ac6bb 100644 --- a/src/SNAP/pair_snap.cpp +++ b/src/SNAP/pair_snap.cpp @@ -101,8 +101,8 @@ PairSNAP::~PairSNAP() { if (nelements) { for (int i = 0; i < nelements; i++) - delete [] elements[i]; - delete elements; + delete[] elements[i]; + delete[] elements; memory->destroy(radelem); memory->destroy(wjelem); memory->destroy(coeffelem); @@ -1340,8 +1340,8 @@ void PairSNAP::coeff(int narg, char **arg) if (nelements) { for (int i = 0; i < nelements; i++) - delete [] elements[i]; - delete elements; + delete[] elements[i]; + delete[] elements; memory->destroy(radelem); memory->destroy(wjelem); memory->destroy(coeffelem); @@ -1486,7 +1486,7 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename) FILE *fpcoeff; if (comm->me == 0) { - fpcoeff = fopen(coefffilename,"r"); + fpcoeff = force->open_potential(coefffilename); if (fpcoeff == NULL) { char str[128]; sprintf(str,"Cannot open SNAP coefficient file %s",coefffilename); @@ -1514,7 +1514,7 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename) // strip comment, skip line if blank - if (ptr = strchr(line,'#')) *ptr = '\0'; + if ((ptr = strchr(line,'#'))) *ptr = '\0'; nwords = atom->count_words(line); } if (nwords != 2) @@ -1651,7 +1651,7 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename) FILE *fpparam; if (comm->me == 0) { - fpparam = fopen(paramfilename,"r"); + fpparam = force->open_potential(paramfilename); if (fpparam == NULL) { char str[128]; sprintf(str,"Cannot open SNAP parameter file %s",paramfilename); @@ -1675,7 +1675,7 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename) // strip comment, skip line if blank - if (ptr = strchr(line,'#')) *ptr = '\0'; + if ((ptr = strchr(line,'#'))) *ptr = '\0'; nwords = atom->count_words(line); if (nwords == 0) continue; diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp index 10e9dac4af..abcc6d6804 100644 --- a/src/SNAP/sna.cpp +++ b/src/SNAP/sna.cpp @@ -128,6 +128,7 @@ SNA::SNA(LAMMPS* lmp, double rfac0_in, wj = NULL; rcutij = NULL; nmax = 0; + idxj = NULL; timers = new double[20]; @@ -153,7 +154,7 @@ SNA::~SNA() memory->destroy(bvec); memory->destroy(dbvec); } - + delete[] idxj; } void SNA::build_indexlist() @@ -168,7 +169,7 @@ void SNA::build_indexlist() // indexList can be changed here - idxj = new SNA_LOOPINDICES_J[idxj_count]; + idxj = new SNA_LOOPINDICES[idxj_count]; idxj_max = idxj_count; idxj_count = 0; @@ -193,7 +194,7 @@ void SNA::build_indexlist() // indexList can be changed here - idxj = new SNA_LOOPINDICES_J[idxj_count]; + idxj = new SNA_LOOPINDICES[idxj_count]; idxj_max = idxj_count; idxj_count = 0; @@ -216,7 +217,7 @@ void SNA::build_indexlist() // indexList can be changed here - idxj = new SNA_LOOPINDICES_J[idxj_count]; + idxj = new SNA_LOOPINDICES[idxj_count]; idxj_max = idxj_count; idxj_count = 0; @@ -239,7 +240,7 @@ void SNA::build_indexlist() // indexList can be changed here - idxj = new SNA_LOOPINDICES_J[idxj_count]; + idxj = new SNA_LOOPINDICES[idxj_count]; idxj_max = idxj_count; idxj_count = 0; diff --git a/src/SNAP/sna.h b/src/SNAP/sna.h index 1c38bcb920..a8a09963d9 100644 --- a/src/SNAP/sna.h +++ b/src/SNAP/sna.h @@ -11,11 +11,8 @@ #include namespace LAMMPS_NS { 
-struct SNA_LOOPINDICES { - int j1, j2, j, ma, mb, ma1, ma2, mb1, mb2; -}; -struct SNA_LOOPINDICES_J { +struct SNA_LOOPINDICES { int j1, j2, j; }; @@ -77,9 +74,7 @@ private: double rmin0, rfac0; //use indexlist instead of loops, constructor generates these - SNA_LOOPINDICES* idx; - int idx_max; - SNA_LOOPINDICES_J* idxj; + SNA_LOOPINDICES* idxj; int idxj_max; // data for bispectrum coefficients From ac246386d0dd57b8f2f8d337435131ae5bb5ae52 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 15:01:58 +0000 Subject: [PATCH 03/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12449 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/balance.cpp | 24 ++++++++++++------------ src/fix_move.cpp | 14 +++++++++----- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/balance.cpp b/src/balance.cpp index 246edc2849..76789211d7 100644 --- a/src/balance.cpp +++ b/src/balance.cpp @@ -1003,24 +1003,24 @@ void Balance::dumpout(bigint tstep, FILE *fp) int m = 0; for (int i = 0; i < nprocs; i++) { domain->lamda_box_corners(&boxall[i][0],&boxall[i][3]); - fprintf(fp,"%d %d %g %g %g\n",m+1,1,bc[i][0],bc[i][1],0.0); - fprintf(fp,"%d %d %g %g %g\n",m+2,1,bc[i][3],bc[i][1],0.0); - fprintf(fp,"%d %d %g %g %g\n",m+3,1,bc[i][3],bc[i][4],0.0); - fprintf(fp,"%d %d %g %g %g\n",m+4,1,bc[i][0],bc[i][4],0.0); + fprintf(fp,"%d %d %g %g %g\n",m+1,1,bc[0][0],bc[0][1],0.0); + fprintf(fp,"%d %d %g %g %g\n",m+2,1,bc[1][0],bc[1][1],0.0); + fprintf(fp,"%d %d %g %g %g\n",m+3,1,bc[2][0],bc[2][1],0.0); + fprintf(fp,"%d %d %g %g %g\n",m+4,1,bc[3][0],bc[3][1],0.0); m += 4; } } else { int m = 0; for (int i = 0; i < nprocs; i++) { domain->lamda_box_corners(&boxall[i][0],&boxall[i][3]); - fprintf(fp,"%d %d %g %g %g\n",m+1,1,bc[i][0],bc[i][1],bc[i][2]); - fprintf(fp,"%d %d %g %g %g\n",m+2,1,bc[i][3],bc[i][1],bc[i][2]); - fprintf(fp,"%d %d %g %g %g\n",m+3,1,bc[i][3],bc[i][4],bc[i][2]); - fprintf(fp,"%d %d %g %g %g\n",m+4,1,bc[i][0],bc[i][4],bc[i][2]); - fprintf(fp,"%d %d %g %g %g\n",m+5,1,bc[i][0],bc[i][1],bc[i][5]); - fprintf(fp,"%d %d %g %g %g\n",m+6,1,bc[i][3],bc[i][1],bc[i][5]); - fprintf(fp,"%d %d %g %g %g\n",m+7,1,bc[i][3],bc[i][4],bc[i][5]); - fprintf(fp,"%d %d %g %g %g\n",m+8,1,bc[i][0],bc[i][4],bc[i][5]); + fprintf(fp,"%d %d %g %g %g\n",m+1,1,bc[0][0],bc[0][1],bc[0][1]); + fprintf(fp,"%d %d %g %g %g\n",m+2,1,bc[1][0],bc[1][1],bc[1][1]); + fprintf(fp,"%d %d %g %g %g\n",m+3,1,bc[2][0],bc[2][1],bc[2][1]); + fprintf(fp,"%d %d %g %g %g\n",m+4,1,bc[3][0],bc[3][1],bc[3][1]); + fprintf(fp,"%d %d %g %g %g\n",m+5,1,bc[4][0],bc[4][1],bc[4][1]); + fprintf(fp,"%d %d %g %g %g\n",m+6,1,bc[5][0],bc[5][1],bc[5][1]); + fprintf(fp,"%d %d %g %g %g\n",m+7,1,bc[6][0],bc[6][1],bc[6][1]); + fprintf(fp,"%d %d %g %g %g\n",m+8,1,bc[7][0],bc[7][1],bc[7][1]); m += 8; } } diff --git a/src/fix_move.cpp b/src/fix_move.cpp index acc6031246..88aa0e1849 100644 --- a/src/fix_move.cpp +++ b/src/fix_move.cpp @@ -242,11 +242,7 @@ FixMove::FixMove(LAMMPS *lmp, int narg, char **arg) : atom->add_callback(0); atom->add_callback(1); - maxatom = atom->nmax; - if (displaceflag) memory->create(displace,maxatom,3,"move:displace"); - else displace = NULL; - if (velocityflag) memory->create(velocity,maxatom,3,"move:velocity"); - else velocity = NULL; + displace = velocity = NULL; // xoriginal = initial unwrapped positions of atoms @@ -368,6 +364,14 @@ void FixMove::init() if (vzvarstr && vzvarstyle == ATOM) velocityflag = 1; } + maxatom = atom->nmax; + memory->destroy(displace); + memory->destroy(velocity); + if (displaceflag) 
memory->create(displace,maxatom,3,"move:displace"); + else displace = NULL; + if (velocityflag) memory->create(velocity,maxatom,3,"move:velocity"); + else velocity = NULL; + if (strstr(update->integrate_style,"respa")) nlevels_respa = ((Respa *) update->integrate)->nlevels; } From 95d3f975f59102605f6fc806105a1e78bc126b57 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 15:17:43 +0000 Subject: [PATCH 04/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12450 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_accelerate.html | 747 ++++++++++++++++++++---------------- doc/Section_accelerate.txt | 746 ++++++++++++++++++++--------------- doc/fix_qeq.html | 26 +- doc/fix_qeq.txt | 26 +- doc/package.html | 20 + doc/package.txt | 28 ++ 6 files changed, 945 insertions(+), 648 deletions(-) diff --git a/doc/Section_accelerate.html b/doc/Section_accelerate.html index ebbd6a21ef..8c14fe7560 100644 --- a/doc/Section_accelerate.html +++ b/doc/Section_accelerate.html @@ -145,12 +145,12 @@ such as when using a barostat.

Accelerated versions of various pair_style, fixes, computes, and other commands have been added to LAMMPS, which will typically run faster than the -standard non-accelerated versions, if you have the appropriate -hardware on your system. +standard non-accelerated versions. Some require appropriate hardware +on your system, e.g. GPUs or Intel Xeon Phi chips.

-

All of these commands are in packages. -Currently, there are 6 such accelerator packages in LAMMPS, either as -standard or user packages: +

All of these commands are in packages provided with LAMMPS, as +explained here. Currently, there are 6 such +accelerator packages in LAMMPS, either as standard or user packages:

@@ -177,20 +177,34 @@ Lennard-Jones pair_style lj/cut:
  • pair_style lj/cut/omp
  • pair_style lj/cut/opt -

    Assuming LAMMPS was built with the appropriate package, these styles -can be invoked by specifying them explicitly in your input script. Or -the -suffix command-line switch can be -used to automatically invoke the accelerated versions, without -changing the input script. Use of the suffix command -allows a suffix to be set explicitly and to be turned off and back on -at various points within an input script. +

    Assuming LAMMPS was built with the appropriate package, a simulation +using accelerated styles from the package can be run without modifying +your input script, by specifying command-line +switches. The details of how to do this +vary from package to package and are explained below. There is also a +suffix command and a package command that +accomplish the same thing and can be used within an input script if +preferred. The suffix command allows more precise +control of whether an accelerated or unaccelerated version of a style +is used at various points within an input script.

    To see what styles are currently available in each of the accelerated packages, see Section_commands 5 of the -manual. The doc page for each indvidual style (e.g. pair +manual. The doc page for individual commands (e.g. pair lj/cut or fix nve) also lists any accelerated variants available for that style.

    +

    The examples directory has several sub-directories with scripts and +README files for using the accelerator packages: +

    +
    • examples/cuda for USER-CUDA package +
    • examples/gpu for GPU package +
    • examples/intel for USER-INTEL package +
    • examples/kokkos for KOKKOS package +
    +

    Likewise, the bench directory has FERMI and KEPLER sub-directories +with scripts and README files for using all the accelerator packages. +

    Here is a brief summary of what the various packages provide. Details are in individual sections below.

    @@ -208,8 +222,8 @@ coprocessors. This can result in additional speedup over 2x depending on the hardware configuration.
  • Styles with a "kk" suffix are part of the KOKKOS package, and can be -run using OpenMP, on an NVIDIA GPU, or on an Intel(R) Xeon Phi(TM). -The speed-up depends on a variety of factors, as discussed below. +run using OpenMP, on an NVIDIA GPU, or on an Intel Xeon Phi. The +speed-up depends on a variety of factors, as discussed below.
  • Styles with an "omp" suffix are part of the USER-OMP package and allow a pair-style to be run in multi-threaded mode using OpenMP. This can @@ -226,7 +240,7 @@ CPU.

    • what hardware and software the accelerated package requires
    • how to build LAMMPS with the accelerated package -
    • how to run an input script with the accelerated package +
    • how to run with the accelerated package via either command-line switches or modifying the input script
    • speed-ups to expect
    • guidelines for best performance
    • restrictions @@ -249,7 +263,9 @@ due to if tests and other conditional code.
      • include the OPT package and build LAMMPS
      • use OPT pair styles in your input script
      -

      Details follow. +

      The last step can be done using the "-sf opt" command-line +switch. Or it can be done by adding a +suffix opt command to your input script.

      Required hardware/software:

      @@ -257,28 +273,30 @@ due to if tests and other conditional code.

      Building LAMMPS with the OPT package:

      -

      Include the package and build LAMMPS. +

      Include the package and build LAMMPS:

      -
      make yes-opt
      +
      cd lammps/src
      +make yes-opt
       make machine 
       
      -

      No additional compile/link flags are needed in your machine -Makefile in src/MAKE. +

      No additional compile/link flags are needed in your Makefile.machine +in src/MAKE.

      -

      Running with the OPT package: +

      Run with the OPT package from the command line:

      -

      You can explicitly add an "opt" suffix to the -pair_style command in your input script: -

      -
      pair_style lj/cut/opt 2.5 
      -
      -

      Or you can run with the -sf command-line -switch, which will automatically append -"opt" to styles that support it. +

      Use the "-sf opt" command-line switch, +which will automatically append "opt" to styles that support it.

      lmp_machine -sf opt -in in.script
       mpirun -np 4 lmp_machine -sf opt -in in.script 
       
      +

      Or run with the OPT package by editing an input script: +

      +

      Use the suffix opt command, or you can explicitly add an +"opt" suffix to individual styles in your input script, e.g. +

      +
      pair_style lj/cut/opt 2.5 
      +

      Speed-ups to expect:

      You should see a reduction in the "Pair time" value printed at the end @@ -305,13 +323,16 @@ uses the OpenMP interface for multi-threading.

      Here is a quick overview of how to use the USER-OMP package:

      -
      • specify the -fopenmp flag for compiling and linking in your machine Makefile +
        • use the -fopenmp flag for compiling and linking in your Makefile.machine
        • include the USER-OMP package and build LAMMPS -
        • specify how many threads per MPI task to run with via an environment variable or the package omp command -
        • enable the USER-OMP package via the "-sf omp" command-line switch, or the package omp commmand +
        • use the mpirun command to set the number of MPI tasks/node +
        • specify how many threads per MPI task to use
        • use USER-OMP styles in your input script
        -

        Details follow. +

        The latter two steps can be done using the "-pk omp" and "-sf omp" +command-line switches respectively. Or +either step can be done by adding the package omp or +suffix omp commands respectively to your input script.

        Required hardware/software:

        @@ -321,73 +342,65 @@ MPI task running on a CPU.

        Building LAMMPS with the USER-OMP package:

        -

        Include the package and build LAMMPS. +

        Include the package and build LAMMPS:

        cd lammps/src
         make yes-user-omp
         make machine 
         
        -

        Your lo-level src/MAKE/Makefile.machine needs a flag for OpenMP -support in both the CCFLAGS and LINKFLAGS variables. For GNU and -Intel compilers, this flag is -fopenmp. Without this flag the -USER-OMP styles will still be compiled and work, but will not support -multi-threading. +

        Your src/MAKE/Makefile.machine needs a flag for OpenMP support in both +the CCFLAGS and LINKFLAGS variables. For GNU and Intel compilers, +this flag is "-fopenmp". Without this flag the USER-OMP styles will +still be compiled and work, but will not support multi-threading.

        -

        Running with the USER-OMP package: +

        Run with the USER-OMP package from the command line:

        -

        There are 3 issues (a,b,c) to address: +

        The mpirun or mpiexec command sets the total number of MPI tasks used +by LAMMPS (one or multiple per compute node) and the number of MPI +tasks used per node. E.g. the mpirun command does this via its -np +and -ppn switches.

        -

        (a) Specify how many threads per MPI task to use +

        You need to choose how many threads per MPI task will be used by the +USER-OMP package. Note that the product of MPI tasks * threads/task +should not exceed the physical number of cores (on a node), otherwise +performance will suffer.

        -

        Note that the product of MPI tasks * threads/task should not exceed -the physical number of cores, otherwise performance will suffer. +

        Use the "-sf omp" command-line switch, +which will automatically append "omp" to styles that support it. Use +the "-pk omp Nt" command-line switch, to +set Nt = # of OpenMP threads per MPI task to use.

        -

        By default LAMMPS uses 1 thread per MPI task. If the environment -variable OMP_NUM_THREADS is set to a valid value, this value is used. -You can set this environment variable when you launch LAMMPS, e.g. -

        -
        env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script
        -env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
        -mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script 
        +
        lmp_machine -sf omp -pk omp 16 -in in.script                       # 1 MPI task on a 16-core node
        +mpirun -np 4 lmp_machine -sf omp -pk omp 4 -in in.script           # 4 MPI tasks each with 4 threads on a single 16-core node
        +mpirun -np 32 -ppn 4 lmp_machine -sf omp -pk omp 4 -in in.script   # ditto on 8 16-core nodes 
         
        -

        or you can set it permanently in your shell's start-up script. -All three of these examples use a total of 4 CPU cores. +

        Note that if the "-sf omp" switch is used, it also issues a default +package omp 0 command, which sets the number of threads +per MPI task via the OMP_NUM_THREADS environment variable.

        -

        Note that different MPI implementations have different ways of passing -the OMP_NUM_THREADS environment variable to all MPI processes. The -2nd line above is for MPICH; the 3rd line with -x is for OpenMPI. -Check your MPI documentation for additional details. +

        Using the "-pk" switch explicitly allows for direct setting of the +number of threads and additional options. Its syntax is the same as +the "package omp" command. See the package command doc +page for details, including the default values used for all its +options if it is not specified, and how to set the number of threads +via the OMP_NUM_THREADS environment variable if desired.

        -

        You can also set the number of threads per MPI task via the package -omp command, which will override any OMP_NUM_THREADS -setting. +

        Or run with the USER-OMP package by editing an input script:

        -

        (b) Enable the USER-OMP package +

        The discussion above for the mpirun/mpiexec command, MPI tasks/node, +and threads/MPI task is the same.

        -

        This can be done in one of two ways. Use a package omp -command near the top of your input script. +

        Use the suffix omp command, or you can explicitly add an +"omp" suffix to individual styles in your input script, e.g.

        -

        Or use the "-sf omp" command-line switch, -which will automatically invoke the command package omp -*. -

        -

        (c) Use OMP-accelerated styles -

        -

        This can be done by explicitly adding an "omp" suffix to any supported -style in your input script: -

        -
        pair_style lj/cut/omp 2.5
        -fix nve/omp 
        +
        pair_style lj/cut/omp 2.5 
         
        -

        Or you can run with the "-sf omp" command-line -switch, which will automatically append -"omp" to styles that support it. -

        -
        lmp_machine -sf omp -in in.script
        -mpirun -np 4 lmp_machine -sf omp -in in.script 
        -
        -

        Using the "suffix omp" command in your input script does the same -thing. +

        You must also use the package omp command to enable the +USER-OMP package, unless the "-sf omp" or "-pk omp" command-line +switches were used. It specifies how many +threads per MPI task to use, as well as other options. Its doc page +explains how to set the number of threads via an environment variable +if desired.
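A minimal sketch of that input-script route, assuming a USER-OMP build; the thread count of 4 and the lj/cut style are arbitrary example values:

package omp 4          # 4 OpenMP threads per MPI task
suffix omp             # use omp-accelerated variants of subsequent styles
pair_style lj/cut 2.5  # resolves to lj/cut/omp because of the suffix command

With these lines near the top of the script, the remaining style names can stay unsuffixed.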

        Speed-ups to expect:

        @@ -462,7 +475,7 @@ and thus reducing the work done by the long-range solver. Using the with the USER-OMP package, is an alternative way to reduce the number of MPI tasks assigned to the KSpace calculation.
      -

      Other performance tips are as follows: +

      Additional performance tips are as follows:

      • The best parallel efficiency from omp styles is typically achieved when there is at least one MPI task per physical processor, @@ -491,14 +504,14 @@ versions of many pair styles, including the 3-body Stillinger-Weber pair style, and for kspace_style pppm for long-range Coulombics. It has the following general features:

        -
        • The package is designed to exploit common GPU hardware configurations -where one or more GPUs are coupled to many cores of one or more -multi-core CPUs, e.g. within a node of a parallel machine. +
          • It is designed to exploit common GPU hardware configurations where one +or more GPUs are coupled to many cores of one or more multi-core CPUs, +e.g. within a node of a parallel machine.
          • Atom-based data (e.g. coordinates, forces) moves back-and-forth between the CPU(s) and GPU every timestep. -
          • Neighbor lists can be constructed on the CPU or on the GPU +
          • Neighbor lists can be built on the CPU or on the GPU
          • The charge assignement and force interpolation portions of PPPM can be run on the GPU. The FFT portion, which requires MPI communication @@ -520,16 +533,16 @@ hardware.

          Here is a quick overview of how to use the GPU package:

          -
          • build the library in lib/gpu for your GPU hardware (CUDA_ARCH) with desired precision (CUDA_PREC) +
• build the library in lib/gpu for your GPU hardware with desired precision
            • include the GPU package and build LAMMPS -
            • decide how many MPI tasks per GPU to run with, i.e. set MPI tasks/node via mpirun -
            • specify how many GPUs per node to use (default = 1) via the package gpu command -
            • enable the GPU package via the "-sf gpu" command-line switch, or the package gpu commmand -
            • use the newton command to turn off Newton's law for pairwise interactions -
            • use the package gpu command to enable neighbor list building on the GPU if desired -
            • use GPU pair styles and kspace styles in your input script +
            • use the mpirun command to set the number of MPI tasks/node which determines the number of MPI tasks/GPU +
            • specify the # of GPUs per node +
            • use GPU styles in your input script
            -

            Details follow. +

            The latter two steps can be done using the "-pk gpu" and "-sf gpu" +command-line switches respectively. Or +either step can be done by adding the package gpu or +suffix gpu commands respectively to your input script.

            Required hardware/software:

            @@ -544,7 +557,7 @@ install the NVIDIA Cuda software on your system:

            Building LAMMPS with the GPU package:

            This requires two steps (a,b): build the GPU library, then build -LAMMPS. +LAMMPS with the GPU package.

            (a) Build the GPU library

            @@ -560,9 +573,9 @@ attention to 3 settings in this makefile. for different GPU choices, e.g. Fermi vs Kepler. It also lists the possible precision settings:

            -
            CUDA_PREC = -D_SINGLE_SINGLE  # Single precision for all calculations
            -CUDA_PREC = -D_DOUBLE_DOUBLE  # Double precision for all calculations
            -CUDA_PREC = -D_SINGLE_DOUBLE  # Accumulation of forces, etc, in double 
            +
            CUDA_PREC = -D_SINGLE_SINGLE  # single precision for all calculations
            +CUDA_PREC = -D_DOUBLE_DOUBLE  # double precision for all calculations
            +CUDA_PREC = -D_SINGLE_DOUBLE  # accumulation of forces, etc, in double 
             

            The last setting is the mixed mode referred to above. Note that your GPU must support double precision to use either the 2nd or 3rd of @@ -584,74 +597,74 @@ own Makefile.lammps.machine if needed. re-build the entire library. Do a "clean" first, e.g. "make -f Makefile.linux clean", followed by the make command above.

            -

            (b) Build LAMMPS +

            (b) Build LAMMPS with the GPU package

            cd lammps/src
             make yes-gpu
             make machine 
             
            -

            Note that if you change the GPU library precision (discussed above), -you also need to re-install the GPU package and re-build LAMMPS, so -that all affected files are re-compiled and linked to the new GPU -library. +

            No additional compile/link flags are needed in your Makefile.machine +in src/MAKE.

            -

            Running with the GPU package: +

            Note that if you change the GPU library precision (discussed above) +and rebuild the GPU library, then you also need to re-install the GPU +package and re-build LAMMPS, so that all affected files are +re-compiled and linked to the new GPU library.

            -

            The examples/gpu and bench/GPU directories have scripts that can be -run with the GPU package, as well as detailed instructions on how to -run them. +

            Run with the GPU package from the command line:

            -

            To run with the GPU package, there are 3 basic issues (a,b,c) to -address: +

            The mpirun or mpiexec command sets the total number of MPI tasks used +by LAMMPS (one or multiple per compute node) and the number of MPI +tasks used per node. E.g. the mpirun command does this via its -np +and -ppn switches.

            -

            (a) Use one or more MPI tasks per GPU +

When using the GPU package, you cannot assign more than one GPU to a +single MPI task. However multiple MPI tasks can share the same GPU, +and in many cases it will be more efficient to run this way. Likewise +it may be more efficient to use fewer MPI tasks/node than the available +# of CPU cores. Assignment of multiple MPI tasks to a GPU will happen +automatically if you create more MPI tasks/node than there are +GPUs/node. E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be +shared by 4 MPI tasks.

            -

            The total number of MPI tasks used by LAMMPS (one or multiple per -compute node) is set in the usual manner via the mpirun or mpiexec -commands, and is independent of the GPU package. +

            Use the "-sf gpu" command-line switch, +which will automatically append "gpu" to styles that support it. Use +the "-pk gpu Ng" command-line switch to +set Ng = # of GPUs/node to use.

            -

            When using the GPU package, you cannot assign more than one physical -GPU to a single MPI task. However multiple MPI tasks can share the -same GPU, and in many cases it will be more efficient to run this way. +

            lmp_machine -sf gpu -pk gpu 1 -in in.script                         # 1 MPI task uses 1 GPU
            +mpirun -np 12 lmp_machine -sf gpu -pk gpu 2 -in in.script           # 12 MPI tasks share 2 GPUs on a single 16-core (or whatever) node
            +mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script   # ditto on 4 16-core nodes 
            +
            +

            Note that if the "-sf gpu" switch is used, it also issues a default +package gpu 1 command, which sets the number of +GPUs/node to use to 1.

            -

            The default is to have all MPI tasks on a compute node use a single -GPU. To use multiple GPUs per node, be sure to create one or more MPI -tasks per GPU, and use the first/last settings in the package -gpu command to include all the GPU IDs on the node. -E.g. first = 0, last = 1, for 2 GPUs. On a node with 8 CPU cores -and 2 GPUs, this would specify that each GPU is shared by 4 MPI tasks. +

            Using the "-pk" switch explicitly allows for direct setting of the +number of GPUs/node to use and additional options. Its syntax is the +same as same as the "package gpu" command. See the +package command doc page for details, including the +default values used for all its options if it is not specified.

            -

            (b) Enable the GPU package +

            Or run with the GPU package by editing an input script:

            -

            This can be done in one of two ways. Use a package gpu -command near the top of your input script. +

            The discussion above for the mpirun/mpiexec command, MPI tasks/node, +and use of multiple MPI tasks/GPU is the same.

            -

            Or use the "-sf gpu" command-line switch, -which will automatically invoke the command package gpu force/neigh 0 -0 1. Note that this specifies use of a single GPU (per -node), so you must specify the package command in your input script -explicitly if you want to use multiple GPUs per node. -

            -

            (c) Use GPU-accelerated styles -

            -

            This can be done by explicitly adding a "gpu" suffix to any supported -style in your input script: +

Use the suffix gpu command, or you can explicitly add a +"gpu" suffix to individual styles in your input script, e.g.

            pair_style lj/cut/gpu 2.5 
             
            -

            Or you can run with the "-sf gpu" command-line -switch, which will automatically append -"gpu" to styles that support it. +

            You must also use the package gpu command to enable the +GPU package, unless the "-sf gpu" or "-pk gpu" command-line +switches were used. It specifies the +number of GPUs/node to use, as well as other options.

            -
            lmp_machine -sf gpu -in in.script
            -mpirun -np 4 lmp_machine -sf gpu -in in.script 
            -
            -

            Using the "suffix gpu" command in your input script does the same -thing. -

            -

            IMPORTANT NOTE: The input script must also use the -newton command with a pairwise setting of off, -since on is the default. +

            IMPORTANT NOTE: The input script must also use a newton pairwise +setting of off in order to use GPU package pair styles. This can be +set via the package gpu or newton +commands.
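A minimal sketch of the corresponding input-script lines, assuming a GPU build; the single GPU per node and the lj/cut style are example choices taken from the text above:

package gpu 1              # use 1 GPU per node
newton off                 # turn Newton's law off for pairwise (and bonded) interactions
pair_style lj/cut/gpu 2.5  # or use the suffix gpu command and leave styles unsuffixed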

            Speed-ups to expect:

            @@ -745,18 +758,22 @@ single CPU (core), assigned to each GPU.

          Here is a quick overview of how to use the USER-CUDA package:

          -
          • build the library in lib/cuda for your GPU hardware (arch with desired precision (precision) +
            • build the library in lib/cuda for your GPU hardware with desired precision
            • include the USER-CUDA package and build LAMMPS
            • use the mpirun command to specify 1 MPI task per GPU (on each node) -
            • specify how many GPUs per node to use (default = 1) via the package cuda command
            • enable the USER-CUDA package via the "-c on" command-line switch +
            • specify the # of GPUs per node
            • use USER-CUDA styles in your input script
            -

            Details follow. +

            The latter two steps can be done using the "-pk cuda" and "-sf cuda" +command-line switches respectively. Or +either step can be done by adding the package cuda or +suffix cuda commands respectively to your input script.

            Required hardware/software:

            -

            To use this package, you need to have one or more NVIDIA GPUs and install the NVIDIA Cuda software on your system: +

            To use this package, you need to have one or more NVIDIA GPUs and +install the NVIDIA Cuda software on your system:

            Your NVIDIA GPU needs to support Compute Capability 1.3. This list may help you to find out the Compute Capability of your card: @@ -771,7 +788,7 @@ projects can be compiled without problems.

            Building LAMMPS with the USER-CUDA package:

            This requires two steps (a,b): build the USER-CUDA library, then build -LAMMPS. +LAMMPS with the USER-CUDA package.

            (a) Build the USER-CUDA library

            @@ -816,58 +833,68 @@ the library is built. to re-build the entire library. Do a "make clean" first, followed by "make".

            -

            (b) Build LAMMPS +

            (b) Build LAMMPS with the USER-CUDA package

            cd lammps/src
             make yes-user-cuda
             make machine 
             
            +

            No additional compile/link flags are needed in your Makefile.machine +in src/MAKE. +

            Note that if you change the USER-CUDA library precision (discussed -above), you also need to re-install the USER-CUDA package and re-build -LAMMPS, so that all affected files are re-compiled and linked to the -new USER-CUDA library. +above) and rebuild the USER-CUDA library, then you also need to +re-install the USER-CUDA package and re-build LAMMPS, so that all +affected files are re-compiled and linked to the new USER-CUDA +library.

            -

            Running with the USER-CUDA package: +

            Run with the USER-CUDA package from the command line:

            -

            The bench/CUDA directories has scripts that can be run with the -USER-CUDA package, as well as detailed instructions on how to run -them. +

            The mpirun or mpiexec command sets the total number of MPI tasks used +by LAMMPS (one or multiple per compute node) and the number of MPI +tasks used per node. E.g. the mpirun command does this via its -np +and -ppn switches.

            -

            To run with the USER-CUDA package, there are 3 basic issues (a,b,c) to -address: +

            When using the USER-CUDA package, you must use exactly one MPI task +per physical GPU.

            -

            (a) Use one MPI task per GPU +

            You must use the "-c on" command-line +switch to enable the USER-CUDA package. +This also issues a default package cuda 2 command which +sets the number of GPUs/node to use to 2.

            -

            This is a requirement of the USER-CUDA package, i.e. you cannot -use multiple MPI tasks per physical GPU. So if you are running -on nodes with 1 or 2 GPUs, use the mpirun or mpiexec command -to specify 1 or 2 MPI tasks per node. +

            Use the "-sf cuda" command-line switch, +which will automatically append "cuda" to styles that support it. Use +the "-pk cuda Ng" command-line switch to +set Ng = # of GPUs per node.

            -

            If the nodes have more than 1 GPU, you must use the package -cuda command near the top of your input script to -specify that more than 1 GPU will be used (the default = 1). +

            lmp_machine -c on -sf cuda -pk cuda 1 -in in.script                       # 1 MPI task uses 1 GPU
            +mpirun -np 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script          # 2 MPI tasks use 2 GPUs on a single 16-core (or whatever) node
            +mpirun -np 24 -ppn 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script  # ditto on 12 16-core nodes 
            +
            +

            Using the "-pk" switch explicitly allows for direct setting of the +number of GPUs/node to use and additional options. Its syntax is the +same as same as the "package cuda" command. See the +package command doc page for details, including the +default values used for all its options if it is not specified.

            -

            (b) Enable the USER-CUDA package +

            Or run with the USER-CUDA package by editing an input script:

            -

            The "-c on" or "-cuda on" command-line -switch must be used when launching LAMMPS. +

            The discussion above for the mpirun/mpiexec command and the requirement +of one MPI task per GPU is the same.

            -

            (c) Use USER-CUDA-accelerated styles +

            You must still use the "-c on" command-line +switch to enable the USER-CUDA package. +This also issues a default package cuda 2 command which +sets the number of GPUs/node to use to 2.

            -

            This can be done by explicitly adding a "cuda" suffix to any supported -style in your input script: +

            Use the suffix cuda command, or you can explicitly add a +"cuda" suffix to individual styles in your input script, e.g.

            pair_style lj/cut/cuda 2.5 
             
            -

            Or you can run with the "-sf cuda" command-line -switch, which will automatically append -"cuda" to styles that support it. -

            -
            lmp_machine -sf cuda -in in.script
            -mpirun -np 4 lmp_machine -sf cuda -in in.script 
            -
            -

            Using the "suffix cuda" command in your input script does the same -thing. +

            You only need to use the package cuda command if you +wish to change the number of GPUs/node to use or its other options.
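A minimal sketch of such an override, assuming a USER-CUDA build launched with the "-c on" switch; the value of 1 GPU per node is an arbitrary example:

package cuda 1   # 1 GPU per node instead of the default of 2
suffix cuda      # or spell styles out explicitly, e.g. pair_style lj/cut/cuda 2.5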

            Speed-ups to expect:

            @@ -944,11 +971,26 @@ neighbor list builds, time integration, etc) can be parallelized for one or the other of the two modes. The first mode is called the "host" and is one or more threads running on one or more physical CPUs (within the node). Currently, both multi-core CPUs and an Intel Phi -processor (running in native mode) are supported. The second mode is -called the "device" and is an accelerator chip of some kind. -Currently only an NVIDIA GPU is supported. If your compute node does -not have a GPU, then there is only one mode of execution, i.e. the -host and device are the same. +processor (running in native mode, not offload mode like the +USER-INTEL package) are supported. The second mode is called the +"device" and is an accelerator chip of some kind. Currently only an +NVIDIA GPU is supported. If your compute node does not have a GPU, +then there is only one mode of execution, i.e. the host and device are +the same. +

            +

            Here is a quick overview of how to use the KOKKOS package +for GPU acceleration: +

            +
            • specify variables and settings in your Makefile.machine that enable GPU, Phi, or OpenMP support +
            • include the KOKKOS package and build LAMMPS +
            • enable the KOKKOS package and its hardware options via the "-k on" command-line switch +
            • use KOKKOS styles in your input script +
            +

            The latter two steps can be done using the "-k on", "-pk kokkos" and +"-sf kk" command-line switches +respectively. Or either the steps can be done by adding the package +kokkod or suffix kk commands respectively +to your input script.

            Required hardware/software:

            @@ -960,7 +1002,8 @@ LAMMPS on the following kinds of hardware configurations:
          • Phi: on one or more Intel Phi coprocessors (per node)
          • GPU: on the GPUs of a node with additional OpenMP threading on the CPUs
          -

          Intel Xeon Phi coprocessors are supported in "native" mode only. +

          Intel Xeon Phi coprocessors are supported in "native" mode only, not +"offload" mode.

          Only NVIDIA GPUs are currently supported.

          @@ -1013,7 +1056,7 @@ e.g. g++ in the first two examples above, then you *must* perform a to force all the KOKKOS-dependent files to be re-compiled with the new options.

          -

          You can also hardwire these variables in the specified machine +

          You can also hardwire these make variables in the specified machine makefile, e.g. src/MAKE/Makefile.g++ in the first two examples above, with a line like:

          @@ -1043,79 +1086,111 @@ or in the machine makefile in the src/MAKE directory. See command-line +switch to enable the KOKKOS package. It +takes additional arguments for hardware settings appropriate to your +system. Those arguments are documented +here. The two commonly used ones are as +follows: +

          +
          -k on t Nt
          +-k on g Ng 
          +
          +

          The "t Nt" option applies to host=OMP (even if device=CUDA) and +host=MIC. For host=OMP, it specifies how many OpenMP threads per MPI +task to use with a node. For host=MIC, it specifies how many Xeon Phi +threads per MPI task to use within a node. The default is Nt = 1. +Note that for host=OMP this is effectively MPI-only mode which may be +fine. But for host=MIC this may run 240 MPI tasks on the coprocessor, +which could give very poor perforamnce. +

          +

          The "g Ng" option applies to device=CUDA. It specifies how many GPUs +per compute node to use. The default is 1, so this only needs to be +specified is you have 2 or more GPUs per compute node. +

          +


          +

          The "-k on" switch also issues a default package kk neigh full +comm/exchange host comm/forward host command which sets +some KOKKOS options to default values, discussed on the +package command doc page. +

          +

          Use the "-sf kk" command-line switch, +which will automatically append "kokkos" to styles that support it. +Use the "-pk kokkos" command-line switch +if you wish to override any of the default values set by the package +kokkos command invoked by the "-k on" switch. +

          +

          host=OMP, dual hex-core nodes (12 threads/node):

          mpirun -np 12 lmp_g++ -in in.lj      # MPI-only mode with no Kokkos
           mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj      # MPI-only mode with Kokkos
           mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj     # one MPI task, 12 threads
           mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj      # two MPI tasks, 6 threads/task 
           
          -

          Intel Phi with 61 cores (240 total usable cores, with 4x hardware threading): +

          host=MIC, Intel Phi with 61 cores (240 threads/phi via 4x hardware threading):

          mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj      # 12*20 = 240
           mpirun -np 15 lmp_g++ -k on t 16 -sf kk -in in.lj
           mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj
           mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj 
           
          -

          Dual hex-core CPUs and a single GPU: +

          host=OMP, device=CUDA, node = dual hex-core CPUs and a single GPU:

          mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj       # one MPI task, 6 threads on CPU 
           
          +

          host=OMP, device=CUDA, node = dual 8-core CPUs and 2 GPUs: +

          Dual 8-core CPUs and 2 GPUs:

          mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj   # two MPI tasks, 8 threads per CPU 
           
          -

          (b) Enable the KOKKOS package +

          Or run with the KOKKOS package by editing an input script:

          -

          As illustrated above, the "-k on" or "-kokkos on" command-line -switch must be used when launching LAMMPS. +

          The discussion above for the mpirun/mpiexec command and setting

          -

          As documented here, the command-line -swithc allows for several options. Commonly used ones, as illustrated -above, are: +

          of one MPI task per GPU is the same.

          -
          • -k on t Nt : specifies how many threads per MPI task to use within a -compute node. For good performance, the product of MPI tasks * -threads/task should not exceed the number of physical cores on a CPU -or Intel Phi (including hardware threading, e.g. 240). - -
          • -k on g Ng : specifies how many GPUs per compute node are available. -The default is 1, so this should be specified is you have 2 or more -GPUs per compute node. -
          -

          (c) Use KOKKOS-accelerated styles +

          You must still use the "-c on" command-line +switch to enable the USER-CUDA package. +This also issues a default package cuda 2 command which +sets the number of GPUs/node to use to 2.

          -

          This can be done by explicitly adding a "kk" suffix to any supported -style in your input script: +

Use the suffix kk command, or you can explicitly add a +"kk" suffix to individual styles in your input script, e.g.

          -
          pair_style lj/cut/kk 2.5 
          +
pair_style lj/cut/kk 2.5 
           
          -

          Or you can run with the "-sf kk" command-line -switch, which will automatically append -"kk" to styles that support it. -

          -
          lmp_machine -sf kk -in in.script
          -mpirun -np 4 lmp_machine -sf kk -in in.script 
          -
          -

          Using the "suffix kk" command in your input script does the same -thing. +

You only need to use the package kokkos command if you +wish to change any of its option defaults.
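A minimal sketch of this input-script route, assuming a KOKKOS build launched with the "-k on" switch; lj/cut is an arbitrary example style:

suffix kk              # use kk-accelerated variants of subsequent styles
pair_style lj/cut 2.5  # resolves to lj/cut/kk because of the suffix command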

          Speed-ups to expect:

          @@ -1276,11 +1351,12 @@ change in the future.

          The USER-INTEL package was developed by Mike Brown at Intel Corporation. It provides a capability to accelerate simulations by offloading neighbor list and non-bonded force calculations to Intel(R) -Xeon Phi(TM) coprocessors. Additionally, it supports running -simulations in single, mixed, or double precision with vectorization, -even if a coprocessor is not present, i.e. on an Intel(R) CPU. The -same C++ code is used for both cases. When offloading to a -coprocessor, the routine is run twice, once with an offload flag. +Xeon Phi(TM) coprocessors (not native mode like the KOKKOS package). +Additionally, it supports running simulations in single, mixed, or +double precision with vectorization, even if a coprocessor is not +present, i.e. on an Intel(R) CPU. The same C++ code is used for both +cases. When offloading to a coprocessor, the routine is run twice, +once with an offload flag.

          The USER-INTEL package can be used in tandem with the USER-OMP package. This is useful when offloading pair style computations to @@ -1302,20 +1378,26 @@ package is available.

          Here is a quick overview of how to use the USER-INTEL package for CPU acceleration:

          -
          • specify these CCFLAGS in your machine Makefile: -fopenmp, -DLAMMPS_MEMALIGN=64, and -restrict, -xHost -
          • specify -fopenmp with LINKFLAGS in your machine Makefile -
          • include the USER-INTEL package and (optionally) USER-OMP package and build LAMMP -
          • if also using the USER-OMP package, specify how many threads per MPI task to run with via an environment variable or the package omp command +
• specify these CCFLAGS in your Makefile.machine: -fopenmp, -DLAMMPS_MEMALIGN=64, -restrict, and -xHost
            • specify -fopenmp with LINKFLAGS in your Makefile.machine +
            • include the USER-INTEL package and (optionally) USER-OMP package and build LAMMPS +
            • if using the USER-OMP package, specify how many threads per MPI task to use
            • use USER-INTEL styles in your input script
            -

            Running with the USER-INTEL package to offload to the Intel(R) Xeon Phi(TM) -is the same except for these additional steps: +

            Using the USER-INTEL package to offload work to the Intel(R) +Xeon Phi(TM) coprocessor is the same except for these additional +steps:

            -
            • add the flag -DLMP_INTEL_OFFLOAD to CCFLAGS in your Machine makefile -
            • add the flag -offload to the LINKFLAGS in your Machine makefile -
            • the package intel command can be used to adjust threads per coprocessor +
              • add the flag -DLMP_INTEL_OFFLOAD to CCFLAGS in your Makefile.machine +
              • add the flag -offload to LINKFLAGS in your Makefile.machine +
              • specify how many threads per coprocessor to use
              -

              Details follow. +

The latter two steps in the first case and the last step in the +coprocessor case can be done using the "-pk omp" and "-sf intel" and +"-pk intel" command-line switches +respectively. Or any of the 3 steps can be done by adding the +package omp or suffix intel or package +intel commands respectively to your input script.

              Required hardware/software:

              @@ -1331,7 +1413,7 @@ compiler must support the OpenMP interface.

              Building LAMMPS with the USER-INTEL package:

              -

              Include the package and build LAMMPS. +

              Include the package(s) and build LAMMPS:

              cd lammps/src
               make yes-user-intel
              @@ -1364,77 +1446,98 @@ has support for offload to coprocessors; the former does not.
               issues that are being addressed. If using Intel(R) MPI, version 5 or
               higher is recommended.
               

              -

              Running with the USER-INTEL package: +

              Running with the USER-INTEL package from the command line:

              -

              The examples/intel directory has scripts that can be run with the -USER-INTEL package, as well as detailed instructions on how to run -them. +

              The mpirun or mpiexec command sets the total number of MPI tasks used +by LAMMPS (one or multiple per compute node) and the number of MPI +tasks used per node. E.g. the mpirun command does this via its -np +and -ppn switches.

              -

              Note that the total number of MPI tasks used by LAMMPS (one or -multiple per compute node) is set in the usual manner via the mpirun -or mpiexec commands, and is independent of the USER-INTEL package. -

              -

              To run with the USER-INTEL package, there are 3 basic issues (a,b,c) -to address: -

              -

              (a) Specify how many threads per MPI task to use on the CPU. -

              -

              Whether using the USER-INTEL package to offload computations to -Intel(R) Xeon Phi(TM) coprocessors or not, work performed on the CPU -can be multi-threaded via the USER-OMP package, assuming the USER-OMP -package was also installed when LAMMPS was built. -

              -

              In this case, the instructions above for the USER-OMP package, in its -"Running with the USER-OMP package" sub-section apply here as well. -

              -

              You can specify the number of threads per MPI task via the -OMP_NUM_THREADS environment variable or the package omp -command. The product of MPI tasks * threads/task should not exceed -the physical number of cores on the CPU (per node), otherwise +

              If LAMMPS was also built with the USER-OMP package, you need to choose +how many OpenMP threads per MPI task will be used by the USER-OMP +package. Note that the product of MPI tasks * OpenMP threads/task +should not exceed the physical number of cores (on a node), otherwise performance will suffer.

              -

              Note that the threads per MPI task setting is completely independent -of the number of threads used on the coprocessor. Only the package -intel command can be used to control thread counts on -the coprocessor. +

If LAMMPS was built with coprocessor support for the USER-INTEL +package, you need to specify the number of coprocessors/node and the +number of threads to use on the coprocessor per MPI task. Note that +coprocessor threads (which run on the coprocessor) are totally +independent from OpenMP threads (which run on the CPU). The product +of MPI tasks * coprocessor threads/task should not exceed the maximum +number of threads the coprocessor is designed to run, otherwise +performance will suffer. This value is 240 for current generation +Xeon Phi(TM) chips, which is 60 physical cores * 4 threads/core. The +threads/core value can be set to a smaller value if desired by an +option on the package intel command, in which case the +maximum number of threads is also reduced.

              -

              (b) Enable the USER-INTEL package +

              Use the "-sf intel" command-line switch, +which will automatically append "intel" to styles that support it. If +a style does not support it, a "omp" suffix is tried next. Use the +"-pk omp Nt" command-line switch, to set +Nt = # of OpenMP threads per MPI task to use. Use the "-pk intel Nt +Nphi" command-line switch to set Nphi = # +of Xeon Phi(TM) coprocessors/node.

              -

              This can be done in one of two ways. Use a package intel -command near the top of your input script. +

              CPU-only without USER-OMP (but using Intel vectorization on CPU):
              +lmp_machine -sf intel -in in.script                 # 1 MPI task
              +mpirun -np 32 lmp_machine -sf intel -in in.script   # 32 MPI tasks on as many nodes as needed (e.g. 2 16-core nodes) 
              +
              +
              CPU-only with USER-OMP (and Intel vectorization on CPU):
              +lmp_machine -sf intel -pk intel 16 0 -in in.script                # 1 MPI task on a 16-core node
              +mpirun -np 4 lmp_machine -sf intel -pk intel 4 0 -in in.script    # 4 MPI tasks each with 4 threads on a single 16-core node
              +mpirun -np 32 lmp_machine -sf intel -pk intel 4 0 -in in.script   # ditto on 8 16-core nodes 
              +
              +
+CPUs + Xeon Phi(TM) coprocessors with USER-OMP:
              +lmp_machine -sf intel -pk intel 16 1 -in in.script                                  # 1 MPI task, 240 threads on 1 coprocessor
              +mpirun -np 4 lmp_machine -sf intel -pk intel 4 1 tptask 60 -in in.script            # 4 MPI tasks each with 4 OpenMP threads on a single 16-core node, 
              +                                                                                    # each MPI task uses 60 threads on 1 coprocessor
              +mpirun -np 32 -ppn 4 lmp_machine -sf intel -pk intel 4 2 tptask 120 -in in.script   # ditto on 8 16-core nodes for MPI tasks and OpenMP threads, 
              +                                                                                    # each MPI task uses 120 threads on one of 2 coprocessors 
              +
              +

              Note that if the "-sf intel" switch is used, it also issues two +default commands: package omp 0 and package intel +1 command. These set the number of OpenMP threads per +MPI task via the OMP_NUM_THREADS environment variable, and the number +of Xeon Phi(TM) coprocessors/node to 1. The latter is ignored is +LAMMPS was not built with coprocessor support.

              -

              Or use the "-sf intel" command-line -switch, which will automatically invoke -the command "package intel * mixed balance -1 offload_cards 1 -offload_tpc 4 offload_threads 240". Note that this specifies mixed -precision and use of a single Xeon Phi(TM) coprocessor (per node), so -you must specify the package command in your input script explicitly -if you want a different precision or to use multiple Phi coprocessor -per node. Also note that the balance and offload keywords are ignored -if you did not build LAMMPS with offload support for a coprocessor, as -descibed above. +

              Using the "-pk omp" switch explicitly allows for direct setting of the +number of OpenMP threads per MPI task, and additional options. Using +the "-pk intel" switch explicitly allows for direct setting of the +number of coprocessors/node, and additional options. The syntax for +these two switches is the same as the package omp and +package intel commands. See the package +command doc page for details, including the default values used for +all its options if these switches are not specified, and how to set +the number of OpenMP threads via the OMP_NUM_THREADS environment +variable if desired.

              -

-(c) Use USER-INTEL-accelerated styles

+Or run with the USER-INTEL package by editing an input script:

              -

-This can be done by explicitly adding an "intel" suffix to any
-supported style in your input script:

+The discussion above for the mpirun/mpiexec command, MPI tasks/node,
+OpenMP threads per MPI task, and coprocessor threads per MPI task is
+the same.

              +

+Use the suffix intel command, or you can explicitly add an
+"intel" suffix to individual styles in your input script, e.g.

              pair_style lj/cut/intel 2.5 
               
              -

              Or you can run with the "-sf intel" command-line -switch, which will automatically append -"intel" to styles that support it. +

+You must also use the package omp command to enable the
+USER-OMP package (assuming LAMMPS was built with USER-OMP) unless the "-sf
+intel" or "-pk omp" command-line switches
+were used. It specifies how many OpenMP threads per MPI task to use,
+as well as other options. Its doc page explains how to set the number
+of threads via an environment variable if desired.

              -
-lmp_machine -sf intel -in in.script
              -mpirun -np 4 lmp_machine -sf intel -in in.script 
              -
              -

              Using the "suffix intel" command in your input script does the same -thing. -

              -

              IMPORTANT NOTE: Using an "intel" suffix in any of the above modes, -actually invokes two suffixes, "intel" and "omp". "Intel" is tried -first, and if the style does not support it, "omp" is tried next. If -neither is supported, the default non-suffix style is used. +

+You must also use the package intel command to enable
+coprocessor support within the USER-INTEL package (assuming LAMMPS was
+built with coprocessor support) unless the "-sf intel" or "-pk intel"
+command-line switches were used. It
+specifies how many coprocessors/node to use, as well as other
+coprocessor options.
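A minimal input-script sketch of this by-script route (the numeric
settings are placeholders; see the package command doc page for the
full keyword list):

package omp 4            # 4 OpenMP threads per MPI task (USER-OMP)
package intel 1          # use 1 Xeon Phi(TM) coprocessor per node
suffix intel             # append "intel" to styles that support it
pair_style lj/cut 2.5    # runs as lj/cut/intel via the suffix setting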

              Speed-ups to expect:

@@ -1472,8 +1575,8 @@
threads to use per core can be accomplished with keyword settings of
the package intel command.
• If desired, only a fraction of the pair style computation can be
-offloaded to the coprocessors. This is accomplished by setting a
-balance fraction in the package intel command. A
+offloaded to the coprocessors. This is accomplished by using the
+balance keyword in the package intel command (see the
+sketch after this list). A
balance of 0 runs all calculations on the CPU. A balance of 1 runs
all calculations on the coprocessor. A balance of 0.5 runs half of
the calculations on the coprocessor. Setting the balance to -1 (the

@@ -1487,10 +1590,6 @@
performance to use fewer MPI tasks and OpenMP threads than available
cores. This is due to the fact that additional threads are generated
internally to handle the asynchronous offload tasks.
-• If you have multiple coprocessors on each compute node, the
-offload_cards keyword can be specified with the package
-intel command.
-
• If running short benchmark runs with dynamic load balancing, adding
a short warm-up run (10-20 steps) will allow the load-balancer to find
a near-optimal setting that will carry over to additional runs.

@@ -1509,13 +1608,13 @@
dihedral, improper calculations, computation and data transfer to the
coprocessor will run concurrently with computations and MPI
communications for these calculations on the host CPU. The USER-INTEL
package has two modes for deciding which atoms will be handled by the
-coprocessor. This choice is controlled with the "offload_ghost"
-keyword of the package intel command. When set to 0,
-ghost atoms (atoms at the borders between MPI tasks) are not offloaded
-to the card. This allows for overlap of MPI communication of forces
-with computation on the coprocessor when the newton
-setting is "on". The default is dependent on the style being used,
-however, better performance may be achieved by setting this option
+coprocessor. This choice is controlled with the ghost keyword of
+the package intel command. When set to 0, ghost atoms
+(atoms at the borders between MPI tasks) are not offloaded to the
+card. This allows for overlap of MPI communication of forces with
+computation on the coprocessor when the newton setting
+is "on". The default is dependent on the style being used, however,
+better performance may be achieved by setting this option
explicitly.
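A minimal sketch of the balance keyword mentioned in the list above
(assuming it is given as a keyword/value pair on the package intel
command; check the package command doc page for the exact syntax):

package intel 1 balance 0.5   # offload roughly half of the pair computation to 1 coprocessor
package intel 1 balance -1    # default: let the dynamic load balancer choose the split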

            Restrictions: diff --git a/doc/Section_accelerate.txt b/doc/Section_accelerate.txt index b0b0e5fbdb..bf956b88e2 100644 --- a/doc/Section_accelerate.txt +++ b/doc/Section_accelerate.txt @@ -141,12 +141,12 @@ such as when using a barostat. Accelerated versions of various "pair_style"_pair_style.html, "fixes"_fix.html, "computes"_compute.html, and other commands have been added to LAMMPS, which will typically run faster than the -standard non-accelerated versions, if you have the appropriate -hardware on your system. +standard non-accelerated versions. Some require appropriate hardware +on your system, e.g. GPUs or Intel Xeon Phi chips. -All of these commands are in "packages"_Section_packages.html. -Currently, there are 6 such accelerator packages in LAMMPS, either as -standard or user packages: +All of these commands are in packages provided with LAMMPS, as +explained "here"_Section_packages.html. Currently, there are 6 such +accelerator packages in LAMMPS, either as standard or user packages: "USER-CUDA"_#acc_7 : for NVIDIA GPUs "GPU"_acc_6 : for NVIDIA GPUs as well as OpenCL support @@ -171,20 +171,34 @@ Lennard-Jones "pair_style lj/cut"_pair_lj.html: "pair_style lj/cut/omp"_pair_lj.html "pair_style lj/cut/opt"_pair_lj.html :ul -Assuming LAMMPS was built with the appropriate package, these styles -can be invoked by specifying them explicitly in your input script. Or -the "-suffix command-line switch"_Section_start.html#start_7 can be -used to automatically invoke the accelerated versions, without -changing the input script. Use of the "suffix"_suffix.html command -allows a suffix to be set explicitly and to be turned off and back on -at various points within an input script. +Assuming LAMMPS was built with the appropriate package, a simulation +using accelerated styles from the package can be run without modifying +your input script, by specifying "command-line +switches"_Section_start.html#start_7. The details of how to do this +vary from package to package and are explained below. There is also a +"suffix"_suffix.html command and a "package"_package.html command that +accomplish the same thing and can be used within an input script if +preferred. The "suffix"_suffix.html command allows more precise +control of whether an accelerated or unaccelerated version of a style +is used at various points within an input script. To see what styles are currently available in each of the accelerated packages, see "Section_commands 5"_Section_commands.html#cmd_5 of the -manual. The doc page for each indvidual style (e.g. "pair +manual. The doc page for individual commands (e.g. "pair lj/cut"_pair_lj.html or "fix nve"_fix_nve.html) also lists any accelerated variants available for that style. +The examples directory has several sub-directories with scripts and +README files for using the accelerator packages: + +examples/cuda for USER-CUDA package +examples/gpu for GPU package +examples/intel for USER-INTEL package +examples/kokkos for KOKKOS package :ul + +Likewise, the bench directory has FERMI and KEPLER sub-directories +with scripts and README files for using all the accelerator packages. + Here is a brief summary of what the various packages provide. Details are in individual sections below. @@ -202,8 +216,8 @@ coprocessors. This can result in additional speedup over 2x depending on the hardware configuration. :l Styles with a "kk" suffix are part of the KOKKOS package, and can be -run using OpenMP, on an NVIDIA GPU, or on an Intel(R) Xeon Phi(TM). 
-The speed-up depends on a variety of factors, as discussed below. :l +run using OpenMP, on an NVIDIA GPU, or on an Intel Xeon Phi. The +speed-up depends on a variety of factors, as discussed below. :l Styles with an "omp" suffix are part of the USER-OMP package and allow a pair-style to be run in multi-threaded mode using OpenMP. This can @@ -220,7 +234,7 @@ The following sections explain: what hardware and software the accelerated package requires how to build LAMMPS with the accelerated package -how to run an input script with the accelerated package +how to run with the accelerated package via either command-line switches or modifying the input script speed-ups to expect guidelines for best performance restrictions :ul @@ -243,7 +257,9 @@ Here is a quick overview of how to use the OPT package: include the OPT package and build LAMMPS use OPT pair styles in your input script :ul -Details follow. +The last step can be done using the "-sf opt" "command-line +switch"_Section_start.html#start_7. Or it can be done by adding a +"suffix opt"_suffix.html command to your input script. [Required hardware/software:] @@ -251,28 +267,30 @@ None. [Building LAMMPS with the OPT package:] -Include the package and build LAMMPS. +Include the package and build LAMMPS: +cd lammps/src make yes-opt make machine :pre -No additional compile/link flags are needed in your machine -Makefile in src/MAKE. +No additional compile/link flags are needed in your Makefile.machine +in src/MAKE. -[Running with the OPT package:] +[Run with the OPT package from the command line:] -You can explicitly add an "opt" suffix to the -"pair_style"_pair_style.html command in your input script: - -pair_style lj/cut/opt 2.5 :pre - -Or you can run with the -sf "command-line -switch"_Section_start.html#start_7, which will automatically append -"opt" to styles that support it. +Use the "-sf opt" "command-line switch"_Section_start.html#start_7, +which will automatically append "opt" to styles that support it. lmp_machine -sf opt -in in.script mpirun -np 4 lmp_machine -sf opt -in in.script :pre +[Or run with the OPT package by editing an input script:] + +Use the "suffix opt"_suffix.html command, or you can explicitly add an +"opt" suffix to individual styles in your input script, e.g. + +pair_style lj/cut/opt 2.5 :pre + [Speed-ups to expect:] You should see a reduction in the "Pair time" value printed at the end @@ -299,13 +317,16 @@ uses the OpenMP interface for multi-threading. Here is a quick overview of how to use the USER-OMP package: -specify the -fopenmp flag for compiling and linking in your machine Makefile +use the -fopenmp flag for compiling and linking in your Makefile.machine include the USER-OMP package and build LAMMPS -specify how many threads per MPI task to run with via an environment variable or the package omp command -enable the USER-OMP package via the "-sf omp" command-line switch, or the package omp commmand +use the mpirun command to set the number of MPI tasks/node +specify how many threads per MPI task to use use USER-OMP styles in your input script :ul -Details follow. +The latter two steps can be done using the "-pk omp" and "-sf omp" +"command-line switches"_Section_start.html#start_7 respectively. Or +either step can be done by adding the "package omp"_package.html or +"suffix omp"_suffix.html commands respectively to your input script. [Required hardware/software:] @@ -315,73 +336,65 @@ MPI task running on a CPU. [Building LAMMPS with the USER-OMP package:] -Include the package and build LAMMPS. 
+Include the package and build LAMMPS: cd lammps/src make yes-user-omp make machine :pre -Your lo-level src/MAKE/Makefile.machine needs a flag for OpenMP -support in both the CCFLAGS and LINKFLAGS variables. For GNU and -Intel compilers, this flag is {-fopenmp}. Without this flag the -USER-OMP styles will still be compiled and work, but will not support -multi-threading. +Your src/MAKE/Makefile.machine needs a flag for OpenMP support in both +the CCFLAGS and LINKFLAGS variables. For GNU and Intel compilers, +this flag is "-fopenmp". Without this flag the USER-OMP styles will +still be compiled and work, but will not support multi-threading. -[Running with the USER-OMP package:] +[Run with the USER-OMP package from the command line:] -There are 3 issues (a,b,c) to address: +The mpirun or mpiexec command sets the total number of MPI tasks used +by LAMMPS (one or multiple per compute node) and the number of MPI +tasks used per node. E.g. the mpirun command does this via its -np +and -ppn switches. -(a) Specify how many threads per MPI task to use +You need to choose how many threads per MPI task will be used by the +USER-OMP package. Note that the product of MPI tasks * threads/task +should not exceed the physical number of cores (on a node), otherwise +performance will suffer. -Note that the product of MPI tasks * threads/task should not exceed -the physical number of cores, otherwise performance will suffer. +Use the "-sf omp" "command-line switch"_Section_start.html#start_7, +which will automatically append "omp" to styles that support it. Use +the "-pk omp Nt" "command-line switch"_Section_start.html#start_7, to +set Nt = # of OpenMP threads per MPI task to use. -By default LAMMPS uses 1 thread per MPI task. If the environment -variable OMP_NUM_THREADS is set to a valid value, this value is used. -You can set this environment variable when you launch LAMMPS, e.g. +lmp_machine -sf omp -pk omp 16 -in in.script # 1 MPI task on a 16-core node +mpirun -np 4 lmp_machine -sf omp -pk omp 4 -in in.script # 4 MPI tasks each with 4 threads on a single 16-core node +mpirun -np 32 -ppn 4 lmp_machine -sf omp -pk omp 4 -in in.script # ditto on 8 16-core nodes :pre -env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script -env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script -mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script :pre +Note that if the "-sf omp" switch is used, it also issues a default +"package omp 0"_package.html command, which sets the number of threads +per MPI task via the OMP_NUM_THREADS environment variable. -or you can set it permanently in your shell's start-up script. -All three of these examples use a total of 4 CPU cores. +Using the "-pk" switch explicitly allows for direct setting of the +number of threads and additional options. Its syntax is the same as +the "package omp" command. See the "package"_package.html command doc +page for details, including the default values used for all its +options if it is not specified, and how to set the number of threads +via the OMP_NUM_THREADS environment variable if desired. -Note that different MPI implementations have different ways of passing -the OMP_NUM_THREADS environment variable to all MPI processes. The -2nd line above is for MPICH; the 3rd line with -x is for OpenMPI. -Check your MPI documentation for additional details. 
+[Or run with the USER-OMP package by editing an input script:] -You can also set the number of threads per MPI task via the "package -omp"_package.html command, which will override any OMP_NUM_THREADS -setting. +The discussion above for the mpirun/mpiexec command, MPI tasks/node, +and threads/MPI task is the same. -(b) Enable the USER-OMP package +Use the "suffix omp"_suffix.html command, or you can explicitly add an +"omp" suffix to individual styles in your input script, e.g. -This can be done in one of two ways. Use a "package omp"_package.html -command near the top of your input script. +pair_style lj/cut/omp 2.5 :pre -Or use the "-sf omp" "command-line switch"_Section_start.html#start_7, -which will automatically invoke the command "package omp -*"_package.html. - -(c) Use OMP-accelerated styles - -This can be done by explicitly adding an "omp" suffix to any supported -style in your input script: - -pair_style lj/cut/omp 2.5 -fix nve/omp :pre - -Or you can run with the "-sf omp" "command-line -switch"_Section_start.html#start_7, which will automatically append -"omp" to styles that support it. - -lmp_machine -sf omp -in in.script -mpirun -np 4 lmp_machine -sf omp -in in.script :pre - -Using the "suffix omp" command in your input script does the same -thing. +You must also use the "package omp"_package.html command to enable the +USER-OMP package, unless the "-sf omp" or "-pk omp" "command-line +switches"_Section_start.html#start_7 were used. It specifies how many +threads per MPI task to use, as well as other options. Its doc page +explains how to set the number of threads via an environment variable +if desired. [Speed-ups to expect:] @@ -456,7 +469,7 @@ and thus reducing the work done by the long-range solver. Using the with the USER-OMP package, is an alternative way to reduce the number of MPI tasks assigned to the KSpace calculation. :l,ule -Other performance tips are as follows: +Additional performance tips are as follows: The best parallel efficiency from {omp} styles is typically achieved when there is at least one MPI task per physical processor, @@ -485,14 +498,14 @@ versions of many pair styles, including the 3-body Stillinger-Weber pair style, and for "kspace_style pppm"_kspace_style.html for long-range Coulombics. It has the following general features: -The package is designed to exploit common GPU hardware configurations -where one or more GPUs are coupled to many cores of one or more -multi-core CPUs, e.g. within a node of a parallel machine. :ulb,l +It is designed to exploit common GPU hardware configurations where one +or more GPUs are coupled to many cores of one or more multi-core CPUs, +e.g. within a node of a parallel machine. :ulb,l Atom-based data (e.g. coordinates, forces) moves back-and-forth between the CPU(s) and GPU every timestep. :l -Neighbor lists can be constructed on the CPU or on the GPU :l +Neighbor lists can be built on the CPU or on the GPU :l The charge assignement and force interpolation portions of PPPM can be run on the GPU. The FFT portion, which requires MPI communication @@ -514,16 +527,16 @@ hardware. :l,ule Here is a quick overview of how to use the GPU package: -build the library in lib/gpu for your GPU hardware (CUDA_ARCH) with desired precision (CUDA_PREC) +build the library in lib/gpu for your GPU hardware wity desired precision include the GPU package and build LAMMPS -decide how many MPI tasks per GPU to run with, i.e. 
set MPI tasks/node via mpirun -specify how many GPUs per node to use (default = 1) via the package gpu command -enable the GPU package via the "-sf gpu" command-line switch, or the package gpu commmand -use the newton command to turn off Newton's law for pairwise interactions -use the package gpu command to enable neighbor list building on the GPU if desired -use GPU pair styles and kspace styles in your input script :ul +use the mpirun command to set the number of MPI tasks/node which determines the number of MPI tasks/GPU +specify the # of GPUs per node +use GPU styles in your input script :ul -Details follow. +The latter two steps can be done using the "-pk gpu" and "-sf gpu" +"command-line switches"_Section_start.html#start_7 respectively. Or +either step can be done by adding the "package gpu"_package.html or +"suffix gpu"_suffix.html commands respectively to your input script. [Required hardware/software:] @@ -538,7 +551,7 @@ Run lammps/lib/gpu/nvc_get_devices (after building the GPU library, see below) t [Building LAMMPS with the GPU package:] This requires two steps (a,b): build the GPU library, then build -LAMMPS. +LAMMPS with the GPU package. (a) Build the GPU library @@ -554,9 +567,9 @@ See lib/gpu/Makefile.linux.double for examples of the ARCH settings for different GPU choices, e.g. Fermi vs Kepler. It also lists the possible precision settings: -CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations -CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations -CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc, in double :pre +CUDA_PREC = -D_SINGLE_SINGLE # single precision for all calculations +CUDA_PREC = -D_DOUBLE_DOUBLE # double precision for all calculations +CUDA_PREC = -D_SINGLE_DOUBLE # accumulation of forces, etc, in double :pre The last setting is the mixed mode referred to above. Note that your GPU must support double precision to use either the 2nd or 3rd of @@ -578,74 +591,74 @@ Note that to change the precision of the GPU library, you need to re-build the entire library. Do a "clean" first, e.g. "make -f Makefile.linux clean", followed by the make command above. -(b) Build LAMMPS +(b) Build LAMMPS with the GPU package cd lammps/src make yes-gpu make machine :pre -Note that if you change the GPU library precision (discussed above), -you also need to re-install the GPU package and re-build LAMMPS, so -that all affected files are re-compiled and linked to the new GPU -library. +No additional compile/link flags are needed in your Makefile.machine +in src/MAKE. -[Running with the GPU package:] +Note that if you change the GPU library precision (discussed above) +and rebuild the GPU library, then you also need to re-install the GPU +package and re-build LAMMPS, so that all affected files are +re-compiled and linked to the new GPU library. -The examples/gpu and bench/GPU directories have scripts that can be -run with the GPU package, as well as detailed instructions on how to -run them. +[Run with the GPU package from the command line:] -To run with the GPU package, there are 3 basic issues (a,b,c) to -address: +The mpirun or mpiexec command sets the total number of MPI tasks used +by LAMMPS (one or multiple per compute node) and the number of MPI +tasks used per node. E.g. the mpirun command does this via its -np +and -ppn switches. -(a) Use one or more MPI tasks per GPU +When using the GPU package, you cannot assign more than one GPU to a +single MPI task. 
However multiple MPI tasks can share the same GPU, +and in many cases it will be more efficient to run this way. Likewise +it may be more efficient to use less MPI tasks/node than the available +# of CPU cores. Assignment of multiple MPI tasks to a GPU will happen +automatically if you create more MPI tasks/node than there are +GPUs/mode. E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be +shared by 4 MPI tasks. -The total number of MPI tasks used by LAMMPS (one or multiple per -compute node) is set in the usual manner via the mpirun or mpiexec -commands, and is independent of the GPU package. +Use the "-sf gpu" "command-line switch"_Section_start.html#start_7, +which will automatically append "gpu" to styles that support it. Use +the "-pk gpu Ng" "command-line switch"_Section_start.html#start_7 to +set Ng = # of GPUs/node to use. -When using the GPU package, you cannot assign more than one physical -GPU to a single MPI task. However multiple MPI tasks can share the -same GPU, and in many cases it will be more efficient to run this way. +lmp_machine -sf gpu -pk gpu 1 -in in.script # 1 MPI task uses 1 GPU +mpirun -np 12 lmp_machine -sf gpu -pk gpu 2 -in in.script # 12 MPI tasks share 2 GPUs on a single 16-core (or whatever) node +mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script # ditto on 4 16-core nodes :pre -The default is to have all MPI tasks on a compute node use a single -GPU. To use multiple GPUs per node, be sure to create one or more MPI -tasks per GPU, and use the first/last settings in the "package -gpu"_package.html command to include all the GPU IDs on the node. -E.g. first = 0, last = 1, for 2 GPUs. On a node with 8 CPU cores -and 2 GPUs, this would specify that each GPU is shared by 4 MPI tasks. +Note that if the "-sf gpu" switch is used, it also issues a default +"package gpu 1"_package.html command, which sets the number of +GPUs/node to use to 1. -(b) Enable the GPU package +Using the "-pk" switch explicitly allows for direct setting of the +number of GPUs/node to use and additional options. Its syntax is the +same as same as the "package gpu" command. See the +"package"_package.html command doc page for details, including the +default values used for all its options if it is not specified. -This can be done in one of two ways. Use a "package gpu"_package.html -command near the top of your input script. +[Or run with the GPU package by editing an input script:] -Or use the "-sf gpu" "command-line switch"_Section_start.html#start_7, -which will automatically invoke the command "package gpu force/neigh 0 -0 1"_package.html. Note that this specifies use of a single GPU (per -node), so you must specify the package command in your input script -explicitly if you want to use multiple GPUs per node. +The discussion above for the mpirun/mpiexec command, MPI tasks/node, +and use of multiple MPI tasks/GPU is the same. -(c) Use GPU-accelerated styles - -This can be done by explicitly adding a "gpu" suffix to any supported -style in your input script: +Use the "suffix gpu"_suffix.html command, or you can explicitly add an +"gpu" suffix to individual styles in your input script, e.g. pair_style lj/cut/gpu 2.5 :pre -Or you can run with the "-sf gpu" "command-line -switch"_Section_start.html#start_7, which will automatically append -"gpu" to styles that support it. +You must also use the "package gpu"_package.html command to enable the +GPU package, unless the "-sf gpu" or "-pk gpu" "command-line +switches"_Section_start.html#start_7 were used. 
It specifies the +number of GPUs/node to use, as well as other options. -lmp_machine -sf gpu -in in.script -mpirun -np 4 lmp_machine -sf gpu -in in.script :pre - -Using the "suffix gpu" command in your input script does the same -thing. - -IMPORTANT NOTE: The input script must also use the -"newton"_newton.html command with a pairwise setting of {off}, -since {on} is the default. +IMPORTANT NOTE: The input script must also use a newton pairwise +setting of {off} in order to use GPU package pair styles. This can be +set via the "package gpu"_package.html or "newton"_newton.html +commands. [Speed-ups to expect:] @@ -739,18 +752,22 @@ single CPU (core), assigned to each GPU. :l,ule Here is a quick overview of how to use the USER-CUDA package: -build the library in lib/cuda for your GPU hardware (arch with desired precision (precision) +build the library in lib/cuda for your GPU hardware with desired precision include the USER-CUDA package and build LAMMPS use the mpirun command to specify 1 MPI task per GPU (on each node) -specify how many GPUs per node to use (default = 1) via the package cuda command enable the USER-CUDA package via the "-c on" command-line switch +specify the # of GPUs per node use USER-CUDA styles in your input script :ul -Details follow. +The latter two steps can be done using the "-pk cuda" and "-sf cuda" +"command-line switches"_Section_start.html#start_7 respectively. Or +either step can be done by adding the "package cuda"_package.html or +"suffix cuda"_suffix.html commands respectively to your input script. [Required hardware/software:] -To use this package, you need to have one or more NVIDIA GPUs and install the NVIDIA Cuda software on your system: +To use this package, you need to have one or more NVIDIA GPUs and +install the NVIDIA Cuda software on your system: Your NVIDIA GPU needs to support Compute Capability 1.3. This list may help you to find out the Compute Capability of your card: @@ -765,7 +782,7 @@ projects can be compiled without problems. [Building LAMMPS with the USER-CUDA package:] This requires two steps (a,b): build the USER-CUDA library, then build -LAMMPS. +LAMMPS with the USER-CUDA package. (a) Build the USER-CUDA library @@ -810,58 +827,68 @@ Note that if you change any of the options (like precision), you need to re-build the entire library. Do a "make clean" first, followed by "make". -(b) Build LAMMPS +(b) Build LAMMPS with the USER-CUDA package cd lammps/src make yes-user-cuda make machine :pre +No additional compile/link flags are needed in your Makefile.machine +in src/MAKE. + Note that if you change the USER-CUDA library precision (discussed -above), you also need to re-install the USER-CUDA package and re-build -LAMMPS, so that all affected files are re-compiled and linked to the -new USER-CUDA library. +above) and rebuild the USER-CUDA library, then you also need to +re-install the USER-CUDA package and re-build LAMMPS, so that all +affected files are re-compiled and linked to the new USER-CUDA +library. -[Running with the USER-CUDA package:] +[Run with the USER-CUDA package from the command line:] -The bench/CUDA directories has scripts that can be run with the -USER-CUDA package, as well as detailed instructions on how to run -them. +The mpirun or mpiexec command sets the total number of MPI tasks used +by LAMMPS (one or multiple per compute node) and the number of MPI +tasks used per node. E.g. the mpirun command does this via its -np +and -ppn switches. 
-To run with the USER-CUDA package, there are 3 basic issues (a,b,c) to -address: +When using the USER-CUDA package, you must use exactly one MPI task +per physical GPU. -(a) Use one MPI task per GPU +You must use the "-c on" "command-line +switch"_Section_start.html#start_7 to enable the USER-CUDA package. +This also issues a default "package cuda 2"_package.html command which +sets the number of GPUs/node to use to 2. -This is a requirement of the USER-CUDA package, i.e. you cannot -use multiple MPI tasks per physical GPU. So if you are running -on nodes with 1 or 2 GPUs, use the mpirun or mpiexec command -to specify 1 or 2 MPI tasks per node. +Use the "-sf cuda" "command-line switch"_Section_start.html#start_7, +which will automatically append "cuda" to styles that support it. Use +the "-pk cuda Ng" "command-line switch"_Section_start.html#start_7 to +set Ng = # of GPUs per node. -If the nodes have more than 1 GPU, you must use the "package -cuda"_package.html command near the top of your input script to -specify that more than 1 GPU will be used (the default = 1). +lmp_machine -c on -sf cuda -pk cuda 1 -in in.script # 1 MPI task uses 1 GPU +mpirun -np 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script # 2 MPI tasks use 2 GPUs on a single 16-core (or whatever) node +mpirun -np 24 -ppn 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script # ditto on 12 16-core nodes :pre -(b) Enable the USER-CUDA package +Using the "-pk" switch explicitly allows for direct setting of the +number of GPUs/node to use and additional options. Its syntax is the +same as same as the "package cuda" command. See the +"package"_package.html command doc page for details, including the +default values used for all its options if it is not specified. -The "-c on" or "-cuda on" "command-line -switch"_Section_start.html#start_7 must be used when launching LAMMPS. +[Or run with the USER-CUDA package by editing an input script:] -(c) Use USER-CUDA-accelerated styles +The discussion above for the mpirun/mpiexec command and the requirement +of one MPI task per GPU is the same. -This can be done by explicitly adding a "cuda" suffix to any supported -style in your input script: +You must still use the "-c on" "command-line +switch"_Section_start.html#start_7 to enable the USER-CUDA package. +This also issues a default "package cuda 2"_pacakge.html command which +sets the number of GPUs/node to use to 2. + +Use the "suffix cuda"_suffix.html command, or you can explicitly add a +"cuda" suffix to individual styles in your input script, e.g. pair_style lj/cut/cuda 2.5 :pre -Or you can run with the "-sf cuda" "command-line -switch"_Section_start.html#start_7, which will automatically append -"cuda" to styles that support it. - -lmp_machine -sf cuda -in in.script -mpirun -np 4 lmp_machine -sf cuda -in in.script :pre - -Using the "suffix cuda" command in your input script does the same -thing. +You only need to use the "package cuda"_package.html command if you +wish to change the number of GPUs/node to use or its other options. [Speed-ups to expect:] @@ -938,11 +965,26 @@ neighbor list builds, time integration, etc) can be parallelized for one or the other of the two modes. The first mode is called the "host" and is one or more threads running on one or more physical CPUs (within the node). Currently, both multi-core CPUs and an Intel Phi -processor (running in native mode) are supported. The second mode is -called the "device" and is an accelerator chip of some kind. -Currently only an NVIDIA GPU is supported. 
If your compute node does -not have a GPU, then there is only one mode of execution, i.e. the -host and device are the same. +processor (running in native mode, not offload mode like the +USER-INTEL package) are supported. The second mode is called the +"device" and is an accelerator chip of some kind. Currently only an +NVIDIA GPU is supported. If your compute node does not have a GPU, +then there is only one mode of execution, i.e. the host and device are +the same. + +Here is a quick overview of how to use the KOKKOS package +for GPU acceleration: + +specify variables and settings in your Makefile.machine that enable GPU, Phi, or OpenMP support +include the KOKKOS package and build LAMMPS +enable the KOKKOS package and its hardware options via the "-k on" command-line switch +use KOKKOS styles in your input script :ul + +The latter two steps can be done using the "-k on", "-pk kokkos" and +"-sf kk" "command-line switches"_Section_start.html#start_7 +respectively. Or either the steps can be done by adding the "package +kokkod"_package.html or "suffix kk"_suffix.html commands respectively +to your input script. [Required hardware/software:] @@ -954,7 +996,8 @@ CPU-only: one or a few MPI tasks per node with additional threading via OpenMP Phi: on one or more Intel Phi coprocessors (per node) GPU: on the GPUs of a node with additional OpenMP threading on the CPUs :ul -Intel Xeon Phi coprocessors are supported in "native" mode only. +Intel Xeon Phi coprocessors are supported in "native" mode only, not +"offload" mode. Only NVIDIA GPUs are currently supported. @@ -1007,7 +1050,7 @@ e.g. g++ in the first two examples above, then you *must* perform a to force all the KOKKOS-dependent files to be re-compiled with the new options. -You can also hardwire these variables in the specified machine +You can also hardwire these make variables in the specified machine makefile, e.g. src/MAKE/Makefile.g++ in the first two examples above, with a line like: @@ -1037,79 +1080,122 @@ IMPORTANT NOTE: Currently, there are no precision options with the KOKKOS package. All compilation and computation is performed in double precision. -[Running with the KOKKOS package:] +[Run with the KOKKOS package from the command line:] -The examples/kokkos and bench/KOKKOS directories have scripts that can -be run with the KOKKOS package, as well as detailed instructions on -how to run them. +The mpirun or mpiexec command sets the total number of MPI tasks used +by LAMMPS (one or multiple per compute node) and the number of MPI +tasks used per node. E.g. the mpirun command does this via its -np +and -ppn switches. -There are 3 issues (a,b,c) to address: +When using KOKKOS built with host=OMP, you need to choose how many +OpenMP threads per MPI task will be used. Note that the product of +MPI tasks * OpenMP threads/task should not exceed the physical number +of cores (on a node), otherwise performance will suffer. -(a) Launching LAMMPS in different KOKKOS modes +When using the KOKKOS package built with device=CUDA, you must use +exactly one MPI task per physical GPU. -Here are examples of how to run LAMMPS for the different compute-node -configurations listed above. +When using the KOKKOS package built with host=MIC for Intel Xeon Phi +coprocessor support you need to insure there is one or more MPI tasks +per coprocessor and choose the number of threads to use on a +coproessor per MPI task. 
The product of MPI tasks * coprocessor +threads/task should not exceed the maximum number of threads the +coproprocessor is designed to run, otherwise performance will suffer. +This value is 240 for current generation Xeon Phi(TM) chips, which is +60 physical cores * 4 threads/core. -Note that the -np setting for the mpirun command in these examples is -for runs on a single node. To scale these examples up to run on a -system with N compute nodes, simply multiply the -np setting by N. +NOTE: does not matter how many Phi per node, only concenred +with MPI tasks -CPU-only, dual hex-core CPUs: + + +You must use the "-k on" "command-line +switch"_Section_start.html#start_7 to enable the KOKKOS package. It +takes additional arguments for hardware settings appropriate to your +system. Those arguments are documented +"here"_Section_start.html#start_7. The two commonly used ones are as +follows: + +-k on t Nt +-k on g Ng :pre + +The "t Nt" option applies to host=OMP (even if device=CUDA) and +host=MIC. For host=OMP, it specifies how many OpenMP threads per MPI +task to use with a node. For host=MIC, it specifies how many Xeon Phi +threads per MPI task to use within a node. The default is Nt = 1. +Note that for host=OMP this is effectively MPI-only mode which may be +fine. But for host=MIC this may run 240 MPI tasks on the coprocessor, +which could give very poor perforamnce. + +The "g Ng" option applies to device=CUDA. It specifies how many GPUs +per compute node to use. The default is 1, so this only needs to be +specified is you have 2 or more GPUs per compute node. + +This also issues a default "package cuda 2"_package.html command which +sets the number of GPUs/node to use to 2. + +The "-k on" switch also issues a default "package kk neigh full +comm/exchange host comm/forward host"_package.html command which sets +some KOKKOS options to default values, discussed on the +"package"_package.html command doc page. + +Use the "-sf kk" "command-line switch"_Section_start.html#start_7, +which will automatically append "kokkos" to styles that support it. +Use the "-pk kokkos" "command-line switch"_Section_start.html#start_7 +if you wish to override any of the default values set by the "package +kokkos"_package.html command invoked by the "-k on" switch. + +host=OMP, dual hex-core nodes (12 threads/node): mpirun -np 12 lmp_g++ -in in.lj # MPI-only mode with no Kokkos mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj # MPI-only mode with Kokkos mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj # one MPI task, 12 threads mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj # two MPI tasks, 6 threads/task :pre -Intel Phi with 61 cores (240 total usable cores, with 4x hardware threading): +host=MIC, Intel Phi with 61 cores (240 threads/phi via 4x hardware threading): mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj # 12*20 = 240 mpirun -np 15 lmp_g++ -k on t 16 -sf kk -in in.lj mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj :pre -Dual hex-core CPUs and a single GPU: +host=OMP, device=CUDA, node = dual hex-core CPUs and a single GPU: mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj # one MPI task, 6 threads on CPU :pre +host=OMP, device=CUDA, node = dual 8-core CPUs and 2 GPUs: + Dual 8-core CPUs and 2 GPUs: mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj # two MPI tasks, 8 threads per CPU :pre -(b) Enable the KOKKOS package -As illustrated above, the "-k on" or "-kokkos on" "command-line -switch"_Section_start.html#start_7 must be used when launching LAMMPS. 
+[Or run with the KOKKOS package by editing an input script:] -As documented "here"_Section_start.html#start_7, the command-line -swithc allows for several options. Commonly used ones, as illustrated -above, are: +The discussion above for the mpirun/mpiexec command and setting + +of one MPI task per GPU is the same. + +You must still use the "-c on" "command-line +switch"_Section_start.html#start_7 to enable the USER-CUDA package. +This also issues a default "package cuda 2"_pacakge.html command which +sets the number of GPUs/node to use to 2. + +Use the "suffix cuda"_suffix.html command, or you can explicitly add a +"cuda" suffix to individual styles in your input script, e.g. + +pair_style lj/cut/cuda 2.5 :pre + +You only need to use the "package cuda"_package.html command if you +wish to change the number of GPUs/node to use or its other options. --k on t Nt : specifies how many threads per MPI task to use within a -compute node. For good performance, the product of MPI tasks * -threads/task should not exceed the number of physical cores on a CPU -or Intel Phi (including hardware threading, e.g. 240). :ulb,l --k on g Ng : specifies how many GPUs per compute node are available. -The default is 1, so this should be specified is you have 2 or more -GPUs per compute node. :l,ule -(c) Use KOKKOS-accelerated styles -This can be done by explicitly adding a "kk" suffix to any supported -style in your input script: -pair_style lj/cut/kk 2.5 :pre -Or you can run with the "-sf kk" "command-line -switch"_Section_start.html#start_7, which will automatically append -"kk" to styles that support it. -lmp_machine -sf kk -in in.script -mpirun -np 4 lmp_machine -sf kk -in in.script :pre -Using the "suffix kk" command in your input script does the same -thing. [Speed-ups to expect:] @@ -1270,11 +1356,12 @@ change in the future. The USER-INTEL package was developed by Mike Brown at Intel Corporation. It provides a capability to accelerate simulations by offloading neighbor list and non-bonded force calculations to Intel(R) -Xeon Phi(TM) coprocessors. Additionally, it supports running -simulations in single, mixed, or double precision with vectorization, -even if a coprocessor is not present, i.e. on an Intel(R) CPU. The -same C++ code is used for both cases. When offloading to a -coprocessor, the routine is run twice, once with an offload flag. +Xeon Phi(TM) coprocessors (not native mode like the KOKKOS package). +Additionally, it supports running simulations in single, mixed, or +double precision with vectorization, even if a coprocessor is not +present, i.e. on an Intel(R) CPU. The same C++ code is used for both +cases. When offloading to a coprocessor, the routine is run twice, +once with an offload flag. The USER-INTEL package can be used in tandem with the USER-OMP package. This is useful when offloading pair style computations to @@ -1296,20 +1383,26 @@ package is available. 
Here is a quick overview of how to use the USER-INTEL package for CPU acceleration: -specify these CCFLAGS in your machine Makefile: -fopenmp, -DLAMMPS_MEMALIGN=64, and -restrict, -xHost -specify -fopenmp with LINKFLAGS in your machine Makefile -include the USER-INTEL package and (optionally) USER-OMP package and build LAMMP -if also using the USER-OMP package, specify how many threads per MPI task to run with via an environment variable or the package omp command +specify these CCFLAGS in your Makefile.machine: -fopenmp, -DLAMMPS_MEMALIGN=64, and -restrict, -xHost +specify -fopenmp with LINKFLAGS in your Makefile.machine +include the USER-INTEL package and (optionally) USER-OMP package and build LAMMPS +if using the USER-OMP package, specify how many threads per MPI task to use use USER-INTEL styles in your input script :ul -Running with the USER-INTEL package to offload to the Intel(R) Xeon Phi(TM) -is the same except for these additional steps: +Using the USER-INTEL package to offload work to the Intel(R) +Xeon Phi(TM) coprocessor is the same except for these additional +steps: -add the flag -DLMP_INTEL_OFFLOAD to CCFLAGS in your Machine makefile -add the flag -offload to the LINKFLAGS in your Machine makefile -the package intel command can be used to adjust threads per coprocessor :ul +add the flag -DLMP_INTEL_OFFLOAD to CCFLAGS in your Makefile.machine +add the flag -offload to LINKFLAGS in your Makefile.machine +specify how many threads per coprocessor to use :ul -Details follow. +The latter two steps in the first case and the last step in the +coprocessor case can be done using the "-pk omp" and "-sf intel" and +"-pk intel" "command-line switches"_Section_start.html#start_7 +respectively. Or any of the 3 steps can be done by adding the +"package intel"_package.html or "suffix cuda"_suffix.html or "package +intel"_package.html commands respectively to your input script. [Required hardware/software:] @@ -1325,7 +1418,7 @@ compiler must support the OpenMP interface. [Building LAMMPS with the USER-INTEL package:] -Include the package and build LAMMPS. +Include the package(s) and build LAMMPS: cd lammps/src make yes-user-intel @@ -1358,77 +1451,98 @@ If using an Intel compiler, it is recommended that Intel(R) Compiler issues that are being addressed. If using Intel(R) MPI, version 5 or higher is recommended. -[Running with the USER-INTEL package:] +[Running with the USER-INTEL package from the command line:] -The examples/intel directory has scripts that can be run with the -USER-INTEL package, as well as detailed instructions on how to run -them. +The mpirun or mpiexec command sets the total number of MPI tasks used +by LAMMPS (one or multiple per compute node) and the number of MPI +tasks used per node. E.g. the mpirun command does this via its -np +and -ppn switches. -Note that the total number of MPI tasks used by LAMMPS (one or -multiple per compute node) is set in the usual manner via the mpirun -or mpiexec commands, and is independent of the USER-INTEL package. - -To run with the USER-INTEL package, there are 3 basic issues (a,b,c) -to address: - -(a) Specify how many threads per MPI task to use on the CPU. - -Whether using the USER-INTEL package to offload computations to -Intel(R) Xeon Phi(TM) coprocessors or not, work performed on the CPU -can be multi-threaded via the USER-OMP package, assuming the USER-OMP -package was also installed when LAMMPS was built. 
- -In this case, the instructions above for the USER-OMP package, in its -"Running with the USER-OMP package" sub-section apply here as well. - -You can specify the number of threads per MPI task via the -OMP_NUM_THREADS environment variable or the "package omp"_package.html -command. The product of MPI tasks * threads/task should not exceed -the physical number of cores on the CPU (per node), otherwise +If LAMMPS was also built with the USER-OMP package, you need to choose +how many OpenMP threads per MPI task will be used by the USER-OMP +package. Note that the product of MPI tasks * OpenMP threads/task +should not exceed the physical number of cores (on a node), otherwise performance will suffer. -Note that the threads per MPI task setting is completely independent -of the number of threads used on the coprocessor. Only the "package -intel"_package.html command can be used to control thread counts on -the coprocessor. +If LAMMPS was built with coprocessor support for the USER-INTEL +package, you need to specify the number of coprocessor/node and the +number of threads to use on the coproessor per MPI task. Note that +coprocessor threads (which run on the coprocessor) are totally +independent from OpenMP threads (which run on the CPU). The product +of MPI tasks * coprocessor threads/task should not exceed the maximum +number of threads the coproprocessor is designed to run, otherwise +performance will suffer. This value is 240 for current generation +Xeon Phi(TM) chips, which is 60 physical cores * 4 threads/core. The +threads/core value can be set to a smaller value if desired by an +option on the "package intel"_package.html command, in which case the +maximum number of threads is also reduced. -(b) Enable the USER-INTEL package +Use the "-sf intel" "command-line switch"_Section_start.html#start_7, +which will automatically append "intel" to styles that support it. If +a style does not support it, a "omp" suffix is tried next. Use the +"-pk omp Nt" "command-line switch"_Section_start.html#start_7, to set +Nt = # of OpenMP threads per MPI task to use. Use the "-pk intel Nt +Nphi" "command-line switch"_Section_start.html#start_7 to set Nphi = # +of Xeon Phi(TM) coprocessors/node. -This can be done in one of two ways. Use a "package intel"_package.html -command near the top of your input script. +CPU-only without USER-OMP (but using Intel vectorization on CPU): +lmp_machine -sf intel -in in.script # 1 MPI task +mpirun -np 32 lmp_machine -sf intel -in in.script # 32 MPI tasks on as many nodes as needed (e.g. 2 16-core nodes) :pre -Or use the "-sf intel" "command-line -switch"_Section_start.html#start_7, which will automatically invoke -the command "package intel * mixed balance -1 offload_cards 1 -offload_tpc 4 offload_threads 240". Note that this specifies mixed -precision and use of a single Xeon Phi(TM) coprocessor (per node), so -you must specify the package command in your input script explicitly -if you want a different precision or to use multiple Phi coprocessor -per node. Also note that the balance and offload keywords are ignored -if you did not build LAMMPS with offload support for a coprocessor, as -descibed above. 
+CPU-only with USER-OMP (and Intel vectorization on CPU): +lmp_machine -sf intel -pk intel 16 0 -in in.script # 1 MPI task on a 16-core node +mpirun -np 4 lmp_machine -sf intel -pk intel 4 0 -in in.script # 4 MPI tasks each with 4 threads on a single 16-core node +mpirun -np 32 lmp_machine -sf intel -pk intel 4 0 -in in.script # ditto on 8 16-core nodes :pre -(c) Use USER-INTEL-accelerated styles +CPUs + Xeon Phi(TM) coprocessors with USER-OMP: +lmp_machine -sf intel -pk intel 16 1 -in in.script # 1 MPI task, 240 threads on 1 coprocessor +mpirun -np 4 lmp_machine -sf intel -pk intel 4 1 tptask 60 -in in.script # 4 MPI tasks each with 4 OpenMP threads on a single 16-core node, + # each MPI task uses 60 threads on 1 coprocessor +mpirun -np 32 -ppn 4 lmp_machine -sf intel -pk intel 4 2 tptask 120 -in in.script # ditto on 8 16-core nodes for MPI tasks and OpenMP threads, + # each MPI task uses 120 threads on one of 2 coprocessors :pre -This can be done by explicitly adding an "intel" suffix to any -supported style in your input script: +Note that if the "-sf intel" switch is used, it also issues two +default commands: "package omp 0"_package.html and "package intel +1"_package.html command. These set the number of OpenMP threads per +MPI task via the OMP_NUM_THREADS environment variable, and the number +of Xeon Phi(TM) coprocessors/node to 1. The latter is ignored is +LAMMPS was not built with coprocessor support. + +Using the "-pk omp" switch explicitly allows for direct setting of the +number of OpenMP threads per MPI task, and additional options. Using +the "-pk intel" switch explicitly allows for direct setting of the +number of coprocessors/node, and additional options. The syntax for +these two switches is the same as the "package omp"_package.html and +"package intel"_package.html commands. See the "package"_package.html +command doc page for details, including the default values used for +all its options if these switches are not specified, and how to set +the number of OpenMP threads via the OMP_NUM_THREADS environment +variable if desired. + +[Or run with the USER-OMP package by editing an input script:] + +The discussion above for the mpirun/mpiexec command, MPI tasks/node, +OpenMP threads per MPI task, and coprocessor threads per MPI task is +the same. + +Use the "suffix intel"_suffix.html command, or you can explicitly add an +"intel" suffix to individual styles in your input script, e.g. pair_style lj/cut/intel 2.5 :pre -Or you can run with the "-sf intel" "command-line -switch"_Section_start.html#start_7, which will automatically append -"intel" to styles that support it. +You must also use the "package omp"_package.html command to enable the +USER-OMP package (assuming LAMMPS was built with USER-OMP) unless the "-sf +intel" or "-pk omp" "command-line switches"_Section_start.html#start_7 +were used. It specifies how many OpenMP threads per MPI task to use, +as well as other options. Its doc page explains how to set the number +of threads via an environment variable if desired. -lmp_machine -sf intel -in in.script -mpirun -np 4 lmp_machine -sf intel -in in.script :pre - -Using the "suffix intel" command in your input script does the same -thing. - -IMPORTANT NOTE: Using an "intel" suffix in any of the above modes, -actually invokes two suffixes, "intel" and "omp". "Intel" is tried -first, and if the style does not support it, "omp" is tried next. If -neither is supported, the default non-suffix style is used. 
+You must also use the "package intel"_package.html command to enable +coprocessor support within the USER-INTEL package (assuming LAMMPS was +built with coprocessor support) unless the "-sf intel" or "-pk intel" +"command-line switches"_Section_start.html#start_7 were used. It +specifies how many coprocessors/node to use, as well as other +coprocessor options. [Speed-ups to expect:] @@ -1466,8 +1580,8 @@ threads to use per core can be accomplished with keyword settings of the "package intel"_package.html command. :ulb,l If desired, only a fraction of the pair style computation can be -offloaded to the coprocessors. This is accomplished by setting a -balance fraction in the "package intel"_package.html command. A +offloaded to the coprocessors. This is accomplished by using the +{balance} keyword in the "package intel"_package.html command. A balance of 0 runs all calculations on the CPU. A balance of 1 runs all calculations on the coprocessor. A balance of 0.5 runs half of the calculations on the coprocessor. Setting the balance to -1 (the @@ -1481,10 +1595,6 @@ performance to use fewer MPI tasks and OpenMP threads than available cores. This is due to the fact that additional threads are generated internally to handle the asynchronous offload tasks. :l -If you have multiple coprocessors on each compute node, the -{offload_cards} keyword can be specified with the "package -intel"_package.html command. :l - If running short benchmark runs with dynamic load balancing, adding a short warm-up run (10-20 steps) will allow the load-balancer to find a near-optimal setting that will carry over to additional runs. :l @@ -1503,13 +1613,13 @@ dihedral, improper calculations, computation and data transfer to the coprocessor will run concurrently with computations and MPI communications for these calculations on the host CPU. The USER-INTEL package has two modes for deciding which atoms will be handled by the -coprocessor. This choice is controlled with the "offload_ghost" -keyword of the "package intel"_package.html command. When set to 0, -ghost atoms (atoms at the borders between MPI tasks) are not offloaded -to the card. This allows for overlap of MPI communication of forces -with computation on the coprocessor when the "newton"_newton.html -setting is "on". The default is dependent on the style being used, -however, better performance may be achieved by setting this option +coprocessor. This choice is controlled with the {ghost} keyword of +the "package intel"_package.html command. When set to 0, ghost atoms +(atoms at the borders between MPI tasks) are not offloaded to the +card. This allows for overlap of MPI communication of forces with +computation on the coprocessor when the "newton"_newton.html setting +is "on". The default is dependent on the style being used, however, +better performance may be achieved by setting this option explictly. :l,ule [Restrictions:] diff --git a/doc/fix_qeq.html b/doc/fix_qeq.html index 1389a85185..8e90b9fdd2 100644 --- a/doc/fix_qeq.html +++ b/doc/fix_qeq.html @@ -42,9 +42,29 @@ fix 1 qeq qeq/dynamic 1 12 1.0e-3 100 my_qeq and Goddard) and formulated in (Nakano) (also known as the matrix inversion method) and in (Rick and Stuart) (also known as the extended Lagrangian method) based on the -electronegativity equilization principle. These fixes can be used -with any potential in LAMMPS, so long as it defines and uses charges -on each atom and that QEq parameters are provided. +electronegativity equilization principle. +

            +

            These fixes can be used with any pair style in +LAMMPS, so long as per-atom charges are defined. The most typical +use-case is in conjunction with a pair style that +performs charge equilibration periodically (e.g. every timestep), such +as the ReaxFF or Streitz-Mintmire potential (the latter is not yet +implemented in LAMMPS). But these fixes can also be used with +potentials that normally assume per-atom charges are fixed, e.g. a +Buckingham or LJ/Coulombic potential. +

            +

            Because the charge equilibration calculation is effectively +independent of the pair style, these fixes can also be used to perform +a one-time assignment of charges to atoms. For example, you could +define the QEq fix, perform a zero-timestep run via the run +command without any pair style defined which would set per-atom +charges (based on the current atom configuration), then remove the fix +via the unfix command before performing further dynamics. +
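+
+For example, a minimal sketch of this one-time assignment workflow (the
+fix arguments and file name are placeholders, following the syntax of
+the fix qeq examples above):
+
+fix 1 all qeq/dynamic 1 12 1.0e-3 100 my_qeq   # illustrative QEq settings
+run 0                                          # assigns charges, no dynamics
+unfix 1
+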

            +

            IMPORTANT NOTE: Computing and using charge values different from +published values defined for a fixed-charge potential like Buckingham +or CHARMM or AMBER, can have a strong effect on energies and forces, +and produces a different model than the published versions.

            IMPORTANT NOTE: The fix qeq/comb command must still be used to perform charge equliibration with the COMB diff --git a/doc/fix_qeq.txt b/doc/fix_qeq.txt index b771fba8d8..6d9fbc25f9 100644 --- a/doc/fix_qeq.txt +++ b/doc/fix_qeq.txt @@ -36,9 +36,29 @@ Perform the charge equilibration (QEq) method as described in "(Rappe and Goddard)"_#Rappe and formulated in "(Nakano)"_#Nakano (also known as the matrix inversion method) and in "(Rick and Stuart)"_#Rick (also known as the extended Lagrangian method) based on the -electronegativity equilization principle. These fixes can be used -with any potential in LAMMPS, so long as it defines and uses charges -on each atom and that QEq parameters are provided. +electronegativity equilization principle. + +These fixes can be used with any "pair style"_pair_style.html in +LAMMPS, so long as per-atom charges are defined. The most typical +use-case is in conjunction with a "pair style"_pair_style.html that +performs charge equilibration periodically (e.g. every timestep), such +as the ReaxFF or Streitz-Mintmire potential (the latter is not yet +implemented in LAMMPS). But these fixes can also be used with +potentials that normally assume per-atom charges are fixed, e.g. a +"Buckingham"_pair_buck.html or "LJ/Coulombic"_pair_lj.html potential. + +Because the charge equilibration calculation is effectively +independent of the pair style, these fixes can also be used to perform +a one-time assignment of charges to atoms. For example, you could +define the QEq fix, perform a zero-timestep run via the "run"_run.html +command without any pair style defined which would set per-atom +charges (based on the current atom configuration), then remove the fix +via the "unfix"_unfix.html command before performing further dynamics. + +IMPORTANT NOTE: Computing and using charge values different from +published values defined for a fixed-charge potential like Buckingham +or CHARMM or AMBER, can have a strong effect on energies and forces, +and produces a different model than the published versions. IMPORTANT NOTE: The "fix qeq/comb"_fix_qeq_comb.html command must still be used to perform charge equliibration with the "COMB diff --git a/doc/package.html b/doc/package.html index 6a1f0ec39a..7e1ba294ae 100644 --- a/doc/package.html +++ b/doc/package.html @@ -370,6 +370,26 @@ capable compilers is to use one thread for each available CPU core when OMP_NUM_THREADS is not set, which can lead to extremely bad performance.

            +

            By default LAMMPS uses 1 thread per MPI task. If the environment +variable OMP_NUM_THREADS is set to a valid value, this value is used. +You can set this environment variable when you launch LAMMPS, e.g. +

            +
            env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script
            +env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
            +mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script 
            +
            +

            or you can set it permanently in your shell's start-up script. +All three of these examples use a total of 4 CPU cores. +

            +

            Note that different MPI implementations have different ways of passing +the OMP_NUM_THREADS environment variable to all MPI processes. The +2nd line above is for MPICH; the 3rd line with -x is for OpenMPI. +Check your MPI documentation for additional details. +

            +

            You can also set the number of threads per MPI task via the package +omp command, which will override any OMP_NUM_THREADS +setting. +
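+
+For example (an illustrative sketch; the thread count of 4 is arbitrary),
+either of the following requests 4 OpenMP threads per MPI task,
+regardless of any OMP_NUM_THREADS setting:
+
+package omp 4                                               # in the input script
+mpirun -np 2 lmp_machine -sf omp -pk omp 4 -in in.script    # via the "-pk omp" switch
+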

            Which combination of threads and MPI tasks gives the best performance is difficult to predict and can depend on many components of your input. Not all features of LAMMPS support OpenMP and the parallel efficiency diff --git a/doc/package.txt b/doc/package.txt index 11080c28a4..bca9992403 100644 --- a/doc/package.txt +++ b/doc/package.txt @@ -364,6 +364,34 @@ capable compilers is to use one thread for each available CPU core when {OMP_NUM_THREADS} is not set, which can lead to extremely bad performance. + + + +By default LAMMPS uses 1 thread per MPI task. If the environment +variable OMP_NUM_THREADS is set to a valid value, this value is used. +You can set this environment variable when you launch LAMMPS, e.g. + +env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script +env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script +mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script :pre + +or you can set it permanently in your shell's start-up script. +All three of these examples use a total of 4 CPU cores. + + +Note that different MPI implementations have different ways of passing +the OMP_NUM_THREADS environment variable to all MPI processes. The +2nd line above is for MPICH; the 3rd line with -x is for OpenMPI. +Check your MPI documentation for additional details. + +You can also set the number of threads per MPI task via the "package +omp"_package.html command, which will override any OMP_NUM_THREADS +setting. + + + + + Which combination of threads and MPI tasks gives the best performance is difficult to predict and can depend on many components of your input. Not all features of LAMMPS support OpenMP and the parallel efficiency From 3a18e667d459906b2ebf08a3b641cfc9e0362fef Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 16:05:17 +0000 Subject: [PATCH 05/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12451 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_accelerate.html | 187 +++++++++++++++++----------------- doc/Section_accelerate.txt | 193 +++++++++++++++++------------------- doc/package.html | 8 +- doc/package.txt | 8 +- 4 files changed, 190 insertions(+), 206 deletions(-) diff --git a/doc/Section_accelerate.html b/doc/Section_accelerate.html index 8c14fe7560..287e242bc6 100644 --- a/doc/Section_accelerate.html +++ b/doc/Section_accelerate.html @@ -264,8 +264,9 @@ due to if tests and other conditional code.

          • use OPT pair styles in your input script

          The last step can be done using the "-sf opt" command-line -switch. Or it can be done by adding a -suffix opt command to your input script. +switch. Or the effect of the "-sf" switch +can be duplicated by adding a suffix opt command to your +input script.

          Required hardware/software:

          @@ -331,8 +332,9 @@ uses the OpenMP interface for multi-threading.

        The latter two steps can be done using the "-pk omp" and "-sf omp" command-line switches respectively. Or -either step can be done by adding the package omp or -suffix omp commands respectively to your input script. +the effect of the "-pk" or "-sf" switches can be duplicated by adding +the package omp or suffix omp commands +respectively to your input script.

        Required hardware/software:

        @@ -541,8 +543,9 @@ hardware.

      The latter two steps can be done using the "-pk gpu" and "-sf gpu" command-line switches respectively. Or -either step can be done by adding the package gpu or -suffix gpu commands respectively to your input script. +the effect of the "-pk" or "-sf" switches can be duplicated by adding +the package gpu or suffix gpu commands +respectively to your input script.

      Required hardware/software:

      @@ -767,8 +770,9 @@ single CPU (core), assigned to each GPU.

    The latter two steps can be done using the "-pk cuda" and "-sf cuda" command-line switches respectively. Or -either step can be done by adding the package cuda or -suffix cuda commands respectively to your input script. +the effect of the "-pk" or "-sf" switches can be duplicated by adding +the package cuda or suffix cuda commands +respectively to your input script.

    Required hardware/software:

    @@ -894,7 +898,8 @@ sets the number of GPUs/node to use to 2.
    pair_style lj/cut/cuda 2.5 
     

    You only need to use the package cuda command if you -wish to change the number of GPUs/node to use or its other options. +wish to change the number of GPUs/node to use or its other option +defaults.

    Speed-ups to expect:

    @@ -988,22 +993,22 @@ for GPU acceleration:

    The latter two steps can be done using the "-k on", "-pk kokkos" and "-sf kk" command-line switches -respectively. Or either the steps can be done by adding the package -kokkod or suffix kk commands respectively -to your input script. +respectively. Or the effect of the "-pk" or "-sf" switches can be +duplicated by adding the package kokkos or suffix +kk commands respectively to your input script.

    Required hardware/software:

    -

    The KOKKOS package can be used to build and run -LAMMPS on the following kinds of hardware configurations: +

    The KOKKOS package can be used to build and run LAMMPS on the +following kinds of hardware:

    • CPU-only: one MPI task per CPU core (MPI-only, but using KOKKOS styles)
    • CPU-only: one or a few MPI tasks per node with additional threading via OpenMP
    • Phi: on one or more Intel Phi coprocessors (per node)
    • GPU: on the GPUs of a node with additional OpenMP threading on the CPUs
    -

    Intel Xeon Phi coprocessors are supported in "native" mode only, not -"offload" mode. +

    Note that Intel Xeon Phi coprocessors are supported in "native" mode, +not "offload" mode like the USER-INTEL package supports.

    Only NVIDIA GPUs are currently supported.

    @@ -1094,31 +1099,32 @@ tasks used per node. E.g. the mpirun command does this via its -np and -ppn switches.

    When using KOKKOS built with host=OMP, you need to choose how many -OpenMP threads per MPI task will be used. Note that the product of -MPI tasks * OpenMP threads/task should not exceed the physical number -of cores (on a node), otherwise performance will suffer. +OpenMP threads per MPI task will be used (via the "-k" command-line +switch discussed below). Note that the product of MPI tasks * OpenMP +threads/task should not exceed the physical number of cores (on a +node), otherwise performance will suffer.

    When using the KOKKOS package built with device=CUDA, you must use exactly one MPI task per physical GPU.

    When using the KOKKOS package built with host=MIC for Intel Xeon Phi -coprocessor support you need to insure there is one or more MPI tasks -per coprocessor and choose the number of threads to use on a -coproessor per MPI task. The product of MPI tasks * coprocessor -threads/task should not exceed the maximum number of threads the -coproprocessor is designed to run, otherwise performance will suffer. -This value is 240 for current generation Xeon Phi(TM) chips, which is -60 physical cores * 4 threads/core. -

    -

NOTE: does not matter how many Phi per node, only concenred -with MPI tasks +coprocessor support you need to insure there are one or more MPI tasks +per coprocessor, and choose the number of coprocessor threads to use +per MPI task (via the "-k" command-line switch discussed below).  The +product of MPI tasks * coprocessor threads/task should not exceed the +maximum number of threads the coprocessor is designed to run, +otherwise performance will suffer.  This value is 240 for current +generation Xeon Phi(TM) chips, which is 60 physical cores * 4 +threads/core.  Note that with the KOKKOS package you do not need to +specify how many Phi coprocessors there are per node; each +coprocessor is simply treated as running some number of MPI tasks.

    You must use the "-k on" command-line switch to enable the KOKKOS package. It takes additional arguments for hardware settings appropriate to your -system. Those arguments are documented -here. The two commonly used ones are as -follows: +system. Those arguments are documented +here. The two most commonly used arguments +are:

    -k on t Nt
     -k on g Ng 
    @@ -1128,69 +1134,63 @@ host=MIC.  For host=OMP, it specifies how many OpenMP threads per MPI
     task to use with a node.  For host=MIC, it specifies how many Xeon Phi
     threads per MPI task to use within a node.  The default is Nt = 1.
     Note that for host=OMP this is effectively MPI-only mode which may be
    -fine.  But for host=MIC this may run 240 MPI tasks on the coprocessor,
    -which could give very poor perforamnce.
    +fine.  But for host=MIC you will typically end up using far less than
    +all the 240 available threads, which could give very poor performance.
     

    The "g Ng" option applies to device=CUDA. It specifies how many GPUs per compute node to use. The default is 1, so this only needs to be specified is you have 2 or more GPUs per compute node.

    -

    This also issues a default package cuda 2 command which -sets the number of GPUs/node to use to 2. -

    -

    The "-k on" switch also issues a default package kk neigh full -comm/exchange host comm/forward host command which sets -some KOKKOS options to default values, discussed on the -package command doc page. +

    The "-k on" switch also issues a default package kokkos neigh full +comm host command which sets various KOKKOS options to +default values, as discussed on the package command doc +page.

    Use the "-sf kk" command-line switch, -which will automatically append "kokkos" to styles that support it. -Use the "-pk kokkos" command-line switch -if you wish to override any of the default values set by the package +which will automatically append "kk" to styles that support it. Use +the "-pk kokkos" command-line switch if +you wish to override any of the default values set by the package kokkos command invoked by the "-k on" switch.

    -

    host=OMP, dual hex-core nodes (12 threads/node): -

    -
    mpirun -np 12 lmp_g++ -in in.lj      # MPI-only mode with no Kokkos
    -mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj      # MPI-only mode with Kokkos
    -mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj     # one MPI task, 12 threads
    -mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj      # two MPI tasks, 6 threads/task 
    +
    host=OMP, dual hex-core nodes (12 threads/node):
    +mpirun -np 12 lmp_g++ -in in.lj                           # MPI-only mode with no Kokkos
    +mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj              # MPI-only mode with Kokkos
    +mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj          # one MPI task, 12 threads
    +mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj           # two MPI tasks, 6 threads/task 
    +mpirun -np 32 -ppn 2 lmp_g++ -k on t 6 -sf kk -in in.lj   # ditto on 16 nodes 
     

    host=MIC, Intel Phi with 61 cores (240 threads/phi via 4x hardware threading): +mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj # 1 MPI task on 1 Phi, 1*240 = 240 +mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj # 30 MPI tasks on 1 Phi, 30*8 = 240 +mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj # 12 MPI tasks on 1 Phi, 12*20 = 240 +mpirun -np 96 -ppn 12 lmp_g++ -k on t 20 -sf kk -in in.lj # ditto on 8 Phis

    -
    mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj      # 12*20 = 240
    -mpirun -np 15 lmp_g++ -k on t 16 -sf kk -in in.lj
    -mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj
    -mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj 
    +
    host=OMP, device=CUDA, node = dual hex-core CPUs and a single GPU:
    +mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj          # one MPI task, 6 threads on CPU
    +mpirun -np 4 -ppn 1 lmp_cuda -k on t 6 -sf kk -in in.lj   # ditto on 4 nodes 
     
    -

    host=OMP, device=CUDA, node = dual hex-core CPUs and a single GPU: -

    -
    mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj       # one MPI task, 6 threads on CPU 
    -
    -

    host=OMP, device=CUDA, node = dual 8-core CPUs and 2 GPUs: -

    -

    Dual 8-core CPUs and 2 GPUs: -

    -
    mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj   # two MPI tasks, 8 threads per CPU 
    +
    host=OMP, device=CUDA, node = dual 8-core CPUs and 2 GPUs:
    +mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj           # two MPI tasks, 8 threads per CPU
    +mpirun -np 32 -ppn 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj   # ditto on 16 nodes 
     

    Or run with the KOKKOS package by editing an input script:

The discussion above for the mpirun/mpiexec command and setting +appropriate thread and GPU values for host=OMP or host=MIC or +device=CUDA is the same.

    -

    of one MPI task per GPU is the same. +

    You must still use the "-k on" command-line +switch to enable the KOKKOS package, and +specify its additional arguments for hardware options appopriate to +your system, as documented above.

    -

    You must still use the "-c on" command-line -switch to enable the USER-CUDA package. -This also issues a default package cuda 2 command which -sets the number of GPUs/node to use to 2. +

    Use the suffix kk command, or you can explicitly add a +"kk" suffix to individual styles in your input script, e.g.

    -

    Use the suffix cuda command, or you can explicitly add a -"cuda" suffix to individual styles in your input script, e.g. -

    -
    pair_style lj/cut/cuda 2.5 
    +
    pair_style lj/cut/kk 2.5 
     
    -

    You only need to use the package cuda command if you -wish to change the number of GPUs/node to use or its other options. +

    You only need to use the package kokkos command if you +wish to change any of its option defaults.

    Speed-ups to expect:

    @@ -1210,8 +1210,8 @@ than 20%). performance of a KOKKOS style is a bit slower than the USER-OMP package. -
  • When running on GPUs, KOKKOS currently out-performs the -USER-CUDA and GPU packages. +
  • When running on GPUs, KOKKOS is typically faster than the USER-CUDA +and GPU packages.
  • When running on Intel Xeon Phi, KOKKOS is not as fast as the USER-INTEL package, which is optimized for that hardware. @@ -1222,8 +1222,8 @@ hardware.

    Guidelines for best performance:

    -

    Here are guidline for using the KOKKOS package on the different hardware -configurations listed above. +

Here are guidelines for using the KOKKOS package on the different +hardware configurations listed above.

    Many of the guidelines use the package kokkos command See its doc page for details and default settings. Experimenting with @@ -1234,7 +1234,7 @@ its options can provide a speed-up for specific calculations.

    If N is the number of physical cores/node, then the number of MPI tasks/node * number of threads/task should not exceed N, and should typically equal N. Note that the default threads/task is 1, as set by -the "t" keyword of the -k command-line +the "t" keyword of the "-k" command-line switch. If you do not change this, no additional parallelism (beyond MPI) will be invoked on the host CPU(s). @@ -1245,15 +1245,14 @@ CPU(s).

  • run with N MPI tasks/node and 1 thread/task
  • run with settings in between these extremes -

    Examples of mpirun commands in these modes, for nodes with dual -hex-core CPUs and no GPU, are shown above. +

    Examples of mpirun commands in these modes are shown above.

    When using KOKKOS to perform multi-threading, it is important for performance to bind both MPI tasks to physical cores, and threads to physical cores, so they do not migrate during a simulation.

    If you are not certain MPI tasks are being bound (check the defaults -for your MPI installation), it can be forced with these flags: +for your MPI installation), binding can be forced with these flags:

    OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ...
     Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... 
    @@ -1276,7 +1275,7 @@ details).
     

    The -np setting of the mpirun command should set the number of MPI tasks/node to be equal to the # of physical GPUs on the node.

    -

    Use the -kokkos command-line switch to +

    Use the "-k" command-line switch to specify the number of GPUs per node, and the number of threads per MPI task. As above for multi-core CPUs (and no GPU), if N is the number of physical cores/node, then the number of MPI tasks/node * number of @@ -1286,14 +1285,13 @@ threads/task to a smaller value. This is because using all the cores on a dual-socket node will incur extra cost to copy memory from the 2nd socket to the GPU.

    -

    Examples of mpirun commands that follow these rules, for nodes with -dual hex-core CPUs and one or two GPUs, are shown above. +

    Examples of mpirun commands that follow these rules are shown above.

    -

    When using a GPU, you will achieve the best performance if your input -script does not use any fix or compute styles which are not yet -Kokkos-enabled. This allows data to stay on the GPU for multiple -timesteps, without being copied back to the host CPU. Invoking a -non-Kokkos fix or compute, or performing I/O for +

    IMPORTANT NOTE: When using a GPU, you will achieve the best +performance if your input script does not use any fix or compute +styles which are not yet Kokkos-enabled. This allows data to stay on +the GPU for multiple timesteps, without being copied back to the host +CPU. Invoking a non-Kokkos fix or compute, or performing I/O for thermo or dump output will cause data to be copied back to the CPU.

    @@ -1329,8 +1327,7 @@ threads/task as Nt. The product of these 2 values should be N, i.e. 4 so that logical threads from more than one MPI task do not run on the same physical core.

    -

    Examples of mpirun commands that follow these rules, for Intel Phi -nodes with 61 cores, are shown above. +

    Examples of mpirun commands that follow these rules are shown above.

    Restrictions:

    @@ -1395,8 +1392,8 @@ steps:

    The latter two steps in the first case and the last step in the coprocessor case can be done using the "-pk omp" and "-sf intel" and "-pk intel" command-line switches -respectively. Or any of the 3 steps can be done by adding the -package intel or suffix cuda or package +respectively. Or the effect of the "-pk" or "-sf" switches can be +duplicated by adding the package intel or suffix intel commands respectively to your input script.

    Required hardware/software: @@ -1514,7 +1511,7 @@ all its options if these switches are not specified, and how to set the number of OpenMP threads via the OMP_NUM_THREADS environment variable if desired.

    -

    Or run with the USER-OMP package by editing an input script: +

    Or run with the USER-INTEL package by editing an input script:

    The discussion above for the mpirun/mpiexec command, MPI tasks/node, OpenMP threads per MPI task, and coprocessor threads per MPI task is diff --git a/doc/Section_accelerate.txt b/doc/Section_accelerate.txt index bf956b88e2..b7e67559cf 100644 --- a/doc/Section_accelerate.txt +++ b/doc/Section_accelerate.txt @@ -258,8 +258,9 @@ include the OPT package and build LAMMPS use OPT pair styles in your input script :ul The last step can be done using the "-sf opt" "command-line -switch"_Section_start.html#start_7. Or it can be done by adding a -"suffix opt"_suffix.html command to your input script. +switch"_Section_start.html#start_7. Or the effect of the "-sf" switch +can be duplicated by adding a "suffix opt"_suffix.html command to your +input script. [Required hardware/software:] @@ -325,8 +326,9 @@ use USER-OMP styles in your input script :ul The latter two steps can be done using the "-pk omp" and "-sf omp" "command-line switches"_Section_start.html#start_7 respectively. Or -either step can be done by adding the "package omp"_package.html or -"suffix omp"_suffix.html commands respectively to your input script. +the effect of the "-pk" or "-sf" switches can be duplicated by adding +the "package omp"_package.html or "suffix omp"_suffix.html commands +respectively to your input script. [Required hardware/software:] @@ -535,8 +537,9 @@ use GPU styles in your input script :ul The latter two steps can be done using the "-pk gpu" and "-sf gpu" "command-line switches"_Section_start.html#start_7 respectively. Or -either step can be done by adding the "package gpu"_package.html or -"suffix gpu"_suffix.html commands respectively to your input script. +the effect of the "-pk" or "-sf" switches can be duplicated by adding +the "package gpu"_package.html or "suffix gpu"_suffix.html commands +respectively to your input script. [Required hardware/software:] @@ -761,8 +764,9 @@ use USER-CUDA styles in your input script :ul The latter two steps can be done using the "-pk cuda" and "-sf cuda" "command-line switches"_Section_start.html#start_7 respectively. Or -either step can be done by adding the "package cuda"_package.html or -"suffix cuda"_suffix.html commands respectively to your input script. +the effect of the "-pk" or "-sf" switches can be duplicated by adding +the "package cuda"_package.html or "suffix cuda"_suffix.html commands +respectively to your input script. [Required hardware/software:] @@ -888,7 +892,8 @@ Use the "suffix cuda"_suffix.html command, or you can explicitly add a pair_style lj/cut/cuda 2.5 :pre You only need to use the "package cuda"_package.html command if you -wish to change the number of GPUs/node to use or its other options. +wish to change the number of GPUs/node to use or its other option +defaults. [Speed-ups to expect:] @@ -982,22 +987,22 @@ use KOKKOS styles in your input script :ul The latter two steps can be done using the "-k on", "-pk kokkos" and "-sf kk" "command-line switches"_Section_start.html#start_7 -respectively. Or either the steps can be done by adding the "package -kokkod"_package.html or "suffix kk"_suffix.html commands respectively -to your input script. +respectively. Or the effect of the "-pk" or "-sf" switches can be +duplicated by adding the "package kokkos"_package.html or "suffix +kk"_suffix.html commands respectively to your input script. 
[Required hardware/software:] -The KOKKOS package can be used to build and run -LAMMPS on the following kinds of hardware configurations: +The KOKKOS package can be used to build and run LAMMPS on the +following kinds of hardware: CPU-only: one MPI task per CPU core (MPI-only, but using KOKKOS styles) CPU-only: one or a few MPI tasks per node with additional threading via OpenMP Phi: on one or more Intel Phi coprocessors (per node) GPU: on the GPUs of a node with additional OpenMP threading on the CPUs :ul -Intel Xeon Phi coprocessors are supported in "native" mode only, not -"offload" mode. +Note that Intel Xeon Phi coprocessors are supported in "native" mode, +not "offload" mode like the USER-INTEL package supports. Only NVIDIA GPUs are currently supported. @@ -1088,33 +1093,32 @@ tasks used per node. E.g. the mpirun command does this via its -np and -ppn switches. When using KOKKOS built with host=OMP, you need to choose how many -OpenMP threads per MPI task will be used. Note that the product of -MPI tasks * OpenMP threads/task should not exceed the physical number -of cores (on a node), otherwise performance will suffer. +OpenMP threads per MPI task will be used (via the "-k" command-line +switch discussed below). Note that the product of MPI tasks * OpenMP +threads/task should not exceed the physical number of cores (on a +node), otherwise performance will suffer. When using the KOKKOS package built with device=CUDA, you must use exactly one MPI task per physical GPU. When using the KOKKOS package built with host=MIC for Intel Xeon Phi -coprocessor support you need to insure there is one or more MPI tasks -per coprocessor and choose the number of threads to use on a -coproessor per MPI task. The product of MPI tasks * coprocessor -threads/task should not exceed the maximum number of threads the -coproprocessor is designed to run, otherwise performance will suffer. -This value is 240 for current generation Xeon Phi(TM) chips, which is -60 physical cores * 4 threads/core. - -NOTE: does not matter how many Phi per node, only concenred -with MPI tasks - - +coprocessor support you need to insure there are one or more MPI tasks +per coprocessor, and choose the number of coprocessor threads to use +per MPI task (via the "-k" command-line switch discussed below). The +product of MPI tasks * coprocessor threads/task should not exceed the +maximum number of threads the coproprocessor is designed to run, +otherwise performance will suffer. This value is 240 for current +generation Xeon Phi(TM) chips, which is 60 physical cores * 4 +threads/core. Note that with the KOKKOS package you do not need to +specify how many Phi coprocessors there are per node; each +coprocessors is simply treated as running some number of MPI tasks. You must use the "-k on" "command-line switch"_Section_start.html#start_7 to enable the KOKKOS package. It takes additional arguments for hardware settings appropriate to your -system. Those arguments are documented -"here"_Section_start.html#start_7. The two commonly used ones are as -follows: +system. Those arguments are "documented +here"_Section_start.html#start_7. The two most commonly used arguments +are: -k on t Nt -k on g Ng :pre @@ -1124,78 +1128,64 @@ host=MIC. For host=OMP, it specifies how many OpenMP threads per MPI task to use with a node. For host=MIC, it specifies how many Xeon Phi threads per MPI task to use within a node. The default is Nt = 1. Note that for host=OMP this is effectively MPI-only mode which may be -fine. 
But for host=MIC this may run 240 MPI tasks on the coprocessor, -which could give very poor perforamnce. +fine. But for host=MIC you will typically end up using far less than +all the 240 available threads, which could give very poor performance. The "g Ng" option applies to device=CUDA. It specifies how many GPUs per compute node to use. The default is 1, so this only needs to be specified is you have 2 or more GPUs per compute node. -This also issues a default "package cuda 2"_package.html command which -sets the number of GPUs/node to use to 2. - -The "-k on" switch also issues a default "package kk neigh full -comm/exchange host comm/forward host"_package.html command which sets -some KOKKOS options to default values, discussed on the -"package"_package.html command doc page. +The "-k on" switch also issues a default "package kokkos neigh full +comm host"_package.html command which sets various KOKKOS options to +default values, as discussed on the "package"_package.html command doc +page. Use the "-sf kk" "command-line switch"_Section_start.html#start_7, -which will automatically append "kokkos" to styles that support it. -Use the "-pk kokkos" "command-line switch"_Section_start.html#start_7 -if you wish to override any of the default values set by the "package +which will automatically append "kk" to styles that support it. Use +the "-pk kokkos" "command-line switch"_Section_start.html#start_7 if +you wish to override any of the default values set by the "package kokkos"_package.html command invoked by the "-k on" switch. host=OMP, dual hex-core nodes (12 threads/node): - -mpirun -np 12 lmp_g++ -in in.lj # MPI-only mode with no Kokkos -mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj # MPI-only mode with Kokkos -mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj # one MPI task, 12 threads -mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj # two MPI tasks, 6 threads/task :pre +mpirun -np 12 lmp_g++ -in in.lj # MPI-only mode with no Kokkos +mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj # MPI-only mode with Kokkos +mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj # one MPI task, 12 threads +mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj # two MPI tasks, 6 threads/task +mpirun -np 32 -ppn 2 lmp_g++ -k on t 6 -sf kk -in in.lj # ditto on 16 nodes :pre host=MIC, Intel Phi with 61 cores (240 threads/phi via 4x hardware threading): +mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj # 1 MPI task on 1 Phi, 1*240 = 240 +mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj # 30 MPI tasks on 1 Phi, 30*8 = 240 +mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj # 12 MPI tasks on 1 Phi, 12*20 = 240 +mpirun -np 96 -ppn 12 lmp_g++ -k on t 20 -sf kk -in in.lj # ditto on 8 Phis -mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj # 12*20 = 240 -mpirun -np 15 lmp_g++ -k on t 16 -sf kk -in in.lj -mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj -mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj :pre host=OMP, device=CUDA, node = dual hex-core CPUs and a single GPU: - -mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj # one MPI task, 6 threads on CPU :pre +mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj # one MPI task, 6 threads on CPU +mpirun -np 4 -ppn 1 lmp_cuda -k on t 6 -sf kk -in in.lj # ditto on 4 nodes :pre host=OMP, device=CUDA, node = dual 8-core CPUs and 2 GPUs: - -Dual 8-core CPUs and 2 GPUs: - -mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj # two MPI tasks, 8 threads per CPU :pre - +mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj # two MPI tasks, 8 threads per CPU +mpirun -np 32 -ppn 2 lmp_cuda -k 
on t 8 g 2 -sf kk -in in.lj # ditto on 16 nodes :pre [Or run with the KOKKOS package by editing an input script:] The discussion above for the mpirun/mpiexec command and setting +appropriate thread and GPU values for host=OMP or host=MIC or +device=CUDA are the same. -of one MPI task per GPU is the same. - -You must still use the "-c on" "command-line -switch"_Section_start.html#start_7 to enable the USER-CUDA package. -This also issues a default "package cuda 2"_pacakge.html command which -sets the number of GPUs/node to use to 2. - -Use the "suffix cuda"_suffix.html command, or you can explicitly add a -"cuda" suffix to individual styles in your input script, e.g. - -pair_style lj/cut/cuda 2.5 :pre - -You only need to use the "package cuda"_package.html command if you -wish to change the number of GPUs/node to use or its other options. - - - - - +You must still use the "-k on" "command-line +switch"_Section_start.html#start_7 to enable the KOKKOS package, and +specify its additional arguments for hardware options appopriate to +your system, as documented above. +Use the "suffix kk"_suffix.html command, or you can explicitly add a +"kk" suffix to individual styles in your input script, e.g. +pair_style lj/cut/kk 2.5 :pre +You only need to use the "package kokkos"_package.html command if you +wish to change any of its option defaults. [Speed-ups to expect:] @@ -1215,8 +1205,8 @@ When running on CPUs only, with multiple threads per MPI task, performance of a KOKKOS style is a bit slower than the USER-OMP package. :l -When running on GPUs, KOKKOS currently out-performs the -USER-CUDA and GPU packages. :l +When running on GPUs, KOKKOS is typically faster than the USER-CUDA +and GPU packages. :l When running on Intel Xeon Phi, KOKKOS is not as fast as the USER-INTEL package, which is optimized for that hardware. :l,ule @@ -1227,8 +1217,8 @@ hardware. [Guidelines for best performance:] -Here are guidline for using the KOKKOS package on the different hardware -configurations listed above. +Here are guidline for using the KOKKOS package on the different +hardware configurations listed above. Many of the guidelines use the "package kokkos"_package.html command See its doc page for details and default settings. Experimenting with @@ -1239,7 +1229,7 @@ its options can provide a speed-up for specific calculations. If N is the number of physical cores/node, then the number of MPI tasks/node * number of threads/task should not exceed N, and should typically equal N. Note that the default threads/task is 1, as set by -the "t" keyword of the -k "command-line +the "t" keyword of the "-k" "command-line switch"_Section_start.html#start_7. If you do not change this, no additional parallelism (beyond MPI) will be invoked on the host CPU(s). @@ -1250,15 +1240,14 @@ run with 1 MPI task/node and N threads/task run with N MPI tasks/node and 1 thread/task run with settings in between these extremes :ul -Examples of mpirun commands in these modes, for nodes with dual -hex-core CPUs and no GPU, are shown above. +Examples of mpirun commands in these modes are shown above. When using KOKKOS to perform multi-threading, it is important for performance to bind both MPI tasks to physical cores, and threads to physical cores, so they do not migrate during a simulation. 
If you are not certain MPI tasks are being bound (check the defaults -for your MPI installation), it can be forced with these flags: +for your MPI installation), binding can be forced with these flags: OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ... Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... :pre @@ -1281,7 +1270,7 @@ details). The -np setting of the mpirun command should set the number of MPI tasks/node to be equal to the # of physical GPUs on the node. -Use the "-kokkos command-line switch"_Section_commands.html#start_7 to +Use the "-k" "command-line switch"_Section_commands.html#start_7 to specify the number of GPUs per node, and the number of threads per MPI task. As above for multi-core CPUs (and no GPU), if N is the number of physical cores/node, then the number of MPI tasks/node * number of @@ -1291,14 +1280,13 @@ threads/task to a smaller value. This is because using all the cores on a dual-socket node will incur extra cost to copy memory from the 2nd socket to the GPU. -Examples of mpirun commands that follow these rules, for nodes with -dual hex-core CPUs and one or two GPUs, are shown above. +Examples of mpirun commands that follow these rules are shown above. -When using a GPU, you will achieve the best performance if your input -script does not use any fix or compute styles which are not yet -Kokkos-enabled. This allows data to stay on the GPU for multiple -timesteps, without being copied back to the host CPU. Invoking a -non-Kokkos fix or compute, or performing I/O for +IMPORTANT NOTE: When using a GPU, you will achieve the best +performance if your input script does not use any fix or compute +styles which are not yet Kokkos-enabled. This allows data to stay on +the GPU for multiple timesteps, without being copied back to the host +CPU. Invoking a non-Kokkos fix or compute, or performing I/O for "thermo"_thermo_style.html or "dump"_dump.html output will cause data to be copied back to the CPU. @@ -1334,8 +1322,7 @@ threads/task as Nt. The product of these 2 values should be N, i.e. 4 so that logical threads from more than one MPI task do not run on the same physical core. -Examples of mpirun commands that follow these rules, for Intel Phi -nodes with 61 cores, are shown above. +Examples of mpirun commands that follow these rules are shown above. [Restrictions:] @@ -1400,9 +1387,9 @@ specify how many threads per coprocessor to use :ul The latter two steps in the first case and the last step in the coprocessor case can be done using the "-pk omp" and "-sf intel" and "-pk intel" "command-line switches"_Section_start.html#start_7 -respectively. Or any of the 3 steps can be done by adding the -"package intel"_package.html or "suffix cuda"_suffix.html or "package -intel"_package.html commands respectively to your input script. +respectively. Or the effect of the "-pk" or "-sf" switches can be +duplicated by adding the "package intel"_package.html or "suffix +intel"_suffix.html commands respectively to your input script. [Required hardware/software:] @@ -1519,7 +1506,7 @@ all its options if these switches are not specified, and how to set the number of OpenMP threads via the OMP_NUM_THREADS environment variable if desired. 
-[Or run with the USER-OMP package by editing an input script:] +[Or run with the USER-INTEL package by editing an input script:] The discussion above for the mpirun/mpiexec command, MPI tasks/node, OpenMP threads per MPI task, and coprocessor threads per MPI task is diff --git a/doc/package.html b/doc/package.html index 7e1ba294ae..3a9893080e 100644 --- a/doc/package.html +++ b/doc/package.html @@ -449,10 +449,10 @@ The offload_ghost default setting is determined by the intel style being used. The value used is output to the screen in the offload report at the end of each run.

    -

    The default settings for the KOKKOS package are "package kk neigh full -comm/exchange host comm/forward host". This is the case whether the -"-sf kk" command-line switch is used or -not. +

    The default settings for the KOKKOS package are "package kokkos neigh +full comm/exchange host comm/forward host". This is the case whether +the "-sf kk" command-line switch is used +or not.

    If the "-sf omp" command-line switch is used then it is as if the command "package omp *" were invoked, to diff --git a/doc/package.txt b/doc/package.txt index bca9992403..94078fdb82 100644 --- a/doc/package.txt +++ b/doc/package.txt @@ -451,10 +451,10 @@ The {offload_ghost} default setting is determined by the intel style being used. The value used is output to the screen in the offload report at the end of each run. -The default settings for the KOKKOS package are "package kk neigh full -comm/exchange host comm/forward host". This is the case whether the -"-sf kk" "command-line switch"_Section_start.html#start_7 is used or -not. +The default settings for the KOKKOS package are "package kokkos neigh +full comm/exchange host comm/forward host". This is the case whether +the "-sf kk" "command-line switch"_Section_start.html#start_7 is used +or not. If the "-sf omp" "command-line switch"_Section_start.html#start_7 is used then it is as if the command "package omp *" were invoked, to From 334f57f7f3ea760c9a0a9252b8dd4c3d0cb20f21 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 16:37:27 +0000 Subject: [PATCH 06/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12452 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/VORONOI/compute_voronoi_atom.cpp | 16 ++++++++-------- src/VORONOI/compute_voronoi_atom.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/VORONOI/compute_voronoi_atom.cpp b/src/VORONOI/compute_voronoi_atom.cpp index 08f2379633..8d69a62af4 100644 --- a/src/VORONOI/compute_voronoi_atom.cpp +++ b/src/VORONOI/compute_voronoi_atom.cpp @@ -55,8 +55,7 @@ ComputeVoronoi::ComputeVoronoi(LAMMPS *lmp, int narg, char **arg) : con_mono = NULL; con_poly = NULL; - tags = NULL; - occvec = sendocc = lroot = lnext = NULL; + tags = occvec = sendocc = lroot = lnext = NULL; int iarg = 3; while ( iargput(i,x[i][0],x[i][1],x[i][2],rfield[i]); + } } else { // monodisperse voro++ container delete con_mono; @@ -450,7 +450,7 @@ void ComputeVoronoi::processCell(voronoicell_neighbor &c, int i) if (!have_narea) c.face_areas(narea); voro[i][2] = 0.0; - // each entry in neigh should correspond to amn entry in narea + // each entry in neigh should correspond to an entry in narea if (neighs != narea.size()) error->all(FLERR,"voro++ error: 'narea' and 'neigh' have a different size."); @@ -519,19 +519,19 @@ void ComputeVoronoi::compute_vector() /* ---------------------------------------------------------------------- */ -int ComputeVoronoi::pack_comm(int n, int *list, double *buf, +int ComputeVoronoi::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc) { int i,m=0; - for (i = 0; i < n; i++) buf[m++] = rfield[list[i]]; + for (i = 0; i < n; ++i) buf[m++] = rfield[list[i]]; return 1; } /* ---------------------------------------------------------------------- */ -void ComputeVoronoi::unpack_comm(int n, int first, double *buf) +void ComputeVoronoi::unpack_forward_comm(int n, int first, double *buf) { int i,last,m=0; last = first + n; - for (i = first; i < last; i++) rfield[i] = buf[m++]; + for (i = first; i < last; ++i) rfield[i] = buf[m++]; } diff --git a/src/VORONOI/compute_voronoi_atom.h b/src/VORONOI/compute_voronoi_atom.h index f762464f6b..f6026ced0d 100644 --- a/src/VORONOI/compute_voronoi_atom.h +++ b/src/VORONOI/compute_voronoi_atom.h @@ -34,8 +34,8 @@ class ComputeVoronoi : public Compute { void compute_vector(); double memory_usage(); - int pack_comm(int, int *, double *, int, int *); - void unpack_comm(int, int, double *); + int 
pack_forward_comm(int, int *, double *, int, int *); + void unpack_forward_comm(int, int, double *); private: voro::container *con_mono; From 787b9fc6f85487f1308f9ba0e6cf13f5846fb367 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 17:07:45 +0000 Subject: [PATCH 07/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12453 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/package.html | 88 ++++++++++++++++++++++---------------------- doc/package.txt | 96 ++++++++++++++++++++++-------------------------- 2 files changed, 86 insertions(+), 98 deletions(-) diff --git a/doc/package.html b/doc/package.html index 3a9893080e..b4cec046ce 100644 --- a/doc/package.html +++ b/doc/package.html @@ -65,9 +65,13 @@ neigh value = full or half/thread or half or n2 or full/cluster comm/exchange value = no or host or device comm/forward value = no or host or device - omp args = Nthreads mode + omp args = Nthreads keyword value ... Nthreads = # of OpenMP threads to associate with each MPI process - mode = force or force/neigh (optional) + zero or more keyword/value pairs may be appended + keywords = neigh + neigh value = yes or no + yes = threaded neighbor list build (default) + no = non-threaded neighbor list build

    @@ -80,8 +84,8 @@ package gpu force/neigh 0 1 -1.0 package cuda gpu/node/special 2 0 2 package cuda test 3948 package kokkos neigh half/thread comm/forward device -package omp * force/neigh -package omp 4 force +package omp 0 neigh yes +package omp 4 package intel * mixed balance -1
  • Description: @@ -349,30 +353,25 @@ multiple threads to pack/unpack communicated data.

    The omp style invokes options associated with the use of the USER-OMP package.

    -

    The first argument allows to explicitly set the number of OpenMP -threads to be allocated for each MPI process. For example, if your -system has nodes with dual quad-core processors, it has a total of 8 -cores per node. You could run MPI on 2 cores on each node (e.g. using -options for the mpirun command), and set the Nthreads setting to 4. -This would effectively use all 8 cores on each node. Since each MPI -process would spawn 4 threads (one of which runs as part of the MPI -process itself). +

The first argument sets the number of OpenMP threads allocated for +each MPI process or task.  For example, if your system has nodes with +dual quad-core processors, it has a total of 8 cores per node.  You +could run two MPI tasks per node (e.g. using the -ppn option of the mpirun +command), and set Nthreads = 4.  This would effectively use all 8 +cores on each node.  Note that the product of MPI tasks * threads/task +should not exceed the physical number of cores (on a node), otherwise +performance will suffer.
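+
+As an illustrative sketch of that layout (node and task counts are
+arbitrary), 2 MPI tasks per node with 4 threads each on dual quad-core
+nodes could be launched as:
+
+mpirun -np 8 -ppn 2 lmp_machine -sf omp -pk omp 4 -in in.script   # 4 nodes, 2 tasks/node x 4 threads = 8 cores/node
+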

    -

    For performance reasons, you should not set Nthreads to more threads -than there are physical cores (per MPI task), but LAMMPS cannot check -for this. -

    -

    An Nthreads value of '*' instructs LAMMPS to use whatever is the +

    An Nthreads value of 0 instructs LAMMPS to use whatever value is the default for the given OpenMP environment. This is usually determined via the OMP_NUM_THREADS environment variable or the compiler -runtime. Please note that in most cases the default for OpenMP -capable compilers is to use one thread for each available CPU core -when OMP_NUM_THREADS is not set, which can lead to extremely bad +runtime. Note that in most cases the default for OpenMP capable +compilers is to use one thread for each available CPU core when +OMP_NUM_THREADS is not explicitly set, which can lead to poor performance.

    -

    By default LAMMPS uses 1 thread per MPI task. If the environment -variable OMP_NUM_THREADS is set to a valid value, this value is used. -You can set this environment variable when you launch LAMMPS, e.g. +

    Here are examples of how to set the environment variable when +launching LAMMPS:

    env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script
     env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
    @@ -383,26 +382,24 @@ All three of these examples use a total of 4 CPU cores.
     

    Note that different MPI implementations have different ways of passing the OMP_NUM_THREADS environment variable to all MPI processes. The -2nd line above is for MPICH; the 3rd line with -x is for OpenMPI. -Check your MPI documentation for additional details. +2nd example line above is for MPICH; the 3rd example line with -x is +for OpenMPI. Check your MPI documentation for additional details.

    -

    You can also set the number of threads per MPI task via the package -omp command, which will override any OMP_NUM_THREADS -setting. +

What combination of threads and MPI tasks gives the best performance +is difficult to predict and can depend on many components of your +input.  Not all features of LAMMPS support OpenMP threading via the +USER-OMP package, and the parallel efficiency can be very different, +too.

    -

    Which combination of threads and MPI tasks gives the best performance -is difficult to predict and can depend on many components of your input. -Not all features of LAMMPS support OpenMP and the parallel efficiency -can be very different, too. -

    -

    The mode setting specifies where neighbor list calculations will be -multi-threaded as well. If mode is force, neighbor list calculation -is performed in serial. If mode is force/neigh, a multi-threaded -neighbor list build is used. Using the force/neigh setting is almost -always faster and should produce idential neighbor lists at the -expense of using some more memory (neighbor list pages are always -allocated for all threads at the same time and each thread works on -its own pages). +

The neigh keyword specifies whether neighbor list building will be +multi-threaded in addition to force calculations.  If neigh is set +to no then neighbor list calculation is performed only by MPI tasks +with no OpenMP threading.  If neigh is yes (the default), a +multi-threaded neighbor list build is used.  Using neigh = yes is +almost always faster and should produce identical neighbor lists at the +expense of using more memory.  Specifically, neighbor list pages are +allocated for all threads at the same time and each thread works +within its own pages.
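+
+For example (an illustrative sketch; the thread count of 16 is
+arbitrary):
+
+package omp 16            # threaded forces and threaded neighbor list build (default)
+package omp 16 neigh no   # threaded forces, non-threaded neighbor list build
+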


    @@ -455,9 +452,10 @@ the "-sf kk" command-line switch is u or not.

    If the "-sf omp" command-line switch is -used then it is as if the command "package omp *" were invoked, to -specify default settings for the USER-OMP package. If the -command-line switch is not used, then no defaults are set, and you -must specify the appropriate package command in your input script. +used then it is as if the command "package omp 0" were invoked, to +specify settings for the USER-OMP package. The option defaults are +neigh = yes. If the command-line switch is not used, then no defaults +are set, and you must specify the appropriate "package omp" command in +your input script.

    diff --git a/doc/package.txt b/doc/package.txt index 94078fdb82..f810a46ebe 100644 --- a/doc/package.txt +++ b/doc/package.txt @@ -60,9 +60,13 @@ args = arguments specific to the style :l {neigh} value = {full} or {half/thread} or {half} or {n2} or {full/cluster} {comm/exchange} value = {no} or {host} or {device} {comm/forward} value = {no} or {host} or {device} - {omp} args = Nthreads mode + {omp} args = Nthreads keyword value ... Nthreads = # of OpenMP threads to associate with each MPI process - mode = force or force/neigh (optional) :pre + zero or more keyword/value pairs may be appended + keywords = {neigh} + {neigh} value = {yes} or {no} + {yes} = threaded neighbor list build (default) + {no} = non-threaded neighbor list build :pre :ule [Examples:] @@ -74,8 +78,8 @@ package gpu force/neigh 0 1 -1.0 package cuda gpu/node/special 2 0 2 package cuda test 3948 package kokkos neigh half/thread comm/forward device -package omp * force/neigh -package omp 4 force +package omp 0 neigh yes +package omp 4 package intel * mixed balance -1 :pre [Description:] @@ -343,33 +347,25 @@ multiple threads to pack/unpack communicated data. The {omp} style invokes options associated with the use of the USER-OMP package. -The first argument allows to explicitly set the number of OpenMP -threads to be allocated for each MPI process. For example, if your -system has nodes with dual quad-core processors, it has a total of 8 -cores per node. You could run MPI on 2 cores on each node (e.g. using -options for the mpirun command), and set the {Nthreads} setting to 4. -This would effectively use all 8 cores on each node. Since each MPI -process would spawn 4 threads (one of which runs as part of the MPI -process itself). +The first argument sets the number of OpenMP threads allocated for +each MPI process or task. For example, if your system has nodes with +dual quad-core processors, it has a total of 8 cores per node. You +could two MPI tasks per node (e.g. using the -ppn option of the mpirun +command), and set {Nthreads} = 4. This would effectively use all 8 +cores on each node. Note that the product of MPI tasks * threads/task +should not exceed the physical number of cores (on a node), otherwise +performance will suffer. -For performance reasons, you should not set {Nthreads} to more threads -than there are physical cores (per MPI task), but LAMMPS cannot check -for this. - -An {Nthreads} value of '*' instructs LAMMPS to use whatever is the +An {Nthreads} value of 0 instructs LAMMPS to use whatever value is the default for the given OpenMP environment. This is usually determined via the {OMP_NUM_THREADS} environment variable or the compiler -runtime. Please note that in most cases the default for OpenMP -capable compilers is to use one thread for each available CPU core -when {OMP_NUM_THREADS} is not set, which can lead to extremely bad +runtime. Note that in most cases the default for OpenMP capable +compilers is to use one thread for each available CPU core when +{OMP_NUM_THREADS} is not explicitly set, which can lead to poor performance. - - - -By default LAMMPS uses 1 thread per MPI task. If the environment -variable OMP_NUM_THREADS is set to a valid value, this value is used. -You can set this environment variable when you launch LAMMPS, e.g. 
+Here are examples of how to set the environment variable when +launching LAMMPS: env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script @@ -378,33 +374,26 @@ mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script :pre or you can set it permanently in your shell's start-up script. All three of these examples use a total of 4 CPU cores. - Note that different MPI implementations have different ways of passing the OMP_NUM_THREADS environment variable to all MPI processes. The -2nd line above is for MPICH; the 3rd line with -x is for OpenMPI. -Check your MPI documentation for additional details. +2nd example line above is for MPICH; the 3rd example line with -x is +for OpenMPI. Check your MPI documentation for additional details. -You can also set the number of threads per MPI task via the "package -omp"_package.html command, which will override any OMP_NUM_THREADS -setting. +What combination of threads and MPI tasks gives the best performance +is difficult to predict and can depend on many components of your +input. Not all features of LAMMPS support OpenMP threading via the +USER-OMP packaage and the parallel efficiency can be very different, +too. - - - - -Which combination of threads and MPI tasks gives the best performance -is difficult to predict and can depend on many components of your input. -Not all features of LAMMPS support OpenMP and the parallel efficiency -can be very different, too. - -The {mode} setting specifies where neighbor list calculations will be -multi-threaded as well. If {mode} is force, neighbor list calculation -is performed in serial. If {mode} is force/neigh, a multi-threaded -neighbor list build is used. Using the force/neigh setting is almost -always faster and should produce idential neighbor lists at the -expense of using some more memory (neighbor list pages are always -allocated for all threads at the same time and each thread works on -its own pages). +The {neigh} keyword specifies whether neighbor list building will be +multi-threaded in addition to force calculations. If {neigh} is set +to {no} then neighbor list calculation is performed only by MPI tasks +with no OpenMP threading. If {mode} is {yes} (the default), a +multi-threaded neighbor list build is used. Using {neigh} = {yes} is +almost always faster and should produce idential neighbor lists at the +expense of using more memory. Specifically, neighbor list pages are +allocated for all threads at the same time and each thread works +within its own pages. :line @@ -457,7 +446,8 @@ the "-sf kk" "command-line switch"_Section_start.html#start_7 is used or not. If the "-sf omp" "command-line switch"_Section_start.html#start_7 is -used then it is as if the command "package omp *" were invoked, to -specify default settings for the USER-OMP package. If the -command-line switch is not used, then no defaults are set, and you -must specify the appropriate package command in your input script. +used then it is as if the command "package omp 0" were invoked, to +specify settings for the USER-OMP package. The option defaults are +neigh = yes. If the command-line switch is not used, then no defaults +are set, and you must specify the appropriate "package omp" command in +your input script. 
From 4f3c884aaac33b5efddc2b77518782c34a86a354 Mon Sep 17 00:00:00 2001 From: athomps Date: Tue, 9 Sep 2014 17:12:27 +0000 Subject: [PATCH 08/17] Changed optimized default to 1 git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12454 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/SNAP/pair_snap.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp index 5bad7ac6bb..03e6f1711a 100644 --- a/src/SNAP/pair_snap.cpp +++ b/src/SNAP/pair_snap.cpp @@ -1233,7 +1233,7 @@ void PairSNAP::settings(int narg, char **arg) nthreads = -1; use_shared_arrays=-1; do_load_balance = 0; - use_optimized = 0; + use_optimized = 1; // optional arguments @@ -1241,7 +1241,7 @@ void PairSNAP::settings(int narg, char **arg) if (i+2>narg) error->all(FLERR,"Illegal pair_style command." " Too few arguments."); if (strcmp(arg[i],"nthreads")==0) { - nthreads=atoi(arg[++i]); + nthreads=force->inumeric(FLERR,arg[++i]); #if defined(LMP_USER_OMP) error->all(FLERR,"Please set number of threads via package omp command"); #else @@ -1251,15 +1251,15 @@ void PairSNAP::settings(int narg, char **arg) continue; } if (strcmp(arg[i],"optimized")==0) { - use_optimized=atoi(arg[++i]); + use_optimized=force->inumeric(FLERR,arg[++i]); continue; } if (strcmp(arg[i],"shared")==0) { - use_shared_arrays=atoi(arg[++i]); + use_shared_arrays=force->inumeric(FLERR,arg[++i]); continue; } if (strcmp(arg[i],"loadbalance")==0) { - do_load_balance = atoi(arg[++i]); + do_load_balance = force->inumeric(FLERR,arg[++i]); if (do_load_balance) { double mincutoff = extra_cutoff() + rcutmax + neighbor->skin; @@ -1321,8 +1321,8 @@ void PairSNAP::settings(int narg, char **arg) use_shared_arrays || do_load_balance || schedule_user) - error->all(FLERR,"Illegal pair_style command."); - + error->all(FLERR,"Illegal pair_style command." 
+ "Advanced options require setting 'optimized 1'."); } /* ---------------------------------------------------------------------- From b843c808a5038a3fb3cdbe8b1c4b6a36477407c8 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 19:23:31 +0000 Subject: [PATCH 09/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12455 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/GPU/fix_gpu.cpp | 112 +++++++++++++++++++++++---------------- src/USER-OMP/fix_omp.cpp | 25 +++++---- src/input.cpp | 1 - src/lammps.cpp | 6 +-- 4 files changed, 84 insertions(+), 60 deletions(-) diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index a386067289..9bbee8ff2b 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -80,68 +80,89 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (lmp->citeme) lmp->citeme->add(cite_gpu_package); if (lmp->cuda) - error->all(FLERR,"Cannot use fix GPU with USER-CUDA mode enabled"); + error->all(FLERR,"Cannot use GPU package with USER-CUDA package enabled"); - if (narg < 7) error->all(FLERR,"Illegal fix GPU command"); - if (strcmp(arg[1],"all") != 0) error->all(FLERR,"Illegal fix GPU command"); + if (narg < 4) error->all(FLERR,"Illegal package gpu command"); + if (strcmp(arg[1],"all") != 0) + error->all(FLERR,"Illegal package gpu command"); - int first_gpu, last_gpu; - - if (strcmp(arg[3],"force") == 0) - _gpu_mode = GPU_FORCE; - else if (strcmp(arg[3],"force/neigh") == 0) { - _gpu_mode = GPU_NEIGH; - if (domain->triclinic) - error->all(FLERR,"Cannot use force/neigh with triclinic box"); - } else if (strcmp(arg[3],"force/hybrid_neigh") == 0) { - _gpu_mode = GPU_HYB_NEIGH; - if (domain->triclinic) - error->all(FLERR, - "Cannot use force/hybrid_neigh with triclinic box"); - } else - error->all(FLERR,"Illegal fix GPU command"); - - first_gpu = force->inumeric(FLERR,arg[4]); - last_gpu = force->inumeric(FLERR,arg[5]); - - _particle_split = force->numeric(FLERR,arg[6]); - if (_particle_split==0 || _particle_split>1) - error->all(FLERR,"Illegal fix GPU command"); + int ngpu = atoi(arg[3]); + if (ngpu <= 0) error->all(FLERR,"Illegal package gpu command"); + int first_gpu = 0; + int last_gpu = ngpu-1; + + // options + _gpu_mode = GPU_NEIGH; + _particle_split = 1.0; + int newtonflag = 0; int nthreads = 1; int threads_per_atom = -1; - double cell_size = -1; - - int iarg = 7; + double binsize = -1; char *opencl_flags = NULL; + + int iarg = 4; while (iarg < narg) { - if (iarg+2 > narg) error->all(FLERR,"Illegal fix GPU command"); - - if (strcmp(arg[iarg],"threads_per_atom") == 0) + if (strcmp(arg[iarg],"neigh") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + if (strcmp(arg[iarg]+1,"yes") == 0) _gpu_mode = GPU_NEIGH; + else if (strcmp(arg[iarg]+1,"no") == 0) _gpu_mode = GPU_FORCE; + else if (strcmp(arg[iarg]+1,"hybrid") == 0) _gpu_mode = GPU_HYB_NEIGH; + else error->all(FLERR,"Illegal package gpu command"); + iarg += 2; + } else if (strcmp(arg[iarg],"split") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + _particle_split = force->numeric(FLERR,arg[iarg+1]); + if (_particle_split <= 0.0 || _particle_split > 1.0) + error->all(FLERR,"Illegal package GPU command"); + iarg += 2; + } else if (strcmp(arg[iarg],"newton") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + if (strcmp(arg[iarg]+1,"off") == 0) newtonflag = 0; + else if (strcmp(arg[iarg]+1,"on") == 0) newtonflag = 1; + iarg += 2; + } else if (strcmp(arg[iarg],"gpuID") == 0) { + if (iarg+3 > narg) error->all(FLERR,"Illegal 
package gpu command"); + first_gpu = force->inumeric(FLERR,arg[iarg+1]); + last_gpu = force->inumeric(FLERR,arg[iarg+2]); + iarg += 3; + } else if (strcmp(arg[iarg],"tpa") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); threads_per_atom = force->inumeric(FLERR,arg[iarg+1]); - else if (strcmp(arg[iarg],"nthreads") == 0) + iarg += 2; + } else if (strcmp(arg[iarg],"nthreads") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); nthreads = force->inumeric(FLERR,arg[iarg+1]); - else if (strcmp(arg[iarg],"cellsize") == 0) - cell_size = force->numeric(FLERR,arg[iarg+1]); - else if (strcmp(arg[iarg],"device") == 0) + if (nthreads < 1) error->all(FLERR,"Illegal fix GPU command"); + iarg += 2; + } else if (strcmp(arg[iarg],"binsize") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + binsize = force->numeric(FLERR,arg[iarg+1]); + if (binsize <= 0.0) error->all(FLERR,"Illegal fix GPU command"); + iarg += 2; + } else if (strcmp(arg[iarg],"device") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); opencl_flags = arg[iarg+1]; - else - error->all(FLERR,"Illegal fix GPU command"); - - iarg += 2; + iarg += 2; + } else error->all(FLERR,"Illegal package gpu command"); } - if (nthreads < 1) - error->all(FLERR,"Illegal fix GPU command"); + // error check + + if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) && + domain->triclinic) + error->all(FLERR,"Cannot use package gpu neigh yes with triclinic box"); #ifndef _OPENMP if (nthreads > 1) error->all(FLERR,"No OpenMP support compiled in"); #endif + // pass params to GPU library + int gpu_flag = lmp_init_device(universe->uworld, world, first_gpu, last_gpu, _gpu_mode, _particle_split, nthreads, - threads_per_atom, cell_size, opencl_flags); + threads_per_atom, binsize, opencl_flags); GPU_EXTRA::check_flag(gpu_flag,error,world); } @@ -214,10 +235,9 @@ void FixGPU::setup(int vflag) error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds"); - if (strstr(update->integrate_style,"verlet")) - post_force(vflag); + if (strstr(update->integrate_style,"verlet")) post_force(vflag); else { - // In setup only, all forces calculated on gpu are put in the outer level + // in setup only, all forces calculated on GPU are put in the outer level ((Respa *) update->integrate)->copy_flevel_f(_nlevels_respa-1); post_force(vflag); ((Respa *) update->integrate)->copy_f_flevel(_nlevels_respa-1); @@ -273,7 +293,7 @@ void FixGPU::post_force_respa(int vflag, int ilevel, int iloop) double FixGPU::memory_usage() { double bytes = 0.0; - // Memory usage currently returned by pair routine + // memory usage currently returned by pair routine return bytes; } diff --git a/src/USER-OMP/fix_omp.cpp b/src/USER-OMP/fix_omp.cpp index 9b0771ceef..28a99bb53a 100644 --- a/src/USER-OMP/fix_omp.cpp +++ b/src/USER-OMP/fix_omp.cpp @@ -76,7 +76,7 @@ FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg) int nthreads = 1; if (narg > 3) { #if defined(_OPENMP) - if (strcmp(arg[3],"*") == 0) + if (strcmp(arg[3],"0") == 0) #pragma omp parallel default(none) shared(nthreads) nthreads = omp_get_num_threads(); else @@ -96,22 +96,27 @@ FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg) comm->nthreads = nthreads; } + // optional keywords + int iarg = 4; while (iarg < narg) { - if (strcmp(arg[iarg],"force/neigh") == 0) - _neighbor = true; - else if (strcmp(arg[iarg],"force") == 0) - _neighbor = false; - else if (strcmp(arg[iarg],"mixed") == 0) + if (strcmp(arg[iarg],"neigh") == 0) { + if 
(iarg+2 > narg) error->all(FLERR,"Illegal package omp command"); + if (strcmp(arg[iarg]+1,"yes") == 0) _neighbor = true; + else if (strcmp(arg[iarg]+1,"no") == 0) _neighbor = false; + else error->all(FLERR,"Illegal package omp command"); + iarg += 2; + } else if (strcmp(arg[iarg],"mixed") == 0) { _mixed = true; - else if (strcmp(arg[iarg],"double") == 0) + iarg++; + } else if (strcmp(arg[iarg],"double") == 0) { _mixed = false; - else - error->all(FLERR,"Illegal package omp mode requested"); - ++iarg; + iarg++; + } else error->all(FLERR,"Illegal package omp command"); } // print summary of settings + if (comm->me == 0) { const char * const nmode = _neighbor ? "multi-threaded" : "serial"; const char * const kmode = _mixed ? "mixed" : "double"; diff --git a/src/input.cpp b/src/input.cpp index 18f9ddf65d..30dd2d3019 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -1418,7 +1418,6 @@ void Input::package() for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i]; modify->add_fix(2+narg,fixarg); delete [] fixarg; - force->newton_pair = 0; } else if (strcmp(arg[0],"kokkos") == 0) { if (!lmp->kokkos) diff --git a/src/lammps.cpp b/src/lammps.cpp index c236961a87..2d2625bc4c 100644 --- a/src/lammps.cpp +++ b/src/lammps.cpp @@ -627,12 +627,12 @@ void LAMMPS::post_create() if (suffix) { if (strcmp(suffix,"gpu") == 0) input->one("package gpu force/neigh 0 0 1"); - if (strcmp(suffix,"omp") == 0) input->one("package omp *"); + if (strcmp(suffix,"omp") == 0) input->one("package omp 0"); if (strcmp(suffix,"intel") == 0) - input->one("package intel * mixed balance -1"); + input->one("package intel mixed balance -1"); } if (suffix2) { - if (strcmp(suffix,"omp") == 0) input->one("package omp *"); + if (strcmp(suffix,"omp") == 0) input->one("package omp 0"); } } From 3f532868367c443de85de6fd65b4e85a5dc244d2 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 21:14:55 +0000 Subject: [PATCH 10/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12456 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_accelerate.html | 21 +-- doc/Section_accelerate.txt | 21 +-- doc/package.html | 323 +++++++++++++++++++---------------- doc/package.txt | 325 ++++++++++++++++++++---------------- 4 files changed, 381 insertions(+), 309 deletions(-) diff --git a/doc/Section_accelerate.html b/doc/Section_accelerate.html index 287e242bc6..041c38ce14 100644 --- a/doc/Section_accelerate.html +++ b/doc/Section_accelerate.html @@ -864,8 +864,6 @@ per physical GPU.

    You must use the "-c on" command-line switch to enable the USER-CUDA package. -This also issues a default package cuda 2 command which -sets the number of GPUs/node to use to 2.

    Use the "-sf cuda" command-line switch, which will automatically append "cuda" to styles that support it. Use @@ -876,11 +874,11 @@ set Ng = # of GPUs per node. mpirun -np 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script # 2 MPI tasks use 2 GPUs on a single 16-core (or whatever) node mpirun -np 24 -ppn 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script # ditto on 12 16-core nodes

    -

    Using the "-pk" switch explicitly allows for direct setting of the -number of GPUs/node to use and additional options. Its syntax is the -same as same as the "package cuda" command. See the -package command doc page for details, including the -default values used for all its options if it is not specified. +

    The "-pk" switch must be used (unless the package cuda +command is used in the input script) to set the number of GPUs/node to +use. It also allows for setting of additional options. Its syntax is +the same as same as the "package cuda" command. See the +package command doc page for details.

    Or run with the USER-CUDA package by editing an input script:

    @@ -889,17 +887,16 @@ of one MPI task per GPU is the same.

    You must still use the "-c on" command-line switch to enable the USER-CUDA package. -This also issues a default package cuda 2 command which -sets the number of GPUs/node to use to 2.

    Use the suffix cuda command, or you can explicitly add a "cuda" suffix to individual styles in your input script, e.g.

    pair_style lj/cut/cuda 2.5 
     
    -

    You only need to use the package cuda command if you -wish to change the number of GPUs/node to use or its other option -defaults. +

You must use the package cuda command to set the +number of GPUs/node, unless the "-pk" command-line +switch was used. The command also +allows for setting of additional options.
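
A minimal input-script sketch (assuming 2 GPUs per node and that the "-c on" switch was given on the command line) is:

package cuda 2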

    Speed-ups to expect:

    diff --git a/doc/Section_accelerate.txt b/doc/Section_accelerate.txt index b7e67559cf..ed2a5b93dd 100644 --- a/doc/Section_accelerate.txt +++ b/doc/Section_accelerate.txt @@ -858,8 +858,6 @@ per physical GPU. You must use the "-c on" "command-line switch"_Section_start.html#start_7 to enable the USER-CUDA package. -This also issues a default "package cuda 2"_package.html command which -sets the number of GPUs/node to use to 2. Use the "-sf cuda" "command-line switch"_Section_start.html#start_7, which will automatically append "cuda" to styles that support it. Use @@ -870,11 +868,11 @@ lmp_machine -c on -sf cuda -pk cuda 1 -in in.script # 1 MP mpirun -np 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script # 2 MPI tasks use 2 GPUs on a single 16-core (or whatever) node mpirun -np 24 -ppn 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script # ditto on 12 16-core nodes :pre -Using the "-pk" switch explicitly allows for direct setting of the -number of GPUs/node to use and additional options. Its syntax is the -same as same as the "package cuda" command. See the -"package"_package.html command doc page for details, including the -default values used for all its options if it is not specified. +The "-pk" switch must be used (unless the "package cuda"_package.html +command is used in the input script) to set the number of GPUs/node to +use. It also allows for setting of additional options. Its syntax is +the same as same as the "package cuda" command. See the +"package"_package.html command doc page for details. [Or run with the USER-CUDA package by editing an input script:] @@ -883,17 +881,16 @@ of one MPI task per GPU is the same. You must still use the "-c on" "command-line switch"_Section_start.html#start_7 to enable the USER-CUDA package. -This also issues a default "package cuda 2"_pacakge.html command which -sets the number of GPUs/node to use to 2. Use the "suffix cuda"_suffix.html command, or you can explicitly add a "cuda" suffix to individual styles in your input script, e.g. pair_style lj/cut/cuda 2.5 :pre -You only need to use the "package cuda"_package.html command if you -wish to change the number of GPUs/node to use or its other option -defaults. +You must use the "package cuda"_package.html command to set the the +number of GPUs/node, unless the "-pk" "command-line +switch"_Section_start.html#start_7 was used. The command also +allows for setting of additional options. [Speed-ups to expect:] diff --git a/doc/package.html b/doc/package.html index b4cec046ce..057b5bd344 100644 --- a/doc/package.html +++ b/doc/package.html @@ -19,30 +19,35 @@
  • args = arguments specific to the style -
      cuda args = keyword value ...
    -    one or more keyword/value pairs may be appended
    -    keywords = gpu/node or gpu/node/special or timing or test or override/bpa
    -      gpu/node value = N
    -        N = number of GPUs to be used per node
    -      gpu/node/special values = N gpu1 .. gpuN
    -        N = number of GPUs to be used per node
    -        gpu1 .. gpuN = N IDs of the GPUs to use
    +
      cuda args = Ngpu keyword value ...
    +    Ngpu = # of GPUs per node
    +    zero or more keyword/value pairs may be appended
    +    keywords = gpuID or timing or test or thread
    +      gpuID values = gpu1 .. gpuN
    +        gpu1 .. gpuN = IDs of the Ngpu GPUs to use
           timing values = none
           test values = id
             id = atom-ID of a test particle
    -      override/bpa values = flag
    -        flag = 0 for TpA algorithm, 1 for BpA algorithm 
    -  gpu args = mode first last split keyword value ...
    -    mode = force or force/neigh
    -    first = ID of first GPU to be used on each node
    -    last = ID of last GPU to be used on each node
    -    split = fraction of particles assigned to the GPU
    -    zero or more keyword/value pairs may be appended
    -    keywords = threads_per_atom or cellsize or device
    -      threads_per_atom value = Nthreads
    +      thread = auto or tpa or bpa
    +        auto = test whether tpa or bpa is faster
    +        tpa = one thread per atom
    +        bpa = one block per atom
    +  gpu args = Ngpu keyword value ...
    +    Ngpu = # of GPUs per node
    +    zero or more keyword/value pairs may be appended 
    +    keywords = neigh or split or gpuID or tpa or binsize or device
    +      neigh value = yes or no
    +        yes = neighbor list build on GPU (default)
    +        no = neighbor list build on CPU
    +      split = fraction
    +        fraction = fraction of atoms assigned to GPU (default = 1.0)
    +      gpuID values = first last
    +        first = ID of first GPU to be used on each node
    +        last = ID of last GPU to be used on each node
    +      tpa value = Nthreads
             Nthreads = # of GPU threads used per atom
    -      cellsize value = dist
    -        dist = length (distance units) in each dimension for neighbor bins
    +      binsize value = size
    +        size = bin size for neighbor list construction (distance units)
           device value = device_type
             device_type = kepler or fermi or cypress or generic
       intel args = Nthreads precision keyword value ...
    @@ -66,21 +71,20 @@
           comm/exchange value = no or host or device
           comm/forward value = no or host or device
       omp args = Nthreads keyword value ...
    -    Nthreads = # of OpenMP threads to associate with each MPI process
    +    Nthread = # of OpenMP threads to associate with each MPI process
         zero or more keyword/value pairs may be appended 
         keywords = neigh
           neigh value = yes or no
    -        yes = threaded neighbor list build (default)
    -        no = non-threaded neighbor list build 
    +        yes = threaded neighbor list build (default)
    +        no = non-threaded neighbor list build 
     

    Examples:

    -
    package gpu force 0 0 1.0
    -package gpu force 0 0 0.75
    -package gpu force/neigh 0 0 1.0
    -package gpu force/neigh 0 1 -1.0
    +
    package gpu 1
    +package gpu 1 split 0.75
    +package gpu 2 split -1.0
     package cuda gpu/node/special 2 0 2
     package cuda test 3948
     package kokkos neigh half/thread comm/forward device
    @@ -94,6 +98,10 @@ package intel * mixed balance -1
     following packages use it: USER-CUDA, GPU, USER-INTEL, KOKKOS, and
     USER-OMP.
     

    +

    Talk about command line switches +

    +

    When does it have to be invoked +

    To use the accelerated GPU and USER-OMP styles, the use of the package command is required. However, as described in the "Defaults" section below, if you use the "-sf gpu" or "-sf omp" command-line @@ -109,98 +117,105 @@ need to use the package command if you want to change the defaults. more details about using these various packages for accelerating LAMMPS calculations.

    +

    Package GPU always sets newton pair off. Not so for USER-CUDA> +


    -

    The cuda style invokes options associated with the use of the -USER-CUDA package. +

    The cuda style invokes settings associated with the use of the +USER-CUDA package.

    -

    The gpu/node keyword specifies the number N of GPUs to be used on -each node. An MPI process with rank K will use the GPU (K mod N). -This implies that processes should be assigned with successive ranks -on each node, which is the default with most (or even all) MPI -implementations. The default value for N is 2. +

    The Ngpus argument sets the number of GPUs per node. There must be +exactly one MPI task per GPU, as set by the mpirun or mpiexec command.
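
For example (a hypothetical node with 2 GPUs; launcher options vary between MPI implementations), one MPI task would be started per GPU:

mpirun -np 2 lmp_machine -c on -sf cuda -in in.script   # launch command, 2 MPI tasks
package cuda 2                                          # input script command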

    -

    The gpu/node/special keyword also specifies the number (N) of GPUs -to be used on each node, but allows more control over their -specification. An MPI process with rank K will use the GPU gpuI -with l = (K mod N) + 1. This implies that processes should be assigned -with successive ranks on each node, which is the default with most (or -even all) MPI implementations. For example if you have three GPUs on -a machine, one of which is used for the X-Server (the GPU with the ID -1) while the others (with IDs 0 and 2) are used for computations you -would specify: +

    Optional keyword/value pairs can also be specified. Each has a +default value as listed below.

    -
    package cuda gpu/node/special 2 0 2 
    +

    The gpuID keyword allows selection of which GPUs on each node will +be used for a simulation. GPU IDs range from 0 to N-1 where N is the +physical number of GPUs/node. An ID is specified for each of the +Ngpus being used. For example if you have three GPUs on a machine, +one of which is used for the X-Server (the GPU with the ID 1) while +the others (with IDs 0 and 2) are used for computations you would +specify: +

    +
    package cuda 2 gpuID 0 2 
     
    -

    A main purpose of the gpu/node/special optoin is to allow two (or -more) simulations to be run on one workstation. In that case one -would set the first simulation to use GPU 0 and the second to use GPU -1. This is not necessary though, if the GPUs are in what is called -compute exclusive mode. Using that setting, every process will get -its own GPU automatically. This compute exclusive mode can be set -as root using the nvidia-smi tool which is part of the CUDA -installation. +

    The purpose of the gpuID keyword is to allow two (or more) +simulations to be run on one workstation. In that case one could set +the first simulation to use GPU 0 and the second to use GPU 1. This is +not necessary however, if the GPUs are in what is called compute +exclusive mode. Using that setting, every process will get its own +GPU automatically. This compute exclusive mode can be set as root +using the nvidia-smi tool which is part of the CUDA installation.
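
A sketch of that scenario, i.e. two independent single-GPU simulations on one hypothetical workstation, would use:

package cuda 1 gpuID 0   # input script of the first simulation
package cuda 1 gpuID 1   # input script of the second simulation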

    -

    Note that if the gpu/node/special keyword is not used, the USER-CUDA +

Also note that if the gpuID keyword is not used, the USER-CUDA package sorts existing GPUs on each node according to their number of multiprocessors. This way, compute GPUs will be prioritized over X-Server GPUs.

    -

    Use of the timing keyword will output detailed timing information -for various subroutines. +

    If the timing keyword is specified, detailed timing information for +various subroutines will be output.

    -

    The test keyword will output info for the the specified atom at -several points during each time step. This is mainly usefull for -debugging purposes. Note that the simulation will be severly slowed -down if this option is used. +

If the test keyword is specified, information for the specified atom +with atom-ID will be output at several points during each timestep. +This is mainly useful for debugging purposes. Note that the +simulation will slow down dramatically if this option is used.

    -

    The override/bpa keyword can be used to specify which mode is used -for pair-force evaluation. TpA = one thread per atom; BpA = one block -per atom. If this keyword is not used, a short test at the begin of -each run will determine which method is more effective (the result of -this test is part of the LAMMPS output). Therefore it is usually not -necessary to use this keyword. +

The thread keyword can be used to specify how GPU threads are +assigned work during pair style force evaluation. If the value = +tpa, one thread per atom is used. If the value = bpa, one block +per atom is used. If the value = auto, a short test is performed at +the beginning of each run to determine whether tpa or bpa mode is +faster. The result of this test is output. Since auto is the +default value, it is usually not necessary to use this keyword.
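
For example, to force the block-per-atom kernel instead of relying on the automatic test, one could specify:

package cuda 2 thread bpa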


    -

    The gpu style invokes options associated with the use of the GPU -package. +

The gpu style invokes settings associated with the use of +the GPU package.

    -

    The mode setting specifies where neighbor list calculations will be -performed. If mode is force, neighbor list calculation is performed -on the CPU. If mode is force/neigh, neighbor list calculation is -performed on the GPU. GPU neighbor list calculation currently cannot -be used with a triclinic box. GPU neighbor list calculation currently -cannot be used with hybrid pair styles. GPU -neighbor lists are not compatible with styles that are not -GPU-enabled. When a non-GPU enabled style requires a neighbor list, -it will also be built using CPU routines. In these cases, it will -typically be more efficient to only use CPU neighbor list builds. +

    The Ngpu argument sets the number of GPUs per node. There must be +at least as many MPI tasks per node as GPUs, as set by the mpirun or +mpiexec command. If there are more MPI tasks (per node) +than GPUs, multiple MPI tasks will share each GPU.
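
As an illustration (assuming a single hypothetical node with 8 CPU cores and 2 GPUs; launcher options vary between MPI implementations), 8 MPI tasks would share the 2 GPUs, 4 tasks per GPU:

mpirun -np 8 lmp_machine -sf gpu -in in.script   # launch command
package gpu 2                                    # input script command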

    -

    The first and last settings specify the GPUs that will be used for -simulation. On each node, the GPU IDs in the inclusive range from -first to last will be used. +

    Optional keyword/value pairs can also be specified. Each has a +default value as listed below.

    -

    The split setting can be used for load balancing force calculation -work between CPU and GPU cores in GPU-enabled pair styles. If 0 < -split < 1.0, a fixed fraction of particles is offloaded to the GPU -while force calculation for the other particles occurs simulataneously -on the CPU. If split<0, the optimal fraction (based on CPU and GPU -timings) is calculated every 25 timesteps. If split = 1.0, all force -calculations for GPU accelerated pair styles are performed on the -GPU. In this case, hybrid, bond, -angle, dihedral, -improper, and long-range -calculations can be performed on the CPU while the GPU is performing -force calculations for the GPU-enabled pair style. If all CPU force -computations complete before the GPU, LAMMPS will block until the GPU -has finished before continuing the timestep. +

The neigh keyword specifies where neighbor lists for pair style +computation will be built. If neigh is yes, which is the default, +neighbor list building is performed on the GPU. If neigh is no, +neighbor list building is performed on the CPU. GPU neighbor list +building currently cannot be used with a triclinic box. GPU neighbor +list calculation currently cannot be used with +hybrid pair styles. GPU neighbor lists are not +compatible with commands that are not GPU-enabled. When a non-GPU +enabled command requires a neighbor list, it will also be built on the +CPU. In these cases, it will typically be more efficient to only use +CPU neighbor list builds.
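
For example, to fall back to CPU neighbor list builds, e.g. for a triclinic box or a hybrid pair style, one could use:

package gpu 1 neigh no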

    +

The split keyword can be used for load balancing force calculations +between CPU and GPU cores in GPU-enabled pair styles. If 0 < split < +1.0, a fixed fraction of particles is offloaded to the GPU while force +calculation for the other particles occurs simultaneously on the +CPU. If split < 0.0, the optimal fraction (based on CPU and GPU +timings) is calculated every 25 timesteps. If split = 1.0, all +force calculations for GPU accelerated pair styles are performed on +the GPU. In this case, other hybrid pair +interactions, bond, angle, +dihedral, improper, and +long-range calculations can be performed on the +CPU while the GPU is performing force calculations for the GPU-enabled +pair style. If all CPU force computations complete before the GPU +completes, LAMMPS will block until the GPU has finished before +continuing the timestep.

    As an example, if you have two GPUs per node and 8 CPU cores per node, and would like to run on 4 nodes (32 cores) with dynamic balancing of force calculation across CPU and GPU cores, you could specify

    -
    package gpu force/neigh 0 1 -1 
    +
    mpirun -np 32 -sf gpu -in in.script    # launch command
    +package gpu 2 split -1                 # input script command 
     

    In this case, all CPU cores and GPU devices on the nodes would be utilized. Each GPU device would be shared by 4 CPU cores. The CPU @@ -208,38 +223,51 @@ cores would perform force calculations for some fraction of the particles at the same time the GPUs performed force calculation for the other particles.

    -

    The threads_per_atom keyword allows control of the number of GPU -threads used per-atom to perform the short range force calculation. -By default, the value will be chosen based on the pair style, however, -the value can be set with this keyword to fine-tune performance. For +

    The gpuID keyword allows selection of which GPUs on each node will +be used for a simulation. The first and last values specify the +GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last = +Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number +of physical GPUs. If you only wish to use a subset, set Ngpu to a +smaller number and first/last to a sub-range of the available GPUs. +
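
For instance, on a hypothetical node with 2 physical GPUs where only the second one (ID 1) should be used:

package gpu 1 gpuID 1 1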

    +

The tpa keyword sets the number of GPU threads per atom used to +perform force calculations. With a default value of 1, the number of +threads will be chosen based on the pair style; however, the value can +be set explicitly with this keyword to fine-tune performance. For large cutoffs or with a small number of particles per GPU, increasing the value can improve performance. The number of threads per atom must be a power of 2 and currently cannot be greater than 32.
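
For example, with a long cutoff one might try a larger power-of-2 value, e.g.:

package gpu 1 tpa 8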

    -

    The cellsize keyword can be used to control the size of the cells used -for binning atoms in neighbor list calculations. Setting this value is -normally not needed; the optimal value is close to the default -(equal to the cutoff distance for the short range interactions -plus the neighbor skin). GPUs can perform efficiently with much larger cutoffs -than CPUs and this can be used to reduce the time required for long-range -calculations or in some cases to eliminate them with models such as -coul/wolf or coul/dsf. For very large cutoffs, -it can be more efficient to use smaller values for cellsize in parallel -simulations. For example, with a cutoff of 20*sigma and a neighbor skin of -sigma, a cellsize of 5.25*sigma can be efficient for parallel simulations. +

The binsize keyword sets the size of bins used to bin atoms in +neighbor list builds. Setting this value is normally not needed; the +optimal value is close to the default, which is set equal to the +cutoff distance for the short range interactions plus the neighbor +skin. Note that this is 2x larger than the default bin size for +neighbor list builds on the CPU. This is because GPUs can perform +efficiently with much larger cutoffs than CPUs. This can be used to +reduce the time required for long-range calculations or in some cases +to eliminate them with pair style models such as +coul/wolf or coul/dsf. For very +large cutoffs, it can be more efficient to use smaller values for +binsize in parallel simulations. For example, with a cutoff of +20*sigma in LJ units and a neighbor skin distance of +sigma, a binsize = 5.25*sigma can be more efficient than the +default.
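
Written as a command, that example (which assumes LJ units with sigma = 1.0) would be:

package gpu 1 binsize 5.25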

    -

    The device keyword can be used to tune parameters to optimize for a specific -accelerator when using OpenCL. For CUDA, the device keyword is ignored. -Currently, the device type is limited to NVIDIA Kepler, NVIDIA Fermi, -AMD Cypress, or a generic device. More devices will be added soon. The default -device type can be specified when building LAMMPS with the GPU library. +

    The device keyword can be used to tune parameters optimized for a +specific accelerator, when using OpenCL. For CUDA, the device +keyword is ignored. Currently, the device type is limited to NVIDIA +Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices +may be added later. The default device type can be specified when +building LAMMPS with the GPU library, via settings in the +lib/gpu/Makefile that is used.


    The intel style invokes options associated with the use of the USER-INTEL package.

    -

    The Nthreads argument allows to one explicitly set the number of +

The Nthread argument allows one to explicitly set the number of OpenMP threads to be allocated for each MPI process. An Nthreads value of '*' instructs LAMMPS to use whatever is the default for the given OpenMP environment. This is usually determined via the @@ -259,7 +287,7 @@ additional settings are as follows:

    The balance setting is used to set the fraction of work offloaded to the coprocessor for an intel style (in the inclusive range 0.0 to -1.0). While this fraction of work is running on the coprocessor, other +1.0). While this fraction of work is running on the coprocessor, other calculations will run on the host, including neighbor and pair calculations that are not offloaded, angle, bond, dihedral, kspace, and some MPI communications. If the balance is set to -1, the fraction @@ -350,19 +378,19 @@ multiple threads to pack/unpack communicated data.


    -

    The omp style invokes options associated with the use of the +

    The omp style invokes settings associated with the use of the USER-OMP package.

    -

    The first argument sets the number of OpenMP threads allocated for -each MPI process or task. For example, if your system has nodes with -dual quad-core processors, it has a total of 8 cores per node. You -could two MPI tasks per node (e.g. using the -ppn option of the mpirun -command), and set Nthreads = 4. This would effectively use all 8 -cores on each node. Note that the product of MPI tasks * threads/task -should not exceed the physical number of cores (on a node), otherwise -performance will suffer. +

    The Nthread argument sets the number of OpenMP threads allocated for +each MPI task. For example, if your system has nodes with dual +quad-core processors, it has a total of 8 cores per node. You could +use two MPI tasks per node (e.g. using the -ppn option of the mpirun +command), and set Nthreads = 4. This would use all 8 cores on each +node. Note that the product of MPI tasks * threads/task should not +exceed the physical number of cores (on a node), otherwise performance +will suffer.
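
That layout (2 MPI tasks per node, 4 threads each, on one hypothetical dual quad-core node; the exact mpirun options depend on your MPI implementation) could be written as:

mpirun -np 2 lmp_machine -sf omp -in in.script   # launch command, 1 node
package omp 4                                    # input script command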

    -

    An Nthreads value of 0 instructs LAMMPS to use whatever value is the +

    Setting Nthread = 0 instructs LAMMPS to use whatever value is the default for the given OpenMP environment. This is usually determined via the OMP_NUM_THREADS environment variable or the compiler runtime. Note that in most cases the default for OpenMP capable @@ -391,6 +419,9 @@ input. Not all features of LAMMPS support OpenMP threading via the USER-OMP packaage and the parallel efficiency can be very different, too.

    +

    Optional keyword/value pairs can also be specified. Each has a +default value as listed below. +

    The neigh keyword specifies whether neighbor list building will be multi-threaded in addition to force calculations. If neigh is set to no then neighbor list calculation is performed only by MPI tasks @@ -416,6 +447,10 @@ LAMMPS section for more info. with the GPU package. See the Making LAMMPS section for more info.

    +

    The intel style of this command can only be invoked if LAMMPS was +built with the USER-INTEL package. See the Making +LAMMPS section for more info. +

    The kk style of this command can only be invoked if LAMMPS was built with the KOKKOS package. See the Making LAMMPS section for more info. @@ -426,19 +461,24 @@ LAMMPS section for more info.

    Related commands:

    -

    suffix +

    suffix, "-pk" command-line +setting

    Default:

    -

    The default settings for the USER-CUDA package are "package cuda gpu -2". This is the case whether the "-sf cuda" command-line -switch is used or not. +

To use the USER-CUDA package, the package command must be invoked +explicitly, either via the "-pk cuda" command-line +switch or by invoking the package cuda +command in your input script. This will set the # of GPUs/node. The +option defaults are gpuID = 0 to Ngpu-1, timing not enabled, test not +enabled, and thread = auto.

    -

    If the "-sf gpu" command-line switch is -used then it is as if the command "package gpu force/neigh 0 0 1" were -invoked, to specify default settings for the GPU package. If the -command-line switch is not used, then no defaults are set, and you -must specify the appropriate package command in your input script. +

    For the GPU package, the default is Ngpu = 1 and the option defaults +are neigh = yes, split = 1.0, gpuID = 0 to Ngpu-1, tpa = 1, binsize = +pair cutoff + neighbor skin, device = not used. These settings are +made if the "-sf gpu" command-line switch +is used. If it is not used, you must invoke the package gpu command +in your input script.

    The default settings for the USER-INTEL package are "package intel * mixed balance -1 offload_cards 1 offload_tpc 4 offload_threads 240". @@ -451,11 +491,10 @@ full comm/exchange host comm/forward host". This is the case whether the "-sf kk" command-line switch is used or not.

    -

    If the "-sf omp" command-line switch is -used then it is as if the command "package omp 0" were invoked, to -specify settings for the USER-OMP package. The option defaults are -neigh = yes. If the command-line switch is not used, then no defaults -are set, and you must specify the appropriate "package omp" command in -your input script. +

    For the OMP package, the default is Nthreads = 0 and the option +defaults are neigh = yes. These settings are made if the "-sf omp" +command-line switch is used. If it is +not used, you must invoke the package omp command in your input +script.

    diff --git a/doc/package.txt b/doc/package.txt index f810a46ebe..da6066c740 100644 --- a/doc/package.txt +++ b/doc/package.txt @@ -14,30 +14,35 @@ package style args :pre style = {cuda} or {gpu} or {intel} or {kokkos} or {omp} :ulb,l args = arguments specific to the style :l - {cuda} args = keyword value ... - one or more keyword/value pairs may be appended - keywords = {gpu/node} or {gpu/node/special} or {timing} or {test} or {override/bpa} - {gpu/node} value = N - N = number of GPUs to be used per node - {gpu/node/special} values = N gpu1 .. gpuN - N = number of GPUs to be used per node - gpu1 .. gpuN = N IDs of the GPUs to use + {cuda} args = Ngpu keyword value ... + Ngpu = # of GPUs per node + zero or more keyword/value pairs may be appended + keywords = {gpuID} or {timing} or {test} or {thread} + {gpuID} values = gpu1 .. gpuN + gpu1 .. gpuN = IDs of the Ngpu GPUs to use {timing} values = none {test} values = id id = atom-ID of a test particle - {override/bpa} values = flag - flag = 0 for TpA algorithm, 1 for BpA algorithm - {gpu} args = mode first last split keyword value ... - mode = force or force/neigh - first = ID of first GPU to be used on each node - last = ID of last GPU to be used on each node - split = fraction of particles assigned to the GPU - zero or more keyword/value pairs may be appended - keywords = {threads_per_atom} or {cellsize} or {device} - {threads_per_atom} value = Nthreads + {thread} = auto or tpa or bpa + auto = test whether tpa or bpa is faster + tpa = one thread per atom + bpa = one block per atom + {gpu} args = Ngpu keyword value ... + Ngpu = # of GPUs per node + zero or more keyword/value pairs may be appended + keywords = {neigh} or {split} or {gpuID} or {tpa} or {binsize} or {device} + {neigh} value = {yes} or {no} + yes = neighbor list build on GPU (default) + no = neighbor list build on CPU + {split} = fraction + fraction = fraction of atoms assigned to GPU (default = 1.0) + {gpuID} values = first last + first = ID of first GPU to be used on each node + last = ID of last GPU to be used on each node + {tpa} value = Nthreads Nthreads = # of GPU threads used per atom - {cellsize} value = dist - dist = length (distance units) in each dimension for neighbor bins + {binsize} value = size + size = bin size for neighbor list construction (distance units) {device} value = device_type device_type = {kepler} or {fermi} or {cypress} or {generic} {intel} args = Nthreads precision keyword value ... @@ -61,20 +66,19 @@ args = arguments specific to the style :l {comm/exchange} value = {no} or {host} or {device} {comm/forward} value = {no} or {host} or {device} {omp} args = Nthreads keyword value ... - Nthreads = # of OpenMP threads to associate with each MPI process + Nthread = # of OpenMP threads to associate with each MPI process zero or more keyword/value pairs may be appended keywords = {neigh} {neigh} value = {yes} or {no} - {yes} = threaded neighbor list build (default) - {no} = non-threaded neighbor list build :pre + yes = threaded neighbor list build (default) + no = non-threaded neighbor list build :pre :ule [Examples:] -package gpu force 0 0 1.0 -package gpu force 0 0 0.75 -package gpu force/neigh 0 0 1.0 -package gpu force/neigh 0 1 -1.0 +package gpu 1 +package gpu 1 split 0.75 +package gpu 2 split -1.0 package cuda gpu/node/special 2 0 2 package cuda test 3948 package kokkos neigh half/thread comm/forward device @@ -88,6 +92,10 @@ This command invokes package-specific settings. 
Currently the following packages use it: USER-CUDA, GPU, USER-INTEL, KOKKOS, and USER-OMP. +Talk about command line switches + +When does it have to be invoked + To use the accelerated GPU and USER-OMP styles, the use of the package command is required. However, as described in the "Defaults" section below, if you use the "-sf gpu" or "-sf omp" "command-line @@ -103,98 +111,105 @@ See "Section_accelerate"_Section_accelerate.html of the manual for more details about using these various packages for accelerating LAMMPS calculations. +Package GPU always sets newton pair off. Not so for USER-CUDA> + :line -The {cuda} style invokes options associated with the use of the -USER-CUDA package. +The {cuda} style invokes settings associated with the use of the +USER-CUDA package. -The {gpu/node} keyword specifies the number {N} of GPUs to be used on -each node. An MPI process with rank {K} will use the GPU (K mod N). -This implies that processes should be assigned with successive ranks -on each node, which is the default with most (or even all) MPI -implementations. The default value for {N} is 2. +The {Ngpus} argument sets the number of GPUs per node. There must be +exactly one MPI task per GPU, as set by the mpirun or mpiexec command. -The {gpu/node/special} keyword also specifies the number (N) of GPUs -to be used on each node, but allows more control over their -specification. An MPI process with rank {K} will use the GPU {gpuI} -with l = (K mod N) + 1. This implies that processes should be assigned -with successive ranks on each node, which is the default with most (or -even all) MPI implementations. For example if you have three GPUs on -a machine, one of which is used for the X-Server (the GPU with the ID -1) while the others (with IDs 0 and 2) are used for computations you -would specify: +Optional keyword/value pairs can also be specified. Each has a +default value as listed below. -package cuda gpu/node/special 2 0 2 :pre +The {gpuID} keyword allows selection of which GPUs on each node will +be used for a simulation. GPU IDs range from 0 to N-1 where N is the +physical number of GPUs/node. An ID is specified for each of the +Ngpus being used. For example if you have three GPUs on a machine, +one of which is used for the X-Server (the GPU with the ID 1) while +the others (with IDs 0 and 2) are used for computations you would +specify: -A main purpose of the {gpu/node/special} optoin is to allow two (or -more) simulations to be run on one workstation. In that case one -would set the first simulation to use GPU 0 and the second to use GPU -1. This is not necessary though, if the GPUs are in what is called -{compute exclusive} mode. Using that setting, every process will get -its own GPU automatically. This {compute exclusive} mode can be set -as root using the {nvidia-smi} tool which is part of the CUDA -installation. +package cuda 2 gpuID 0 2 :pre -Note that if the {gpu/node/special} keyword is not used, the USER-CUDA +The purpose of the {gpuID} keyword is to allow two (or more) +simulations to be run on one workstation. In that case one could set +the first simulation to use GPU 0 and the second to use GPU 1. This is +not necessary however, if the GPUs are in what is called {compute +exclusive} mode. Using that setting, every process will get its own +GPU automatically. This {compute exclusive} mode can be set as root +using the {nvidia-smi} tool which is part of the CUDA installation. 
+ +Also note that if the {gpuID} keyword is not used, the USER-CUDA package sorts existing GPUs on each node according to their number of multiprocessors. This way, compute GPUs will be priorized over X-Server GPUs. - -Use of the {timing} keyword will output detailed timing information -for various subroutines. -The {test} keyword will output info for the the specified atom at -several points during each time step. This is mainly usefull for -debugging purposes. Note that the simulation will be severly slowed -down if this option is used. +If the {timing} keyword is specified, detailed timing information for +various subroutines will be output. -The {override/bpa} keyword can be used to specify which mode is used -for pair-force evaluation. TpA = one thread per atom; BpA = one block -per atom. If this keyword is not used, a short test at the begin of -each run will determine which method is more effective (the result of -this test is part of the LAMMPS output). Therefore it is usually not -necessary to use this keyword. +If the {test} keyword is specified, information for the specified atom +with atom-ID will be output at several points during each timestep. +This is mainly usefull for debugging purposes. Note that the +simulation slow down dramatically if this option is used. + +The {thread} keyword can be used to specify how GPU threads are +assigned work during pair style force evaluation. If the value = +{tpa}, one thread per atom is used. If the value = {bpa}, one block +per atom is used. If the value = {auto}, a short test is performed at +the beginning of each run to determing where {tpa} or {bpa} mode is +faster. The result of this test is output. Since {auto} is the +default value, it is usually not necessary to use this keyword. :line -The {gpu} style invokes options associated with the use of the GPU -package. +The {gpu} style invokes settings settings associated with the use of +the GPU package. -The {mode} setting specifies where neighbor list calculations will be -performed. If {mode} is force, neighbor list calculation is performed -on the CPU. If {mode} is force/neigh, neighbor list calculation is -performed on the GPU. GPU neighbor list calculation currently cannot -be used with a triclinic box. GPU neighbor list calculation currently -cannot be used with "hybrid"_pair_hybrid.html pair styles. GPU -neighbor lists are not compatible with styles that are not -GPU-enabled. When a non-GPU enabled style requires a neighbor list, -it will also be built using CPU routines. In these cases, it will -typically be more efficient to only use CPU neighbor list builds. +The {Ngpu} argument sets the number of GPUs per node. There must be +at least as many MPI tasks per node as GPUs, as set by the mpirun or +mpiexec command. If there are more MPI tasks (per node) +than GPUs, multiple MPI tasks will share each GPU. -The {first} and {last} settings specify the GPUs that will be used for -simulation. On each node, the GPU IDs in the inclusive range from -{first} to {last} will be used. +Optional keyword/value pairs can also be specified. Each has a +default value as listed below. -The {split} setting can be used for load balancing force calculation -work between CPU and GPU cores in GPU-enabled pair styles. If 0 < -{split} < 1.0, a fixed fraction of particles is offloaded to the GPU -while force calculation for the other particles occurs simulataneously -on the CPU. If {split}<0, the optimal fraction (based on CPU and GPU -timings) is calculated every 25 timesteps. 
If {split} = 1.0, all force -calculations for GPU accelerated pair styles are performed on the -GPU. In this case, "hybrid"_pair_hybrid.html, "bond"_bond_style.html, -"angle"_angle_style.html, "dihedral"_dihedral_style.html, -"improper"_improper_style.html, and "long-range"_kspace_style.html -calculations can be performed on the CPU while the GPU is performing -force calculations for the GPU-enabled pair style. If all CPU force -computations complete before the GPU, LAMMPS will block until the GPU -has finished before continuing the timestep. +The {neigh} keyword specifies where neighbor lists for pair style +computation will be built. If {neigh} is {yes}, which is the default, +neighbor list building is performed on the GPU. If {neigh} is {no}, +neighbor list building is performed on the CPU. GPU neighbor list +building currently cannot be used with a triclinic box. GPU neighbor +list calculation currently cannot be used with +"hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not +compatible with comannds that are not GPU-enabled. When a non-GPU +enabled command requires a neighbor list, it will also be built on the +CPU. In these cases, it will typically be more efficient to only use +CPU neighbor list builds. + +The {split} keyword can be used for load balancing force calculations +between CPU and GPU cores in GPU-enabled pair styles. If 0 < {split} < +1.0, a fixed fraction of particles is offloaded to the GPU while force +calculation for the other particles occurs simulataneously on the +CPU. If {split} < 0.0, the optimal fraction (based on CPU and GPU +timings) is calculated every 25 timesteps. If {split} = 1.0, all +force calculations for GPU accelerated pair styles are performed on +the GPU. In this case, other "hybrid"_pair_hybrid.html pair +interactions, "bond"_bond_style.html, "angle"_angle_style.html, +"dihedral"_dihedral_style.html, "improper"_improper_style.html, and +"long-range"_kspace_style.html calculations can be performed on the +CPU while the GPU is performing force calculations for the GPU-enabled +pair style. If all CPU force computations complete before the GPU +completes, LAMMPS will block until the GPU has finished before +continuing the timestep. As an example, if you have two GPUs per node and 8 CPU cores per node, and would like to run on 4 nodes (32 cores) with dynamic balancing of force calculation across CPU and GPU cores, you could specify -package gpu force/neigh 0 1 -1 :pre +mpirun -np 32 -sf gpu -in in.script # launch command +package gpu 2 split -1 # input script command :pre In this case, all CPU cores and GPU devices on the nodes would be utilized. Each GPU device would be shared by 4 CPU cores. The CPU @@ -202,38 +217,51 @@ cores would perform force calculations for some fraction of the particles at the same time the GPUs performed force calculation for the other particles. -The {threads_per_atom} keyword allows control of the number of GPU -threads used per-atom to perform the short range force calculation. -By default, the value will be chosen based on the pair style, however, -the value can be set with this keyword to fine-tune performance. For +The {gpuID} keyword allows selection of which GPUs on each node will +be used for a simulation. The {first} and {last} values specify the +GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last = +Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number +of physical GPUs. 
If you only wish to use a subset, set Ngpu to a +smaller number and first/last to a sub-range of the available GPUs. + +The {tpa} keyword sets the number of GPU thread per atom used to +perform force calculations. With a default value of 1, the number of +threads will be chosen based on the pair style, however, the value can +be set explicitly with this keyword to fine-tune performance. For large cutoffs or with a small number of particles per GPU, increasing the value can improve performance. The number of threads per atom must be a power of 2 and currently cannot be greater than 32. -The {cellsize} keyword can be used to control the size of the cells used -for binning atoms in neighbor list calculations. Setting this value is -normally not needed; the optimal value is close to the default -(equal to the cutoff distance for the short range interactions -plus the neighbor skin). GPUs can perform efficiently with much larger cutoffs -than CPUs and this can be used to reduce the time required for long-range -calculations or in some cases to eliminate them with models such as -"coul/wolf"_pair_coul.html or "coul/dsf"_pair_coul.html. For very large cutoffs, -it can be more efficient to use smaller values for cellsize in parallel -simulations. For example, with a cutoff of 20*sigma and a neighbor skin of -sigma, a cellsize of 5.25*sigma can be efficient for parallel simulations. +The {binsize} keyword sets the size of bins used to bin atoms in +neighbor list builds. Setting this value is normally not needed; the +optimal value is close to the default, which is set equal to the +cutoff distance for the short range interactions plus the neighbor +skin. Note that this is 2x larger than the default bin size for +neighbor list builds on the CPU. This is becuase GPUs can perform +efficiently with much larger cutoffs than CPUs. This can be used to +reduce the time required for long-range calculations or in some cases +to eliminate them with pair style models such as +"coul/wolf"_pair_coul.html or "coul/dsf"_pair_coul.html. For very +large cutoffs, it can be more efficient to use smaller values for +{binsize} in parallel simulations. For example, with a cutoff of +20*sigma in LJ "units"_units.html and a neighbor skin distance of +sigma, a {binsize} = 5.25*sigma can be more efficient than the +default. -The {device} keyword can be used to tune parameters to optimize for a specific -accelerator when using OpenCL. For CUDA, the {device} keyword is ignored. -Currently, the device type is limited to NVIDIA Kepler, NVIDIA Fermi, -AMD Cypress, or a generic device. More devices will be added soon. The default -device type can be specified when building LAMMPS with the GPU library. +The {device} keyword can be used to tune parameters optimized for a +specific accelerator, when using OpenCL. For CUDA, the {device} +keyword is ignored. Currently, the device type is limited to NVIDIA +Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices +may be added later. The default device type can be specified when +building LAMMPS with the GPU library, via settings in the +lib/gpu/Makefile that is used. :line The {intel} style invokes options associated with the use of the USER-INTEL package. -The {Nthreads} argument allows to one explicitly set the number of +The {Nthread} argument allows to one explicitly set the number of OpenMP threads to be allocated for each MPI process, An {Nthreads} value of '*' instructs LAMMPS to use whatever is the default for the given OpenMP environment. 
This is usually determined via the @@ -253,7 +281,7 @@ additional settings are as follows: The {balance} setting is used to set the fraction of work offloaded to the coprocessor for an intel style (in the inclusive range 0.0 to -1.0). While this fraction of work is running on the coprocessor, other +1.0). While this fraction of work is running on the coprocessor, other calculations will run on the host, including neighbor and pair calculations that are not offloaded, angle, bond, dihedral, kspace, and some MPI communications. If the balance is set to -1, the fraction @@ -344,19 +372,19 @@ multiple threads to pack/unpack communicated data. :line -The {omp} style invokes options associated with the use of the +The {omp} style invokes settings associated with the use of the USER-OMP package. -The first argument sets the number of OpenMP threads allocated for -each MPI process or task. For example, if your system has nodes with -dual quad-core processors, it has a total of 8 cores per node. You -could two MPI tasks per node (e.g. using the -ppn option of the mpirun -command), and set {Nthreads} = 4. This would effectively use all 8 -cores on each node. Note that the product of MPI tasks * threads/task -should not exceed the physical number of cores (on a node), otherwise -performance will suffer. +The {Nthread} argument sets the number of OpenMP threads allocated for +each MPI task. For example, if your system has nodes with dual +quad-core processors, it has a total of 8 cores per node. You could +use two MPI tasks per node (e.g. using the -ppn option of the mpirun +command), and set {Nthreads} = 4. This would use all 8 cores on each +node. Note that the product of MPI tasks * threads/task should not +exceed the physical number of cores (on a node), otherwise performance +will suffer. -An {Nthreads} value of 0 instructs LAMMPS to use whatever value is the +Setting {Nthread} = 0 instructs LAMMPS to use whatever value is the default for the given OpenMP environment. This is usually determined via the {OMP_NUM_THREADS} environment variable or the compiler runtime. Note that in most cases the default for OpenMP capable @@ -385,6 +413,9 @@ input. Not all features of LAMMPS support OpenMP threading via the USER-OMP packaage and the parallel efficiency can be very different, too. +Optional keyword/value pairs can also be specified. Each has a +default value as listed below. + The {neigh} keyword specifies whether neighbor list building will be multi-threaded in addition to force calculations. If {neigh} is set to {no} then neighbor list calculation is performed only by MPI tasks @@ -410,6 +441,10 @@ The gpu style of this command can only be invoked if LAMMPS was built with the GPU package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. +The intel style of this command can only be invoked if LAMMPS was +built with the USER-INTEL package. See the "Making +LAMMPS"_Section_start.html#start_3 section for more info. + The kk style of this command can only be invoked if LAMMPS was built with the KOKKOS package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. @@ -420,19 +455,24 @@ LAMMPS"_Section_start.html#start_3 section for more info. [Related commands:] -"suffix"_suffix.html +"suffix"_suffix.html, "-pk" "command-line +setting"_Section_start.html#start_7 [Default:] -The default settings for the USER-CUDA package are "package cuda gpu -2". This is the case whether the "-sf cuda" "command-line -switch"_Section_start.html#start_7 is used or not. 
+To use the USER-CUDA package, the package command must be invoked +explicitly, either via the "-pk cuda" "command-line +switch"_Section_start.html#start_7 or by invoking the package cuda +command in your input script. This will set the # of GPUs/node. The +options defaults are gpuID = 0 to Ngpu-1, timing not enabled, test not +enabled, and thread = auto. -If the "-sf gpu" "command-line switch"_Section_start.html#start_7 is -used then it is as if the command "package gpu force/neigh 0 0 1" were -invoked, to specify default settings for the GPU package. If the -command-line switch is not used, then no defaults are set, and you -must specify the appropriate package command in your input script. +For the GPU package, the default is Ngpu = 1 and the option defaults +are neigh = yes, split = 1.0, gpuID = 0 to Ngpu-1, tpa = 1, binsize = +pair cutoff + neighbor skin, device = not used. These settings are +made if the "-sf gpu" "command-line switch"_Section_start.html#start_7 +is used. If it is not used, you must invoke the package gpu command +in your input script. The default settings for the USER-INTEL package are "package intel * mixed balance -1 offload_cards 1 offload_tpc 4 offload_threads 240". @@ -445,9 +485,8 @@ full comm/exchange host comm/forward host". This is the case whether the "-sf kk" "command-line switch"_Section_start.html#start_7 is used or not. -If the "-sf omp" "command-line switch"_Section_start.html#start_7 is -used then it is as if the command "package omp 0" were invoked, to -specify settings for the USER-OMP package. The option defaults are -neigh = yes. If the command-line switch is not used, then no defaults -are set, and you must specify the appropriate "package omp" command in -your input script. +For the OMP package, the default is Nthreads = 0 and the option +defaults are neigh = yes. These settings are made if the "-sf omp" +"command-line switch"_Section_start.html#start_7 is used. If it is +not used, you must invoke the package omp command in your input +script. 
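For quick reference, the package-command syntax documented by the doc changes above might be exercised in an input script roughly as follows. This is an illustrative sketch only, using the keywords and default values named in the updated doc pages; the specific values shown are arbitrary examples, not recommended settings:

 package gpu 1 neigh yes split 1.0 tpa 4                    # 1 GPU/node, GPU neighbor builds, 4 threads/atom
 package intel 1 prec mixed balance -1 tpc 4 tptask 240     # 1 coprocessor/node, dynamic offload balance
 package omp 4 neigh yes                                    # 4 OpenMP threads per MPI task, threaded neighbor builds

The same settings can also be selected at run time via the corresponding command-line switches, e.g. "lmp_machine -sf gpu -pk gpu 1 -in in.script".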
From b57945c40e383a59161bf7660b226b66d30f8503 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 21:15:33 +0000 Subject: [PATCH 11/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12457 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/GPU/fix_gpu.cpp | 14 ++--- src/USER-CUDA/cuda.cpp | 113 +++++++++++++++++------------------------ src/input.cpp | 4 +- 3 files changed, 55 insertions(+), 76 deletions(-) diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 9bbee8ff2b..a064fd301a 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -95,7 +95,6 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : _gpu_mode = GPU_NEIGH; _particle_split = 1.0; - int newtonflag = 0; int nthreads = 1; int threads_per_atom = -1; double binsize = -1; @@ -113,14 +112,9 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : } else if (strcmp(arg[iarg],"split") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); _particle_split = force->numeric(FLERR,arg[iarg+1]); - if (_particle_split <= 0.0 || _particle_split > 1.0) + if (_particle_split == 0.0 || _particle_split > 1.0) error->all(FLERR,"Illegal package GPU command"); iarg += 2; - } else if (strcmp(arg[iarg],"newton") == 0) { - if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); - if (strcmp(arg[iarg]+1,"off") == 0) newtonflag = 0; - else if (strcmp(arg[iarg]+1,"on") == 0) newtonflag = 1; - iarg += 2; } else if (strcmp(arg[iarg],"gpuID") == 0) { if (iarg+3 > narg) error->all(FLERR,"Illegal package gpu command"); first_gpu = force->inumeric(FLERR,arg[iarg+1]); @@ -158,6 +152,12 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : error->all(FLERR,"No OpenMP support compiled in"); #endif + // set newton_pair = 0 since required by all GPU pair styles + + force->newton_pair = 0; + if (force->newton_pair || force->newton_bond) force->newton = 1; + else force->newton = 0; + // pass params to GPU library int gpu_flag = lmp_init_device(universe->uworld, world, first_gpu, last_gpu, diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp index 1d678144a8..8f47c6c181 100644 --- a/src/USER-CUDA/cuda.cpp +++ b/src/USER-CUDA/cuda.cpp @@ -54,8 +54,7 @@ Cuda::Cuda(LAMMPS* lmp) : Pointers(lmp) cuda_exists = true; lmp->cuda = this; - if(universe->me == 0) - printf("# Using LAMMPS_CUDA \n"); + if (universe->me == 0) printf("# Using LAMMPS_CUDA \n"); shared_data.me = universe->me; device_set = false; @@ -153,10 +152,9 @@ Cuda::Cuda(LAMMPS* lmp) : Pointers(lmp) Cuda::~Cuda() { - print_timings(); - if(universe->me == 0) printf("# CUDA: Free memory...\n"); + if (universe->me == 0) printf("# CUDA: Free memory...\n"); delete cu_q; delete cu_x; @@ -203,79 +201,60 @@ Cuda::~Cuda() void Cuda::accelerator(int narg, char** arg) { - if(device_set) return; + if (device_set) return; + if (universe->me == 0) printf("# CUDA: Activate GPU \n"); - if(universe->me == 0) - printf("# CUDA: Activate GPU \n"); + int pppn = force->inumeric(FLERR,arg[0]); + if (pppn <= 0) error->all(FLERR,"Illegal package cuda command"); + + // optional args int* devicelist = NULL; - int pppn = 2; - - for(int i = 0; i < narg; i++) { - if(strcmp(arg[i], "gpu/node") == 0) { - if(++i == narg) - error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a number after 'gpu/node' option."); - - pppn = force->inumeric(FLERR,arg[i]); - } - - if(strcmp(arg[i], "gpu/node/special") == 0) { - if(++i == narg) - error->all(FLERR, "Invalid Options for 'accelerator' command. 
Expecting number of GPUs to be used per node after keyword 'gpu/node/special'."); - - pppn = force->inumeric(FLERR,arg[i]); - - if(pppn < 1) error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'."); - - if(i + pppn == narg) - error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting list of device ids after keyword 'gpu/node special'."); + int iarg = 1; + while (iarg < narg) { + if (strcmp(arg[iarg],"gpuID") == 0) { + if (iarg+pppn+1 > narg) error->all(FLERR,"Illegal package cuda command"); devicelist = new int[pppn]; - - for(int k = 0; k < pppn; k++) { - i++; - devicelist[k] = force->inumeric(FLERR,arg[i]); - } - } - - if(strcmp(arg[i], "pinned") == 0) { - if(++i == narg) - error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a number after 'pinned' option."); - - pinned = force->inumeric(FLERR,arg[i]) == 0 ? false : true; - - if((pinned == false) && (universe->me == 0)) printf(" #CUDA: Pinned memory is not used for communication\n"); - } - - if(strcmp(arg[i], "timing") == 0) { + for (int k = 0; k < pppn; k++) + devicelist[k] = force->inumeric(FLERR,arg[iarg+k+1]); + iarg += pppn + 1; + } else if (strcmp(arg[iarg],"timing") == 0) { dotiming = true; - } - - if(strcmp(arg[i], "suffix") == 0) { - if(++i == narg) - error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option."); - - strcpy(lmp->suffix, arg[i]); - } - - if(strcmp(arg[i], "overlap_comm") == 0) { - shared_data.overlap_comm = 1; - } - - if(strcmp(arg[i], "test") == 0) { - if(++i == narg) - error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a number after 'test' option."); - - testatom = force->numeric(FLERR,arg[i]); + iarg++; + } else if (strcmp(arg[iarg],"test") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command"); + testatom = force->numeric(FLERR,arg[iarg+1]); dotestatom = true; + iarg += 2; + } else if (strcmp(arg[iarg],"thread") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command"); + if (strcmp(arg[iarg+1],"auto") == 0) + shared_data.pair.override_block_per_atom = -1; + else if (strcmp(arg[iarg+1],"tpa") == 0) + shared_data.pair.override_block_per_atom = 0; + else if (strcmp(arg[iarg+1],"bpa") == 0) + shared_data.pair.override_block_per_atom = 1; + else error->all(FLERR,"Illegal package cuda command"); + iarg += 2; } - if(strcmp(arg[i], "override/bpa") == 0) { - if(++i == narg) - error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a number after 'override/bpa' option."); + // undocumented options - shared_data.pair.override_block_per_atom = force->inumeric(FLERR,arg[i]); - } + else if (strcmp(arg[iarg],"suffix") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command"); + strcpy(lmp->suffix,arg[iarg+1]); + iarg += 2; + } else if (strcmp(arg[iarg],"overlap_comm") == 0) { + shared_data.overlap_comm = 1; + iarg++; + } else if (strcmp(arg[iarg],"pinned") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command"); + pinned = force->inumeric(FLERR,arg[iarg+1]) == 0 ? 
false : true; + if ((pinned == false) && (universe->me == 0)) + printf(" #CUDA: Pinned memory is not used for communication\n"); + iarg += 2; + } else error->all(FLERR,"Illegal package cuda command"); } CudaWrapper_Init(0, (char**)0, universe->me, pppn, devicelist); diff --git a/src/input.cpp b/src/input.cpp index 30dd2d3019..a84f5d0e7e 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -1439,14 +1439,14 @@ void Input::package() delete [] fixarg; } else if (strcmp(arg[0],"intel") == 0) { - if (!modify->check_package("Intel")) + if (!modify->check_package("INTEL")) error->all(FLERR, "Package intel command without USER-INTEL package installed"); char **fixarg = new char*[2+narg]; fixarg[0] = (char *) "package_intel"; fixarg[1] = (char *) "all"; - fixarg[2] = (char *) "Intel"; + fixarg[2] = (char *) "INTEL"; for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i]; modify->add_fix(2+narg,fixarg); delete [] fixarg; From 4374200a779e5c8bf7648175ae5a4c54bece2831 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 22:30:16 +0000 Subject: [PATCH 12/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12458 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/USER-CUDA/cuda.cpp | 2 - src/USER-INTEL/fix_intel.cpp | 95 +++++++++++++++++++----------------- src/USER-INTEL/fix_intel.h | 2 +- src/USER-OMP/fix_omp.cpp | 3 +- src/input.cpp | 11 +++-- src/lammps.cpp | 23 ++++++--- 6 files changed, 73 insertions(+), 63 deletions(-) diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp index 8f47c6c181..271c377397 100644 --- a/src/USER-CUDA/cuda.cpp +++ b/src/USER-CUDA/cuda.cpp @@ -47,8 +47,6 @@ using namespace LAMMPS_NS; - - Cuda::Cuda(LAMMPS* lmp) : Pointers(lmp) { cuda_exists = true; diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 8fd3003b49..9c9cd7b0e1 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -49,10 +49,9 @@ enum{NSQ,BIN,MULTI}; FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) { - if (narg < 4) - error->all(FLERR, "Illegal package intel command"); - if (strcmp(arg[1],"all") != 0) - error->all(FLERR, "fix Intel has to operate on group 'all'"); + if (narg < 4) error->all(FLERR,"Illegal package intel command"); + + int ncops = force->inumeric(FLERR,arg[3]); _precision_mode = PREC_MODE_MIXED; _offload_balance = 1.0; @@ -64,6 +63,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _offload_tpc = 4; #ifdef _LMP_INTEL_OFFLOAD + if (ncops < 1) error->all(FLERR,"Illegal package intel command"); _offload_affinity_set = 0; _off_force_array_s = 0; _off_force_array_m = 0; @@ -86,58 +86,61 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _offload_cores = offload_cores; _offload_threads = offload_cores; #endif - int ncops = 1; + + // optional keywords + _allow_separate_buffers = 1; _offload_ghost = -1; int iarg = 4; while (iarg < narg) { - if (strcmp(arg[iarg], "mixed") == 0) - _precision_mode = PREC_MODE_MIXED; - else if (strcmp(arg[iarg], "double") == 0) - _precision_mode = PREC_MODE_DOUBLE; - else if (strcmp(arg[iarg], "single") == 0) - _precision_mode = PREC_MODE_SINGLE; - else if (strcmp(arg[iarg], "offload_affinity_balanced") == 0) + if (strcmp(arg[iarg],"prec") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); + if (strcmp(arg[iarg+1],"single") == 0) + _precision_mode = PREC_MODE_SINGLE; + else if (strcmp(arg[iarg+1],"mixed") == 0) + _precision_mode = PREC_MODE_MIXED; + else if (strcmp(arg[iarg+1],"double") == 0) + 
_precision_mode = PREC_MODE_DOUBLE; + else error->all(FLERR,"Illegal package intel command"); + iarg += 2; + } else if (strcmp(arg[iarg],"balance") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); + _offload_balance = force->numeric(FLERR,arg[iarg+1]); + iarg += 2; + } else if (strcmp(arg[iarg], "ghost") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); + if (strcmp(arg[iarg+1],"yes") == 0) _offload_ghost = 1; + else if (strcmp(arg[iarg+1],"no") == 0) _offload_ghost = 0; + else error->all(FLERR,"Illegal package intel command"); + iarg += 2; + } else if (strcmp(arg[iarg], "tpc") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); + _offload_tpc = atoi(arg[iarg+1]); + iarg += 2; + } else if (strcmp(arg[iarg],"tptask") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); + _offload_threads = atoi(arg[iarg+1]); + iarg += 2; + } + + // undocumented options + + else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) { _offload_affinity_balanced = 1; - else if (strcmp(arg[iarg], "balance") == 0) { - if (iarg == narg - 1) - error->all(FLERR, "Illegal package intel mode requested"); - ++iarg; - _offload_balance = force->numeric(FLERR,arg[iarg]); - } else if (strcmp(arg[iarg], "offload_threads") == 0) { - if (iarg == narg - 1) - error->all(FLERR, "Illegal package intel mode requested"); - ++iarg; - _offload_threads = atoi(arg[iarg]); - } else if (strcmp(arg[iarg], "offload_tpc") == 0) { - if (iarg == narg - 1) - error->all(FLERR, "Illegal package intel mode requested"); - ++iarg; - _offload_tpc = atoi(arg[iarg]); - } else if (strcmp(arg[iarg], "offload_cards") == 0) { - if (iarg == narg - 1) - error->all(FLERR, "Illegal package intel mode requested"); - ++iarg; - ncops = atoi(arg[iarg]); - } else if (strcmp(arg[iarg], "buffers") == 0) { - if (iarg == narg - 1) - error->all(FLERR, "Illegal package intel mode requested"); - ++iarg; - _allow_separate_buffers = atoi(arg[iarg]); - } else if (strcmp(arg[iarg], "offload_ghost") == 0) { - if (iarg == narg - 1) - error->all(FLERR, "Illegal package intel mode requested"); - ++iarg; - _offload_ghost = atoi(arg[iarg]); - } else - error->all(FLERR, "Illegal package intel mode requested"); - ++iarg; + iarg++; + } else if (strcmp(arg[iarg],"buffers") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command"); + _allow_separate_buffers = atoi(arg[iarg+1]); + iarg += 2; + } else error->all(FLERR,"Illegal package intel command"); } + // error check + if (_offload_balance > 1.0 || _offload_threads <= 0 || _offload_tpc <= 0 || _offload_tpc > 4) - error->all(FLERR, "Illegal package intel mode requested"); + error->all(FLERR,"Illegal package intel command"); #ifdef _LMP_INTEL_OFFLOAD _ncops = ncops; diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index 5b7d2b3926..a19512b140 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -13,7 +13,7 @@ #ifdef FIX_CLASS -FixStyle(Intel,FixIntel) +FixStyle(INTEL,FixIntel) #else diff --git a/src/USER-OMP/fix_omp.cpp b/src/USER-OMP/fix_omp.cpp index 28a99bb53a..41616eba69 100644 --- a/src/USER-OMP/fix_omp.cpp +++ b/src/USER-OMP/fix_omp.cpp @@ -70,8 +70,7 @@ FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg) thr(NULL), last_omp_style(NULL), last_pair_hybrid(NULL), _nthr(-1), _neighbor(true), _mixed(false), _reduced(true) { - if ((narg < 4) || (narg > 7)) error->all(FLERR,"Illegal package omp command"); - if (strcmp(arg[1],"all") != 0) error->all(FLERR,"fix 
OMP has to operate on group 'all'"); + if (narg < 4) error->all(FLERR,"Illegal package omp command"); int nthreads = 1; if (narg > 3) { diff --git a/src/input.cpp b/src/input.cpp index a84f5d0e7e..e43932ec2a 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -1401,10 +1401,13 @@ void Input::package() error->all(FLERR,"Package command after simulation box is defined"); if (narg < 1) error->all(FLERR,"Illegal package command"); + // same checks for packages existing as in LAMMPS::post_create() + // since can be invoked here by package command in input script + if (strcmp(arg[0],"cuda") == 0) { - if (!lmp->cuda) + if (lmp->cuda == NULL || lmp->cuda->cuda_exists == 0) error->all(FLERR, - "Package cuda command without USER-CUDA package installed"); + "Package cuda command without USER-CUDA package enabled"); lmp->cuda->accelerator(narg-1,&arg[1]); } else if (strcmp(arg[0],"gpu") == 0) { @@ -1420,9 +1423,9 @@ void Input::package() delete [] fixarg; } else if (strcmp(arg[0],"kokkos") == 0) { - if (!lmp->kokkos) + if (lmp->kokkos == NULL || lmp->kokkos->kokkos_exists == 0) error->all(FLERR, - "Package kokkos command without KOKKOS package installed"); + "Package kokkos command without KOKKOS package enabled"); lmp->kokkos->accelerator(narg-1,&arg[1]); } else if (strcmp(arg[0],"omp") == 0) { diff --git a/src/lammps.cpp b/src/lammps.cpp index 2d2625bc4c..60a01c397a 100644 --- a/src/lammps.cpp +++ b/src/lammps.cpp @@ -595,13 +595,12 @@ void LAMMPS::create() /* ---------------------------------------------------------------------- check suffix consistency with installed packages - do this for GPU, USER-INTEL, USER-OMP - already done in constructor for USER-CUDA, KOKKOS turn off suffix2 = omp if USER-OMP is not installed - invoke package-specific setup commands + invoke package-specific deafult package commands only invoke if suffix is set and enabled also check if suffix2 is set called from LAMMPS constructor and after clear() command + so that package-specific core classes have been instantiated ------------------------------------------------------------------------- */ void LAMMPS::post_create() @@ -609,16 +608,25 @@ void LAMMPS::post_create() if (!suffix_enable) return; // suffix will always be set if suffix_enable = 1 + // USER-CUDA and KOKKOS have package classes instantiated if enabled + // via "-c on" and "-k on" + // GPU, INTEL, USER-OMP provide their own fixes which will have + // been compiled with LAMMPS if those packages were installed + if (strcmp(suffix,"cuda") == 0 && (cuda == NULL || cuda->cuda_exists == 0)) + error->all(FLERR,"Using suffix cuda without USER-CUDA package enabled"); if (strcmp(suffix,"gpu") == 0 && !modify->check_package("GPU")) error->all(FLERR,"Using suffix gpu without GPU package installed"); - if (strcmp(suffix,"intel") == 0 && !modify->check_package("Intel")) + if (strcmp(suffix,"intel") == 0 && !modify->check_package("INTEL")) error->all(FLERR,"Using suffix intel without USER-INTEL package installed"); + if (strcmp(suffix,"kk") == 0 && + (kokkos == NULL || kokkos->kokkos_exists == 0)) + error->all(FLERR,"Using suffix kk without KOKKOS package enabled"); if (strcmp(suffix,"omp") == 0 && !modify->check_package("OMP")) error->all(FLERR,"Using suffix omp without USER-OMP package installed"); // suffix2 only currently set by -sf intel - // need to unset if LAMMPS was not built with USER-OMP package + // unset if LAMMPS was not built with USER-OMP package if (suffix2 && strcmp(suffix2,"omp") == 0 && !modify->check_package("OMP")) { delete [] suffix2; @@ -626,10 +634,9 
@@ void LAMMPS::post_create() } if (suffix) { - if (strcmp(suffix,"gpu") == 0) input->one("package gpu force/neigh 0 0 1"); + if (strcmp(suffix,"gpu") == 0) input->one("package gpu 1"); + if (strcmp(suffix,"intel") == 0) input->one("package intel 1"); if (strcmp(suffix,"omp") == 0) input->one("package omp 0"); - if (strcmp(suffix,"intel") == 0) - input->one("package intel mixed balance -1"); } if (suffix2) { if (strcmp(suffix,"omp") == 0) input->one("package omp 0"); From f9684d5076fcd879f43a6d1573f95bfccdfc2947 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 22:33:06 +0000 Subject: [PATCH 13/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12459 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/USER-OMP/fix_omp.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/USER-OMP/fix_omp.cpp b/src/USER-OMP/fix_omp.cpp index 41616eba69..6717d04179 100644 --- a/src/USER-OMP/fix_omp.cpp +++ b/src/USER-OMP/fix_omp.cpp @@ -105,7 +105,11 @@ FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg) else if (strcmp(arg[iarg]+1,"no") == 0) _neighbor = false; else error->all(FLERR,"Illegal package omp command"); iarg += 2; - } else if (strcmp(arg[iarg],"mixed") == 0) { + } + + // undocumented options + + else if (strcmp(arg[iarg],"mixed") == 0) { _mixed = true; iarg++; } else if (strcmp(arg[iarg],"double") == 0) { From ca8fd22c19e773f7e550a82b1491c7aff72e7d66 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 22:36:53 +0000 Subject: [PATCH 14/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12460 f3b2605a-c512-4ea7-a41b-209d697bcdaa From 4d9d81fe69ab177a380ca9761179496d3e57ec5e Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 22:50:42 +0000 Subject: [PATCH 15/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12461 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_accelerate.html | 17 ++-- doc/Section_accelerate.txt | 17 ++-- doc/package.html | 172 ++++++++++++++++++----------------- doc/package.txt | 173 +++++++++++++++++++----------------- 4 files changed, 205 insertions(+), 174 deletions(-) diff --git a/doc/Section_accelerate.html b/doc/Section_accelerate.html index 041c38ce14..a9165b45c9 100644 --- a/doc/Section_accelerate.html +++ b/doc/Section_accelerate.html @@ -1390,8 +1390,9 @@ steps: coprocessor case can be done using the "-pk omp" and "-sf intel" and "-pk intel" command-line switches respectively. Or the effect of the "-pk" or "-sf" switches can be -duplicated by adding the package intel or suffix -intel commands respectively to your input script. +duplicated by adding the package omp or suffix +intel or package intel commands +respectively to your input script.

    Required hardware/software:

    @@ -1470,9 +1471,10 @@ maximum number of threads is also reduced. which will automatically append "intel" to styles that support it. If a style does not support it, a "omp" suffix is tried next. Use the "-pk omp Nt" command-line switch, to set -Nt = # of OpenMP threads per MPI task to use. Use the "-pk intel Nt -Nphi" command-line switch to set Nphi = # -of Xeon Phi(TM) coprocessors/node. +Nt = # of OpenMP threads per MPI task to use, if LAMMPS was built with +the USER-OMP package. Use the "-pk intel Nt Nphi" command-line +switch to set Nphi = # of Xeon Phi(TM) +coprocessors/node, if LAMMPS was built with coprocessor support.

    CPU-only without USER-OMP (but using Intel vectorization on CPU):
     lmp_machine -sf intel -in in.script                 # 1 MPI task
    @@ -1494,8 +1496,9 @@ mpirun -np 32 -ppn 4 lmp_machine -sf intel -pk intel 4 2 tptask 120 -in in.scrip
     default commands: package omp 0 and package intel
     1 command.  These set the number of OpenMP threads per
     MPI task via the OMP_NUM_THREADS environment variable, and the number
    -of Xeon Phi(TM) coprocessors/node to 1.  The latter is ignored is
    -LAMMPS was not built with coprocessor support.
    +of Xeon Phi(TM) coprocessors/node to 1.  The former is ignored if
    +LAMMPS was not built with the USER-OMP package.  The latter is ignored
    +if LAMMPS was not built with coprocessor support.
     

    Using the "-pk omp" switch explicitly allows for direct setting of the number of OpenMP threads per MPI task, and additional options. Using diff --git a/doc/Section_accelerate.txt b/doc/Section_accelerate.txt index ed2a5b93dd..f7b92369e7 100644 --- a/doc/Section_accelerate.txt +++ b/doc/Section_accelerate.txt @@ -1385,8 +1385,9 @@ The latter two steps in the first case and the last step in the coprocessor case can be done using the "-pk omp" and "-sf intel" and "-pk intel" "command-line switches"_Section_start.html#start_7 respectively. Or the effect of the "-pk" or "-sf" switches can be -duplicated by adding the "package intel"_package.html or "suffix -intel"_suffix.html commands respectively to your input script. +duplicated by adding the "package omp"_package.html or "suffix +intel"_suffix.html or "package intel"_package.html commands +respectively to your input script. [Required hardware/software:] @@ -1465,9 +1466,10 @@ Use the "-sf intel" "command-line switch"_Section_start.html#start_7, which will automatically append "intel" to styles that support it. If a style does not support it, a "omp" suffix is tried next. Use the "-pk omp Nt" "command-line switch"_Section_start.html#start_7, to set -Nt = # of OpenMP threads per MPI task to use. Use the "-pk intel Nt -Nphi" "command-line switch"_Section_start.html#start_7 to set Nphi = # -of Xeon Phi(TM) coprocessors/node. +Nt = # of OpenMP threads per MPI task to use, if LAMMPS was built with +the USER-OMP package. Use the "-pk intel Nt Nphi" "command-line +switch"_Section_start.html#start_7 to set Nphi = # of Xeon Phi(TM) +coprocessors/node, if LAMMPS was built with coprocessor support. CPU-only without USER-OMP (but using Intel vectorization on CPU): lmp_machine -sf intel -in in.script # 1 MPI task @@ -1489,8 +1491,9 @@ Note that if the "-sf intel" switch is used, it also issues two default commands: "package omp 0"_package.html and "package intel 1"_package.html command. These set the number of OpenMP threads per MPI task via the OMP_NUM_THREADS environment variable, and the number -of Xeon Phi(TM) coprocessors/node to 1. The latter is ignored is -LAMMPS was not built with coprocessor support. +of Xeon Phi(TM) coprocessors/node to 1. The former is ignored if +LAMMPS was not built with the USER-OMP package. The latter is ignored +is LAMMPS was not built with coprocessor support. Using the "-pk omp" switch explicitly allows for direct setting of the number of OpenMP threads per MPI task, and additional options. Using diff --git a/doc/package.html b/doc/package.html index 057b5bd344..42fb14396b 100644 --- a/doc/package.html +++ b/doc/package.html @@ -50,20 +50,23 @@ size = bin size for neighbor list construction (distance units) device value = device_type device_type = kepler or fermi or cypress or generic - intel args = Nthreads precision keyword value ... - Nthreads = # of OpenMP threads to associate with each MPI process on host - precision = single or mixed or double - keywords = balance or offload_cards or offload_ghost or offload_tpc or offload_threads + intel args = NPhi keyword value ... 
+ Nphi = # of coprocessors per node + zero or more keyword/value pairs may be appended + keywords = prec or balance or ghost or tpc or tptask + prec value = single or mixed or double + single = perform force calculations in single precision + mixed = perform force calculations in mixed precision + double = perform force calculations in double precision balance value = split split = fraction of work to offload to coprocessor, -1 for dynamic - offload_cards value = ncops - ncops = number of coprocessors to use on each node - offload_ghost value = offload_type - offload_type = 1 to include ghost atoms for offload, 0 for local only - offload_tpc value = tpc - tpc = number of threads to use on each core of coprocessor - offload_threads value = tptask - tptask = max number of threads to use on coprocessor for each MPI task + ghost value = yes or no + yes = include ghost atoms for offload + no = do not include ghost atoms for offload + tpc value = Ntpc + Ntpc = number of threads to use on each physical core of coprocessor + tptask value = Ntptask + Ntptask = max number of threads to use on coprocessor for each MPI task kokkos args = keyword value ... one or more keyword/value pairs may be appended keywords = neigh or comm/exchange or comm/forward @@ -171,8 +174,8 @@ default value, it is usually not necessary to use this keyword.


    -

    The gpu style invokes settings settings associated with the use of -the GPU package. +

    The gpu style invokes settings associated with the use of the GPU +package.

    The Ngpu argument sets the number of GPUs per node. There must be at least as many MPI tasks per node as GPUs, as set by the mpirun or @@ -264,65 +267,64 @@ lib/gpu/Makefile that is used.


    -

    The intel style invokes options associated with the use of the -USER-INTEL package. +

    The intel style invokes settings associated with the use of the +USER-INTEL package. All of its settings, except the prec keyword, +are ignored if LAMMPS was not built with Xeon Phi coprocessor support, +when building with the USER-INTEL package. All of its settings, +including the prec keyword are applicable if LAMMPS was built with +coprocessor support.

    -

    The Nthread argument allows to one explicitly set the number of -OpenMP threads to be allocated for each MPI process, An Nthreads -value of '*' instructs LAMMPS to use whatever is the default for the -given OpenMP environment. This is usually determined via the -OMP_NUM_THREADS environment variable or the compiler runtime. +

    The Nphi argument sets the number of coprocessors per node.

    -

    The precision argument determines the precision mode to use and can -take values of single (intel styles use single precision for all -calculations), mixed (intel styles use double precision for -accumulation and storage of forces, torques, energies, and virial -terms and single precision for everything else), or double (intel -styles use double precision for all calculations). +

    Optional keyword/value pairs can also be specified. Each has a +default value as listed below.

    -

    Additional keyword-value pairs are available that are used to -determine how work is offloaded to an Intel(R) coprocessor. If LAMMPS is -built without offload support, these values are ignored. The -additional settings are as follows: +

    The prec keyword argument determines the precision mode to use for +computing pair style forces, either on the CPU or on the coprocessor, +when using a USER-INTEL supported pair style. It +can take a value of single, mixed which is the default, or +double. Single means single precision is used for the entire +force calculation. Mixed means forces between a pair of atoms are +computed in single precision, but accumulated and stored in double +precision, including storage of forces, torques, energies, and virial +quantities. Double means double precision is used for the entire +force calculation.

    -

    The balance setting is used to set the fraction of work offloaded to -the coprocessor for an intel style (in the inclusive range 0.0 to -1.0). While this fraction of work is running on the coprocessor, other -calculations will run on the host, including neighbor and pair -calculations that are not offloaded, angle, bond, dihedral, kspace, -and some MPI communications. If the balance is set to -1, the fraction -of work is dynamically adjusted automatically throughout the run. This -can typically give performance within 5 to 10 percent of the optimal -fixed fraction. +

    The balance keyword sets the fraction of pair +style work offloaded to the coprocessor style for +split values between 0.0 and 1.0 inclusive. While this fraction of +work is running on the coprocessor, other calculations will run on the +host, including neighbor and pair calculations that are not offloaded, +angle, bond, dihedral, kspace, and some MPI communications. If +split is set to -1, the fraction of work is dynamically adjusted +automatically throughout the run. This typically give performance +within 5 to 10 percent of the optimal fixed fraction.

    -

    The offload_cards setting determines the number of coprocessors to -use on each node. -

    -

    Additional options for fine tuning performance with offload are as -follows: -

    -

    The offload_ghost setting determines whether or not ghost atoms, -atoms at the borders between MPI tasks, are offloaded for neighbor and -force calculations. When set to "0", ghost atoms are not offloaded. -This option can reduce the amount of data transfer with the -coprocessor and also can overlap MPI communication of forces with +

    The ghost keyword determines whether or not ghost atoms, i.e. atoms +at the boundaries of processor sub-domains, are offloaded for neighbor +and force calculations. When the value = "no", ghost atoms are not +offloaded. This option can reduce the amount of data transfer with +the coprocessor and can also overlap MPI communication of forces with computation on the coprocessor when the newton pair -setting is "on". When set to "1", ghost atoms are offloaded. In some -cases this can provide better performance, especially if the offload -fraction is high. +setting is "on". When the value = "yes", ghost atoms are offloaded. +In some cases this can provide better performance, especially if the +balance fraction is high.

    -

    The offload_tpc option sets the maximum number of threads that will -run on each core of the coprocessor. +

    The tpc keyword sets the maximum # of threads Ntpc that will +run on each physical core of the coprocessor. The default value is +set to 4, which is the number of hardware threads per core supported +by the current generation Xeon Phi chips.

    -

    The offload_threads option sets the maximum number of threads that -will be used on the coprocessor for each MPI task. This, along with -the offload_tpc setting, are the only methods for changing the -number of threads on the coprocessor. The OMP_NUM_THREADS keyword and -Nthreads options are only used for threads on the host. +

    The tptask keyword sets the maximum # of threads Ntptask that will +be used on the coprocessor for each MPI task. This, along with the +tpc keyword setting, are the only methods for changing the number of +threads used on the coprocessor. The default value is set to 240 = +60*4, which is the maximum # of threads supported by an entire current +generation Xeon Phi chip.


    -

    The kokkos style invokes options associated with the use of the +

    The kokkos style invokes settings associated with the use of the KOKKOS package.

    The neigh keyword determines what kinds of neighbor lists are built. @@ -466,35 +468,45 @@ setting

    Default:

    -

    To use the USER-CUDA package, the package command must be invoked -explicitly, either via the "-pk cuda" command-line -switch or by invoking the package cuda -command in your input script. This will set the # of GPUs/node. The -options defaults are gpuID = 0 to Ngpu-1, timing not enabled, test not -enabled, and thread = auto. +

    To use the USER-CUDA package, the package cuda command must be invoked +explicitly in your input script or via the "-pk cuda" command-line +switch. This will set the # of GPUs/node. +The options defaults are gpuID = 0 to Ngpu-1, timing = not enabled, +test = not enabled, and thread = auto.

    For the GPU package, the default is Ngpu = 1 and the option defaults are neigh = yes, split = 1.0, gpuID = 0 to Ngpu-1, tpa = 1, binsize = pair cutoff + neighbor skin, device = not used. These settings are -made if the "-sf gpu" command-line switch -is used. If it is not used, you must invoke the package gpu command -in your input script. +made automatically if the "-sf gpu" command-line +switch is used. If it is not used, you +must invoke the package gpu command in your input script or via the +"-pk gpu" command-line switch.

    -

    The default settings for the USER-INTEL package are "package intel * -mixed balance -1 offload_cards 1 offload_tpc 4 offload_threads 240". -The offload_ghost default setting is determined by the intel style -being used. The value used is output to the screen in the offload -report at the end of each run. +

    For the USER-INTEL package, the default is Nphi = 1 and the option +defaults are prec = mixed, balance = -1, tpc = 4, tptask = 240. The +default ghost option is determined by the pair style being used. The +value used is output to the screen in the offload report at the end of +each run. These settings are made automatically if the "-sf intel" +command-line switch is used. If it is +not used, you must invoke the package intel command in your input +script or via the "-pk intel" command-line +switch.

    The default settings for the KOKKOS package are "package kokkos neigh full comm/exchange host comm/forward host". This is the case whether the "-sf kk" command-line switch is used or not. +To use the KOKKOS package, the package kokkos command must be invoked +explicitly in your input script or via the "-pk kokkos" command-line +switch. This will set the # of GPUs/node. +The options defaults are gpuID = 0 to Ngpu-1, timing = not enabled, +test = not enabled, and thread = auto.

    For the OMP package, the default is Nthreads = 0 and the option -defaults are neigh = yes. These settings are made if the "-sf omp" -command-line switch is used. If it is -not used, you must invoke the package omp command in your input -script. +defaults are neigh = yes. These settings are made automatically if +the "-sf omp" command-line switch is +used. If it is not used, you must invoke the package omp command in +your input script or via the "-pk omp" command-line +switch.

    diff --git a/doc/package.txt b/doc/package.txt index da6066c740..b6d2701879 100644 --- a/doc/package.txt +++ b/doc/package.txt @@ -45,20 +45,23 @@ args = arguments specific to the style :l size = bin size for neighbor list construction (distance units) {device} value = device_type device_type = {kepler} or {fermi} or {cypress} or {generic} - {intel} args = Nthreads precision keyword value ... - Nthreads = # of OpenMP threads to associate with each MPI process on host - precision = {single} or {mixed} or {double} - keywords = {balance} or {offload_cards} or {offload_ghost} or {offload_tpc} or {offload_threads} + {intel} args = NPhi keyword value ... + Nphi = # of coprocessors per node + zero or more keyword/value pairs may be appended + keywords = {prec} or {balance} or {ghost} or {tpc} or {tptask} + {prec} value = {single} or {mixed} or {double} + single = perform force calculations in single precision + mixed = perform force calculations in mixed precision + double = perform force calculations in double precision {balance} value = split split = fraction of work to offload to coprocessor, -1 for dynamic - {offload_cards} value = ncops - ncops = number of coprocessors to use on each node - {offload_ghost} value = offload_type - offload_type = 1 to include ghost atoms for offload, 0 for local only - {offload_tpc} value = tpc - tpc = number of threads to use on each core of coprocessor - {offload_threads} value = tptask - tptask = max number of threads to use on coprocessor for each MPI task + {ghost} value = {yes} or {no} + yes = include ghost atoms for offload + no = do not include ghost atoms for offload + {tpc} value = Ntpc + Ntpc = number of threads to use on each physical core of coprocessor + {tptask} value = Ntptask + Ntptask = max number of threads to use on coprocessor for each MPI task {kokkos} args = keyword value ... one or more keyword/value pairs may be appended keywords = {neigh} or {comm/exchange} or {comm/forward} @@ -165,8 +168,8 @@ default value, it is usually not necessary to use this keyword. :line -The {gpu} style invokes settings settings associated with the use of -the GPU package. +The {gpu} style invokes settings associated with the use of the GPU +package. The {Ngpu} argument sets the number of GPUs per node. There must be at least as many MPI tasks per node as GPUs, as set by the mpirun or @@ -258,65 +261,64 @@ lib/gpu/Makefile that is used. :line -The {intel} style invokes options associated with the use of the -USER-INTEL package. +The {intel} style invokes settings associated with the use of the +USER-INTEL package. All of its settings, except the {prec} keyword, +are ignored if LAMMPS was not built with Xeon Phi coprocessor support, +when building with the USER-INTEL package. All of its settings, +including the {prec} keyword are applicable if LAMMPS was built with +coprocessor support. -The {Nthread} argument allows to one explicitly set the number of -OpenMP threads to be allocated for each MPI process, An {Nthreads} -value of '*' instructs LAMMPS to use whatever is the default for the -given OpenMP environment. This is usually determined via the -OMP_NUM_THREADS environment variable or the compiler runtime. +The {Nphi} argument sets the number of coprocessors per node. 
-The {precision} argument determines the precision mode to use and can -take values of {single} (intel styles use single precision for all -calculations), {mixed} (intel styles use double precision for -accumulation and storage of forces, torques, energies, and virial -terms and single precision for everything else), or {double} (intel -styles use double precision for all calculations). +Optional keyword/value pairs can also be specified. Each has a +default value as listed below. -Additional keyword-value pairs are available that are used to -determine how work is offloaded to an Intel(R) coprocessor. If LAMMPS is -built without offload support, these values are ignored. The -additional settings are as follows: +The {prec} keyword argument determines the precision mode to use for +computing pair style forces, either on the CPU or on the coprocessor, +when using a USER-INTEL supported "pair style"_pair_style.html. It +can take a value of {single}, {mixed} which is the default, or +{double}. {Single} means single precision is used for the entire +force calculation. {Mixed} means forces between a pair of atoms are +computed in single precision, but accumulated and stored in double +precision, including storage of forces, torques, energies, and virial +quantities. {Double} means double precision is used for the entire +force calculation. -The {balance} setting is used to set the fraction of work offloaded to -the coprocessor for an intel style (in the inclusive range 0.0 to -1.0). While this fraction of work is running on the coprocessor, other -calculations will run on the host, including neighbor and pair -calculations that are not offloaded, angle, bond, dihedral, kspace, -and some MPI communications. If the balance is set to -1, the fraction -of work is dynamically adjusted automatically throughout the run. This -can typically give performance within 5 to 10 percent of the optimal -fixed fraction. +The {balance} keyword sets the fraction of "pair +style"_pair_style.html work offloaded to the coprocessor style for +split values between 0.0 and 1.0 inclusive. While this fraction of +work is running on the coprocessor, other calculations will run on the +host, including neighbor and pair calculations that are not offloaded, +angle, bond, dihedral, kspace, and some MPI communications. If +{split} is set to -1, the fraction of work is dynamically adjusted +automatically throughout the run. This typically give performance +within 5 to 10 percent of the optimal fixed fraction. -The {offload_cards} setting determines the number of coprocessors to -use on each node. - -Additional options for fine tuning performance with offload are as -follows: - -The {offload_ghost} setting determines whether or not ghost atoms, -atoms at the borders between MPI tasks, are offloaded for neighbor and -force calculations. When set to "0", ghost atoms are not offloaded. -This option can reduce the amount of data transfer with the -coprocessor and also can overlap MPI communication of forces with +The {ghost} keyword determines whether or not ghost atoms, i.e. atoms +at the boundaries of proessor sub-domains, are offloaded for neighbor +and force calculations. When the value = "no", ghost atoms are not +offloaded. This option can reduce the amount of data transfer with +the coprocessor and can also overlap MPI communication of forces with computation on the coprocessor when the "newton pair"_newton.html -setting is "on". When set to "1", ghost atoms are offloaded. 
In some -cases this can provide better performance, especially if the offload -fraction is high. +setting is "on". When the value = "ues", ghost atoms are offloaded. +In some cases this can provide better performance, especially if the +{balance} fraction is high. -The {offload_tpc} option sets the maximum number of threads that will -run on each core of the coprocessor. +The {tpc} keyword sets the maximum # of threads {Ntpc} that will +run on each physical core of the coprocessor. The default value is +set to 4, which is the number of hardware threads per core supported +by the current generation Xeon Phi chips. -The {offload_threads} option sets the maximum number of threads that -will be used on the coprocessor for each MPI task. This, along with -the {offload_tpc} setting, are the only methods for changing the -number of threads on the coprocessor. The OMP_NUM_THREADS keyword and -{Nthreads} options are only used for threads on the host. +The {tptask} keyword sets the maximum # of threads (Ntptask} that will +be used on the coprocessor for each MPI task. This, along with the +{tpc} keyword setting, are the only methods for changing the number of +threads used on the coprocessor. The default value is set to 240 = +60*4, which is the maximum # of threads supported by an entire current +generation Xeon Phi chip. :line -The {kokkos} style invokes options associated with the use of the +The {kokkos} style invokes settings associated with the use of the KOKKOS package. The {neigh} keyword determines what kinds of neighbor lists are built. @@ -460,33 +462,44 @@ setting"_Section_start.html#start_7 [Default:] -To use the USER-CUDA package, the package command must be invoked -explicitly, either via the "-pk cuda" "command-line -switch"_Section_start.html#start_7 or by invoking the package cuda -command in your input script. This will set the # of GPUs/node. The -options defaults are gpuID = 0 to Ngpu-1, timing not enabled, test not -enabled, and thread = auto. +To use the USER-CUDA package, the package cuda command must be invoked +explicitly in your input script or via the "-pk cuda" "command-line +switch"_Section_start.html#start_7. This will set the # of GPUs/node. +The options defaults are gpuID = 0 to Ngpu-1, timing = not enabled, +test = not enabled, and thread = auto. For the GPU package, the default is Ngpu = 1 and the option defaults are neigh = yes, split = 1.0, gpuID = 0 to Ngpu-1, tpa = 1, binsize = pair cutoff + neighbor skin, device = not used. These settings are -made if the "-sf gpu" "command-line switch"_Section_start.html#start_7 -is used. If it is not used, you must invoke the package gpu command -in your input script. +made automatically if the "-sf gpu" "command-line +switch"_Section_start.html#start_7 is used. If it is not used, you +must invoke the package gpu command in your input script or via the +"-pk gpu" "command-line switch"_Section_start.html#start_7. -The default settings for the USER-INTEL package are "package intel * -mixed balance -1 offload_cards 1 offload_tpc 4 offload_threads 240". -The {offload_ghost} default setting is determined by the intel style -being used. The value used is output to the screen in the offload -report at the end of each run. +For the USER-INTEL package, the default is Nphi = 1 and the option +defaults are prec = mixed, balance = -1, tpc = 4, tptask = 240. The +default ghost option is determined by the pair style being used. This +value used is output to the screen in the offload report at the end of +each run. 
These settings are made automatically if the "-sf intel" +"command-line switch"_Section_start.html#start_7 is used. If it is +not used, you must invoke the package intel command in your input +script or or via the "-pk intel" "command-line +switch"_Section_start.html#start_7. The default settings for the KOKKOS package are "package kokkos neigh full comm/exchange host comm/forward host". This is the case whether the "-sf kk" "command-line switch"_Section_start.html#start_7 is used or not. +To use the KOKKOS package, the package kokkos command must be invoked +explicitly in your input script or via the "-pk kokkos" "command-line +switch"_Section_start.html#start_7. This will set the # of GPUs/node. +The options defaults are gpuID = 0 to Ngpu-1, timing = not enabled, +test = not enabled, and thread = auto. For the OMP package, the default is Nthreads = 0 and the option -defaults are neigh = yes. These settings are made if the "-sf omp" -"command-line switch"_Section_start.html#start_7 is used. If it is -not used, you must invoke the package omp command in your input -script. +defaults are neigh = yes. These settings are made automatically if +the "-sf omp" "command-line switch"_Section_start.html#start_7 is +used. If it is not used, you must invoke the package omp command in +your input script or via the "-pk omp" "command-line +switch"_Section_start.html#start_7. + From 3aaa82088bcd7448700d5e36092176b7225b11f5 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 9 Sep 2014 22:51:38 +0000 Subject: [PATCH 16/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12462 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_accelerate.html | 7 ++++--- doc/Section_accelerate.txt | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/Section_accelerate.html b/doc/Section_accelerate.html index a9165b45c9..27b80f3d63 100644 --- a/doc/Section_accelerate.html +++ b/doc/Section_accelerate.html @@ -1456,7 +1456,7 @@ performance will suffer.

    If LAMMPS was built with coprocessor support for the USER-INTEL package, you need to specify the number of coprocessor/node and the -number of threads to use on the coproessor per MPI task. Note that +number of threads to use on the coprocessor per MPI task. Note that coprocessor threads (which run on the coprocessor) are totally independent from OpenMP threads (which run on the CPU). The product of MPI tasks * coprocessor threads/task should not exceed the maximum @@ -1472,7 +1472,7 @@ which will automatically append "intel" to styles that support it. If a style does not support it, a "omp" suffix is tried next. Use the "-pk omp Nt" command-line switch, to set Nt = # of OpenMP threads per MPI task to use, if LAMMPS was built with -the USER-OMP package. Use the "-pk intel Nt Nphi" command-line +the USER-OMP package. Use the "-pk intel Nphi" command-line switch to set Nphi = # of Xeon Phi(TM) coprocessors/node, if LAMMPS was built with coprocessor support.

    @@ -1498,7 +1498,8 @@ default commands: package omp 0 and Date: Tue, 9 Sep 2014 22:52:01 +0000 Subject: [PATCH 17/17] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12463 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/GPU/fix_gpu.cpp | 2 -- src/modify.cpp | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index a064fd301a..c10af08d32 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -83,8 +83,6 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : error->all(FLERR,"Cannot use GPU package with USER-CUDA package enabled"); if (narg < 4) error->all(FLERR,"Illegal package gpu command"); - if (strcmp(arg[1],"all") != 0) - error->all(FLERR,"Illegal package gpu command"); int ngpu = atoi(arg[3]); if (ngpu <= 0) error->all(FLERR,"Illegal package gpu command"); diff --git a/src/modify.cpp b/src/modify.cpp index 637cfc0863..47bf41cd40 100644 --- a/src/modify.cpp +++ b/src/modify.cpp @@ -861,10 +861,10 @@ int Modify::find_fix(const char *id) } /* ---------------------------------------------------------------------- - check for fix associated with package name + check for fix associated with package name in compiled list return 1 if found else 0 used to determine whether LAMMPS was built with - GPU, USER-INTEL, USER-OMP packages + GPU, USER-INTEL, USER-OMP packages, which have their own fixes ------------------------------------------------------------------------- */ int Modify::check_package(const char *package_fix_name)
  • USER-CUDA for NVIDIA GPUs