diff --git a/doc/dump.txt b/doc/dump.txt
index 3ce723ad4c..324e09f714 100644
--- a/doc/dump.txt
+++ b/doc/dump.txt
@@ -17,15 +17,17 @@ dump ID group-ID style N file args :pre
 ID = user-assigned name for the dump :ulb,l
 group-ID = ID of the group of atoms to be dumped :l
-style = {atom} or {cfg} or {dcd} or {xtc} or {xyz} or {image} or {molfile} or {local} or {custom} :l
+style = {atom} or {atom/mpiio} or {cfg} or {dcd} or {xtc} or {xyz} or {xyz/mpiio} or {image} or {molfile} or {local} or {custom} or {custom/mpiio} :l
 N = dump every this many timesteps :l
 file = name of file to write dump info to :l
 args = list of arguments for a particular style :l
   {atom} args = none
+  {atom/mpiio} args = none
  {cfg} args = same as {custom} args, see below
  {dcd} args = none
  {xtc} args = none
- {xyz} args = none :pre
+ {xyz} args = none
+ {xyz/mpiio} args = none :pre
  {image} args = discussed on "dump image"_dump_image.html doc page :pre
@@ -40,6 +42,7 @@ args = list of arguments for a particular style :l
       f_ID\[N\] = Nth column of local array calculated by a fix with ID :pre

  {custom} args = list of atom attributes
+ {custom/mpiio} args = list of atom attributes
    possible attributes = id, mol, type, element, mass,
                          x, y, z, xs, ys, zs, xu, yu, zu,
                          xsu, ysu, zsu, ix, iy, iz,
@@ -83,7 +86,9 @@ args = list of arguments for a particular style :l
 [Examples:]

 dump myDump all atom 100 dump.atom
+dump myDump all atom/mpiio 100 dump.atom.mpiio
 dump 2 subgroup atom 50 dump.run.bin
+dump 2 subgroup atom 50 dump.run.mpiio.bin
 dump 4a all custom 100 dump.myforce.* id type x y vx fx
 dump 4b flow custom 100 dump.%.myforce id type c_myF\[3\] v_ke
 dump 2 inner cfg 10 dump.snap.*.cfg mass type xs ys zs vx vy vz
@@ -130,6 +135,15 @@
 default.  For the {dcd}, {xtc}, {xyz}, and {molfile} styles, sorting
 by atom ID is on by default.  See the "dump_modify"_dump_modify.html
 doc page for details.

+As explained below, the {atom/mpiio}, {custom/mpiio}, and {xyz/mpiio}
+styles are identical in command syntax to the corresponding styles
+without "mpiio", and the dump files they create have the same format,
+except that the single dump file they produce is written in parallel
+via the MPI-IO library.  For the remainder of this doc page, you
+should thus consider the {atom} and {atom/mpiio} styles (etc) to be
+interchangeable.  The one exception is how the filename is specified
+for the MPI-IO styles, as explained below.
+
 :line

 The {style} keyword determines what atom quantities are written to the
@@ -339,6 +353,31 @@
 when running on large numbers of processors.

 Note that using the "*" and "%" characters together can produce a
 large number of small dump files!

+For the {atom/mpiio}, {custom/mpiio}, and {xyz/mpiio} styles, a single
+dump file is written in parallel via the MPI-IO library, which is part
+of the MPI standard for versions 2.0 and above.  Using MPI-IO requires
+two steps.  First, build LAMMPS with its MPIIO package installed, e.g.
+
+make yes-mpiio   # installs the MPIIO package
+make g++         # build LAMMPS for your platform :pre
+
+Second, use a dump filename which contains ".mpiio".  Note that it
+does not have to end in ".mpiio", just contain those characters.
+Unlike MPI-IO restart files, which must be both written and read using
+MPI-IO, the dump files produced by these MPI-IO styles are identical
+in format to the files produced by their non-MPI-IO style
+counterparts.  This means you can write a dump file using MPI-IO and
+use the "read_dump"_read_dump.html command or perform other
+post-processing, just as if the dump file had not been written using
+MPI-IO.
+
+Note that an MPI-IO dump file is one large file which all processors
+write to.  You thus cannot use the "%" wildcard character described
+above in the filename, since that specifies the generation of multiple
+files.  You can use the ".bin" suffix described below with an MPI-IO
+dump file; again this file will be written in parallel and will have
+the same binary format as if it were written without MPI-IO.

 If the filename ends with ".bin", the dump file (or files, if "*" or
 "%" is also used) is written in binary format.  A binary dump file
 will be about the same size as a text version, but will typically
diff --git a/doc/dump_modify.txt b/doc/dump_modify.txt
index 10ee8bc907..84018f00ca 100644
--- a/doc/dump_modify.txt
+++ b/doc/dump_modify.txt
@@ -110,6 +110,15 @@ dump_modify 1 amap min max cf 0.0 3 min green 0.5 yellow max blue boxcolor red :
 Modify the parameters of a previously defined dump command.  Not all
 parameters are relevant to all dump styles.

+As explained on the "dump"_dump.html doc page, the {atom/mpiio},
+{custom/mpiio}, and {xyz/mpiio} dump styles are identical in command
+syntax to the corresponding styles without "mpiio", and the dump
+files they create have the same format, except that the single dump
+file they produce is written in parallel via the MPI-IO library.
+Thus if a dump_modify option below is valid for the {atom} style, it
+is also valid for the {atom/mpiio} style, and similarly for the other
+styles which allow for use of MPI-IO.
+
 :line
 :line
diff --git a/doc/read_restart.html b/doc/read_restart.html
index 6cc58583b7..db0793a4f1 100644
--- a/doc/read_restart.html
+++ b/doc/read_restart.html
@@ -21,6 +21,7 @@

read_restart save.10000
 read_restart restart.*
+read_restart restart.*.mpiio
 read_restart poly.*.% 
 
@@ -98,6 +99,20 @@ different the number of processors in the current LAMMPS simulation.
 This can be a fast mode of input on parallel machines that support
 parallel I/O.
 

+A restart file can also be read in parallel as one large binary file
+via the MPI-IO library, assuming it was also written with MPI-IO.
+MPI-IO is part of the MPI standard for versions 2.0 and above.  Using
+MPI-IO requires two steps.  First, build LAMMPS with its MPIIO package
+installed, e.g.
+
+make yes-mpiio    # installs the MPIIO package
+make g++          # build LAMMPS for your platform
+
+Second, use a restart filename which contains ".mpiio".  Note that it
+does not have to end in ".mpiio", just contain those characters.
+Unlike MPI-IO dump files, a particular restart file must be both
+written and read using MPI-IO.


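For reference, the collective read pattern that underlies this feature
can be shown in a few MPI calls.  This is a minimal standalone sketch,
not part of the patch; the filename, chunk size, and offset values are
placeholder assumptions (LAMMPS computes the real values from the
restart file's header):

#include <mpi.h>
#include <vector>

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);

  // all ranks open the same file; read-only mode, as in openForRead()
  MPI_File fh;
  MPI_File_open(MPI_COMM_WORLD,"restart.mpiio",MPI_MODE_RDONLY,
                MPI_INFO_NULL,&fh);

  const MPI_Offset headerOffset = 0;   // placeholder: end of header
  const int chunkSize = 1024;          // placeholder: doubles per rank
  std::vector<double> buf(chunkSize);

  // collective read: every rank participates, each at its own offset
  MPI_Offset myOffset = headerOffset +
    (MPI_Offset) rank*chunkSize*sizeof(double);
  MPI_File_read_at_all(fh,myOffset,buf.data(),chunkSize,
                       MPI_DOUBLE,MPI_STATUS_IGNORE);

  MPI_File_close(&fh);
  MPI_Finalize();
  return 0;
}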
A restart file stores the following information about a simulation: @@ -163,7 +178,10 @@ output, geometric regions, etc.


-Restrictions: none
+Restrictions:
+
+To write and read restart files in parallel with MPI-IO, the MPIIO
+package must be installed.

Related commands:

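Both the reader and the writer in this patch share one small mechanism
for locating the parallel data: rank 0, which handles the native header
with stdio, records where the header ends and broadcasts that offset so
every rank can position its MPI-IO access.  A sketch of the idea, with
assumed names:

#include <mpi.h>
#include <cstdio>

// rank 0 has just finished writing (or reading) the header via 'fp';
// all ranks need the byte offset where the per-rank data begins
long broadcast_header_offset(FILE *fp, int me, MPI_Comm world)
{
  long headerOffset = 0;
  if (me == 0) headerOffset = ftell(fp);   // end of header on rank 0
  MPI_Bcast(&headerOffset,1,MPI_LONG,0,world);
  return headerOffset;
}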
diff --git a/doc/read_restart.txt b/doc/read_restart.txt index 09fd395ea1..936cf81e14 100644 --- a/doc/read_restart.txt +++ b/doc/read_restart.txt @@ -18,6 +18,7 @@ file = name of binary restart file to read in :ul read_restart save.10000 read_restart restart.* +read_restart restart.*.mpiio read_restart poly.*.% :pre :pre @@ -95,6 +96,20 @@ different the number of processors in the current LAMMPS simulation. This can be a fast mode of input on parallel machines that support parallel I/O. +A restart file can also be read in parallel as one large binary file +via the MPI-IO library, assuming it was also written with MPI-IO. +MPI-IO is part of the MPI standard for versions 2.0 and above. Using +MPI-IO requires two steps. First, build LAMMPS with its MPIIO package +installed, e.g. + +make yes-mpiio # installs the MPIIO package +make g++ # build LAMMPS for your platform :pre + +Second, use a restart filename which contains ".mpiio". Note that it +does not have to end in ".mpiio", just contain those characters. +Unlike MPI-IO dump files, a particular restart file must be both +written and read using MPI-IO. + :line A restart file stores the following information about a simulation: @@ -160,7 +175,10 @@ output, "geometric regions"_region.html, etc. :line -[Restrictions:] none +[Restrictions:] + +To write and read restart files in parallel with MPI-IO, the MPIIO +package must be installed. [Related commands:] diff --git a/doc/restart.html b/doc/restart.html index 8ea6d074d7..c56afdcb33 100644 --- a/doc/restart.html +++ b/doc/restart.html @@ -40,6 +40,7 @@ restart N file1 file2 keyword value ...

restart 0
 restart 1000 poly.restart
+restart 1000 poly.restart.mpiio
 restart 1000 restart.*.equil
 restart 10000 poly.%.1 poly.%.2 nfile 10
 restart v_mystep poly.restart 
@@ -84,6 +85,19 @@ of output and subsequent input on parallel machines that support
 parallel I/O.  The optional fileper and nfile keywords discussed
 below can alter the number of files written.
 

+The restart file can also be written in parallel as one large binary
+file via the MPI-IO library, which is part of the MPI standard for
+versions 2.0 and above.  Using MPI-IO requires two steps.  First,
+build LAMMPS with its MPIIO package installed, e.g.
+
+make yes-mpiio    # installs the MPIIO package
+make g++          # build LAMMPS for your platform
+
+Second, use a restart filename which contains ".mpiio".  Note that it
+does not have to end in ".mpiio", just contain those characters.
+Unlike MPI-IO dump files, a particular restart file must be both
+written and read using MPI-IO.

Restart files are written on timesteps that are a multiple of N but not on the first timestep of a run or minimization. You can use the write_restart command to write a restart file @@ -145,7 +159,10 @@ next 3 processors and write it to a restart file.


-Restrictions: none
+Restrictions:
+
+To write and read restart files in parallel with MPI-IO, the MPIIO
+package must be installed.

Related commands:

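The write side described above computes each rank's file offset with an
inclusive MPI_Scan of the per-rank send sizes: subtracting a rank's own
size from its inclusive prefix gives its exclusive starting offset, and
the last rank's inclusive value, once broadcast, is the total size used
to pre-size the file.  A minimal sketch under assumed names, not the
LAMMPS implementation:

#include <mpi.h>

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);

  int rank,nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);

  long sendSize = 100 + rank;   // placeholder: doubles this rank sends
  long incPrefix = 0;           // inclusive prefix sum of sizes
  MPI_Scan(&sendSize,&incPrefix,1,MPI_LONG,MPI_SUM,MPI_COMM_WORLD);

  // exclusive prefix = where this rank's doubles start, in items
  long myStart = incPrefix - sendSize;

  // the last rank holds the grand total; share it with everyone,
  // e.g. so MPI_File_set_size can be called with the final size
  long total = incPrefix;
  MPI_Bcast(&total,1,MPI_LONG,nprocs-1,MPI_COMM_WORLD);

  // this rank would write at headerOffset + myStart*sizeof(double);
  // the file size is headerOffset + total*sizeof(double)

  MPI_Finalize();
  return 0;
}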
diff --git a/doc/restart.txt b/doc/restart.txt index 7588f6cf6a..834e7d0d71 100644 --- a/doc/restart.txt +++ b/doc/restart.txt @@ -30,6 +30,7 @@ keyword = {fileper} or {nfile} :l restart 0 restart 1000 poly.restart +restart 1000 poly.restart.mpiio restart 1000 restart.*.equil restart 10000 poly.%.1 poly.%.2 nfile 10 restart v_mystep poly.restart :pre @@ -74,6 +75,19 @@ of output and subsequent input on parallel machines that support parallel I/O. The optional {fileper} and {nfile} keywords discussed below can alter the number of files written. +The restart file can also be written in parallel as one large binary +file via the MPI-IO library, which is part of the MPI standard for +versions 2.0 and above. Using MPI-IO requires two steps. First, +build LAMMPS with its MPIIO package installed, e.g. + +make yes-mpiio # installs the MPIIO package +make g++ # build LAMMPS for your platform :pre + +Second, use a restart filename which contains ".mpiio". Note that it +does not have to end in ".mpiio", just contain those characters. +Unlike MPI-IO dump files, a particular restart file must be both +written and read using MPI-IO. + Restart files are written on timesteps that are a multiple of N but not on the first timestep of a run or minimization. You can use the "write_restart"_write_restart.html command to write a restart file @@ -135,7 +149,10 @@ next 3 processors and write it to a restart file. :line -[Restrictions:] none +[Restrictions:] + +To write and read restart files in parallel with MPI-IO, the MPIIO +package must be installed. [Related commands:] diff --git a/doc/write_restart.html b/doc/write_restart.html index 16bd95cdf0..a59e07082b 100644 --- a/doc/write_restart.html +++ b/doc/write_restart.html @@ -31,6 +31,7 @@

Examples:

write_restart restart.equil
+write_restart restart.equil.mpiio
 write_restart poly.%.* nfile 10 
 

Description: @@ -55,6 +56,19 @@ output and subsequent input on parallel machines that support parallel I/O. The optional fileper and nfile keywords discussed below can alter the number of files written.

+The restart file can also be written in parallel as one large binary
+file via the MPI-IO library, which is part of the MPI standard for
+versions 2.0 and above.  Using MPI-IO requires two steps.  First,
+build LAMMPS with its MPIIO package installed, e.g.
+
+make yes-mpiio    # installs the MPIIO package
+make g++          # build LAMMPS for your platform
+
+Second, use a restart filename which contains ".mpiio".  Note that it
+does not have to end in ".mpiio", just contain those characters.
+Unlike MPI-IO dump files, a particular restart file must be both
+written and read using MPI-IO.

Restart files can be read by a read_restart command to restart a simulation from a particular state. Because the file is binary (to enable exact restarts), it may not be readable on @@ -102,6 +116,9 @@ before the restart file is written. This means that your system must be ready to perform a simulation before using this command (force fields setup, atom masses initialized, etc).

+To write and read restart files in parallel with MPI-IO, the MPIIO
+package must be installed.

Related commands:

restart, read_restart, diff --git a/doc/write_restart.txt b/doc/write_restart.txt index 8e3378681f..d8041ec32d 100644 --- a/doc/write_restart.txt +++ b/doc/write_restart.txt @@ -24,6 +24,7 @@ keyword = {fileper} or {nfile} :l [Examples:] write_restart restart.equil +write_restart restart.equil.mpiio write_restart poly.%.* nfile 10 :pre [Description:] @@ -48,6 +49,19 @@ output and subsequent input on parallel machines that support parallel I/O. The optional {fileper} and {nfile} keywords discussed below can alter the number of files written. +The restart file can also be written in parallel as one large binary +file via the MPI-IO library, which is part of the MPI standard for +versions 2.0 and above. Using MPI-IO requires two steps. First, +build LAMMPS with its MPIIO package installed, e.g. + +make yes-mpiio # installs the MPIIO package +make g++ # build LAMMPS for your platform :pre + +Second, use a restart filename which contains ".mpiio". Note that it +does not have to end in ".mpiio", just contain those characters. +Unlike MPI-IO dump files, a particular restart file must be both +written and read using MPI-IO. + Restart files can be read by a "read_restart"_read_restart.html command to restart a simulation from a particular state. Because the file is binary (to enable exact restarts), it may not be readable on @@ -95,6 +109,9 @@ before the restart file is written. This means that your system must be ready to perform a simulation before using this command (force fields setup, atom masses initialized, etc). +To write and read restart files in parallel with MPI-IO, the MPIIO +package must be installed. + [Related commands:] "restart"_restart.html, "read_restart"_read_restart.html, diff --git a/src/MPIIO/Install.sh b/src/MPIIO/Install.sh new file mode 100644 index 0000000000..e40f5acf28 --- /dev/null +++ b/src/MPIIO/Install.sh @@ -0,0 +1,51 @@ +# Install/unInstall package files in LAMMPS +# mode = 0/1/2 for uninstall/install/update + +mode=$1 + +# arg1 = file, arg2 = file it depends on + +action () { + if (test $mode = 0) then + rm -f ../$1 + elif (! cmp -s $1 ../$1) then + if (test -z "$2" || test -e ../$2) then + cp $1 .. + if (test $mode = 2) then + echo " updating src/$1" + fi + fi + elif (test -n "$2") then + if (test ! -e ../$2) then + rm -f ../$1 + fi + fi +} + +# force rebuild of files with LMP_MPIIO switch + +touch ../mpiio.h + +# all package files with no dependencies + +for file in *.cpp *.h; do + action $file +done + +# edit 2 Makefile.package to include/exclude LMP_MPIIO setting + +if (test $1 = 1) then + + if (test -e ../Makefile.package) then + sed -i -e 's/[^ \t]*MPIIO[^ \t]* //' ../Makefile.package + sed -i -e 's|^PKG_INC =[ \t]*|&-DLMP_MPIIO |' ../Makefile.package + + fi + +elif (test $1 = 0) then + + if (test -e ../Makefile.package) then + sed -i -e 's/[^ \t]*MPIIO[^ \t]* //' ../Makefile.package + fi + +fi diff --git a/src/MPIIO/restart_mpiio.cpp b/src/MPIIO/restart_mpiio.cpp new file mode 100644 index 0000000000..40f9327821 --- /dev/null +++ b/src/MPIIO/restart_mpiio.cpp @@ -0,0 +1,194 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Coffman (IBM) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "restart_mpiio.h" +#include "error.h" +#include "limits.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +RestartMPIIO::RestartMPIIO(LAMMPS *lmp) : Pointers(lmp) +{ + mpiio_exists = 1; + MPI_Comm_size(world,&nprocs); + MPI_Comm_rank(world,&myrank); +} + +/* ---------------------------------------------------------------------- + calls MPI_File_open in read-only mode, read_restart should call this + for some file servers it is most efficient to only read or only write +------------------------------------------------------------------------- */ + +void RestartMPIIO::openForRead(char *filename) +{ + int err = MPI_File_open(world, filename, MPI_MODE_RDONLY , + MPI_INFO_NULL, &mpifh); + if (err != MPI_SUCCESS) { + char str[MPI_MAX_ERROR_STRING+128]; + char mpiErrorString[MPI_MAX_ERROR_STRING]; + int mpiErrorStringLength; + MPI_Error_string(err, mpiErrorString, &mpiErrorStringLength); + sprintf(str,"Cannot open restart file for reading - mpi error: %s\n", + mpiErrorString); + error->one(FLERR,str); + } +} + +/* ---------------------------------------------------------------------- + calls MPI_File_open in write-only mode, write_restart should call this + for some file servers it is most efficient to only read or only write +------------------------------------------------------------------------- */ + +void RestartMPIIO::openForWrite(char *filename) +{ + int err = MPI_File_open(world, filename, MPI_MODE_APPEND | MPI_MODE_WRONLY, + MPI_INFO_NULL, &mpifh); + if (err != MPI_SUCCESS) { + char str[MPI_MAX_ERROR_STRING+128]; + char mpiErrorString[MPI_MAX_ERROR_STRING]; + int mpiErrorStringLength; + MPI_Error_string(err, mpiErrorString, &mpiErrorStringLength); + sprintf(str,"Cannot open restart file for writing - mpi error: %s\n", + mpiErrorString); + error->one(FLERR,str); + } +} + +/* ---------------------------------------------------------------------- + determine the absolute offset for the data to be written with + MPI_Scan of the send sizes + compute the file size based off the MPI_Scan send size value on the last rank + set the filesize with ftruncate via MPI_File_set_size + write the data via collective MPI-IO by calling MPI_File_write_at_all +------------------------------------------------------------------------- */ + +void RestartMPIIO::write(MPI_Offset headerOffset, int send_size, double *buf) +{ + MPI_Status mpiStatus; + long incPrefix = 0; + long longSendSize = (long) send_size; + MPI_Scan(&longSendSize,&incPrefix,1,MPI_LONG,MPI_SUM,world); + + long largestIncPrefix = incPrefix; + MPI_Bcast(&largestIncPrefix, 1, MPI_LONG, (nprocs-1), world); + + int err = MPI_File_set_size(mpifh, + (headerOffset+(largestIncPrefix*sizeof(double)))); + if (err != MPI_SUCCESS) { + char str[MPI_MAX_ERROR_STRING+128]; + char mpiErrorString[MPI_MAX_ERROR_STRING]; + int mpiErrorStringLength; + MPI_Error_string(err, mpiErrorString, &mpiErrorStringLength); + sprintf(str,"Cannot set restart file size - mpi error: %s\n", + mpiErrorString); + error->one(FLERR,str); + } + + err = MPI_File_write_at_all(mpifh,headerOffset + + ((incPrefix-longSendSize)*sizeof(double)), + 
buf,send_size,MPI_DOUBLE,&mpiStatus);
+  if (err != MPI_SUCCESS) {
+    char str[MPI_MAX_ERROR_STRING+128];
+    char mpiErrorString[MPI_MAX_ERROR_STRING];
+    int mpiErrorStringLength;
+    MPI_Error_string(err, mpiErrorString, &mpiErrorStringLength);
+    sprintf(str,"Cannot write to restart file - mpi error: %s\n",
+            mpiErrorString);
+    error->one(FLERR,str);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   read the data into buf via collective MPI-IO by calling
+   MPI_File_read_at_all with the chunkOffset and chunkSize provided
+   if the consolidated chunk size is greater than INT_MAX, which can only
+   happen in the extreme situation of reading a restart file on many
+   fewer ranks than wrote it and with relatively large data sizes,
+   follow the collective IO call with rank-independent IO to read the
+   remaining data
+------------------------------------------------------------------------- */
+
+void RestartMPIIO::read(MPI_Offset chunkOffset, long chunkSize, double *buf)
+{
+  MPI_Status mpiStatus;
+
+  int intChunkSize;
+  long remainingSize = 0;
+  if (chunkSize > INT_MAX) {
+    intChunkSize = INT_MAX;
+    remainingSize = chunkSize - INT_MAX;
+  }
+  else intChunkSize = (int) chunkSize;
+
+  int err = MPI_File_read_at_all(mpifh,chunkOffset,buf,intChunkSize,
+                                 MPI_DOUBLE,&mpiStatus);
+  if (err != MPI_SUCCESS) {
+    char str[MPI_MAX_ERROR_STRING+128];
+    char mpiErrorString[MPI_MAX_ERROR_STRING];
+    int mpiErrorStringLength;
+    MPI_Error_string(err, mpiErrorString, &mpiErrorStringLength);
+    sprintf(str,"Cannot read from restart file - mpi error: %s\n",
+            mpiErrorString);
+    error->one(FLERR,str);
+  }
+
+  // advance the file offset in bytes and the buffer offset in doubles
+
+  MPI_Offset currentOffset = chunkOffset + intChunkSize*sizeof(double);
+  MPI_Offset bufOffset = intChunkSize;
+  while (remainingSize > 0) {
+    int currentChunkSize;
+    if (remainingSize > INT_MAX) {
+      currentChunkSize = INT_MAX;
+      remainingSize -= INT_MAX;
+    }
+    else {
+      currentChunkSize = (int) remainingSize;
+      remainingSize = 0;
+    }
+    int err = MPI_File_read_at(mpifh,currentOffset,&buf[bufOffset],
+                               currentChunkSize,MPI_DOUBLE,&mpiStatus);
+    if (err != MPI_SUCCESS) {
+      char str[MPI_MAX_ERROR_STRING+128];
+      char mpiErrorString[MPI_MAX_ERROR_STRING];
+      int mpiErrorStringLength;
+      MPI_Error_string(err, mpiErrorString, &mpiErrorStringLength);
+      sprintf(str,"Cannot read from restart file - mpi error: %s\n",
+              mpiErrorString);
+      error->one(FLERR,str);
+    }
+    currentOffset += currentChunkSize*sizeof(double);
+    bufOffset += currentChunkSize;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   calls MPI_File_close
+------------------------------------------------------------------------- */
+
+void RestartMPIIO::close()
+{
+  int err = MPI_File_close(&mpifh);
+  if (err != MPI_SUCCESS) {
+    char str[MPI_MAX_ERROR_STRING+128];
+    char mpiErrorString[MPI_MAX_ERROR_STRING];
+    int mpiErrorStringLength;
+    MPI_Error_string(err, mpiErrorString, &mpiErrorStringLength);
+    sprintf(str,"Cannot close restart file - mpi error: %s\n",mpiErrorString);
+    error->one(FLERR,str);
+  }
+}
diff --git a/src/MPIIO/restart_mpiio.h b/src/MPIIO/restart_mpiio.h
new file mode 100644
index 0000000000..87478af588
--- /dev/null
+++ b/src/MPIIO/restart_mpiio.h
@@ -0,0 +1,40 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S.
Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_RESTART_MPIIO_H +#define LMP_RESTART_MPIIO_H + +#include "pointers.h" + +namespace LAMMPS_NS { + +class RestartMPIIO : protected Pointers { + private: + MPI_File mpifh; + int nprocs, myrank; + + public: + int mpiio_exists; + + RestartMPIIO(class LAMMPS *); + ~RestartMPIIO() {} + void openForRead(char *); + void openForWrite(char *); + void write(MPI_Offset, int, double *); + void read(MPI_Offset, long, double *); + void close(); +}; + +} + +#endif diff --git a/src/Makefile b/src/Makefile index 0263fbae21..93d874aa8e 100755 --- a/src/Makefile +++ b/src/Makefile @@ -14,8 +14,8 @@ OBJ = $(SRC:.cpp=.o) # Package variables PACKAGE = asphere body class2 colloid dipole fld gpu granular kim \ - kspace manybody mc meam misc molecule opt peri poems reax replica \ - rigid shock srd voronoi xtc + kspace manybody mc meam misc molecule mpiio opt peri poems \ + reax replica rigid shock srd voronoi xtc PACKUSER = user-atc user-awpmd user-cg-cmm user-colvars \ user-cuda user-eff user-lb user-misc user-omp user-molfile \ diff --git a/src/STUBS/mpi.c b/src/STUBS/mpi.c index 119f1efe0d..e7d574cc4a 100644 --- a/src/STUBS/mpi.c +++ b/src/STUBS/mpi.c @@ -108,6 +108,7 @@ int MPI_Type_size(MPI_Datatype datatype, int *size) else if (datatype == MPI_DOUBLE) *size = sizeof(double); else if (datatype == MPI_CHAR) *size = sizeof(char); else if (datatype == MPI_BYTE) *size = sizeof(char); + else if (datatype == MPI_LONG) *size = sizeof(long); else if (datatype == MPI_LONG_LONG) *size = sizeof(uint64_t); else if (datatype == MPI_DOUBLE_INT) *size = sizeof(double_int); @@ -282,6 +283,7 @@ int MPI_Allreduce(void *sendbuf, void *recvbuf, int count, else if (datatype == MPI_DOUBLE) n = count*sizeof(double); else if (datatype == MPI_CHAR) n = count*sizeof(char); else if (datatype == MPI_BYTE) n = count*sizeof(char); + else if (datatype == MPI_LONG) n = count*sizeof(long); else if (datatype == MPI_LONG_LONG) n = count*sizeof(uint64_t); else if (datatype == MPI_DOUBLE_INT) n = count*sizeof(double_int); @@ -304,6 +306,7 @@ int MPI_Reduce(void *sendbuf, void *recvbuf, int count, else if (datatype == MPI_DOUBLE) n = count*sizeof(double); else if (datatype == MPI_CHAR) n = count*sizeof(char); else if (datatype == MPI_BYTE) n = count*sizeof(char); + else if (datatype == MPI_LONG) n = count*sizeof(long); else if (datatype == MPI_LONG_LONG) n = count*sizeof(uint64_t); else if (datatype == MPI_DOUBLE_INT) n = count*sizeof(double_int); @@ -324,6 +327,7 @@ int MPI_Scan(void *sendbuf, void *recvbuf, int count, else if (datatype == MPI_DOUBLE) n = count*sizeof(double); else if (datatype == MPI_CHAR) n = count*sizeof(char); else if (datatype == MPI_BYTE) n = count*sizeof(char); + else if (datatype == MPI_LONG) n = count*sizeof(long); else if (datatype == MPI_LONG_LONG) n = count*sizeof(uint64_t); else if (datatype == MPI_DOUBLE_INT) n = count*sizeof(double_int); @@ -346,6 +350,7 @@ int MPI_Allgather(void *sendbuf, int sendcount, MPI_Datatype sendtype, else if (sendtype == MPI_DOUBLE) n = sendcount*sizeof(double); else if (sendtype == MPI_CHAR) n = sendcount*sizeof(char); else if (sendtype == MPI_BYTE) n = sendcount*sizeof(char); + else if (sendtype == MPI_LONG) n = sendcount*sizeof(long); else if (sendtype == MPI_LONG_LONG) n = sendcount*sizeof(uint64_t); else if 
(sendtype == MPI_DOUBLE_INT) n = sendcount*sizeof(double_int); @@ -368,6 +373,7 @@ int MPI_Allgatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, else if (sendtype == MPI_DOUBLE) n = sendcount*sizeof(double); else if (sendtype == MPI_CHAR) n = sendcount*sizeof(char); else if (sendtype == MPI_BYTE) n = sendcount*sizeof(char); + else if (sendtype == MPI_LONG) n = sendcount*sizeof(long); else if (sendtype == MPI_LONG_LONG) n = sendcount*sizeof(uint64_t); else if (sendtype == MPI_DOUBLE_INT) n = sendcount*sizeof(double_int); @@ -389,6 +395,7 @@ int MPI_Reduce_scatter(void *sendbuf, void *recvbuf, int *recvcounts, else if (datatype == MPI_DOUBLE) n = *recvcounts*sizeof(double); else if (datatype == MPI_CHAR) n = *recvcounts*sizeof(char); else if (datatype == MPI_BYTE) n = *recvcounts*sizeof(char); + else if (datatype == MPI_LONG) n = *recvcounts*sizeof(long); else if (datatype == MPI_LONG_LONG) n = *recvcounts*sizeof(uint64_t); else if (datatype == MPI_DOUBLE_INT) n = *recvcounts*sizeof(double_int); @@ -411,6 +418,7 @@ int MPI_Gather(void *sendbuf, int sendcount, MPI_Datatype sendtype, else if (sendtype == MPI_DOUBLE) n = sendcount*sizeof(double); else if (sendtype == MPI_CHAR) n = sendcount*sizeof(char); else if (sendtype == MPI_BYTE) n = sendcount*sizeof(char); + else if (sendtype == MPI_LONG) n = sendcount*sizeof(long); else if (sendtype == MPI_LONG_LONG) n = sendcount*sizeof(uint64_t); else if (sendtype == MPI_DOUBLE_INT) n = sendcount*sizeof(double_int); @@ -433,6 +441,7 @@ int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, else if (sendtype == MPI_DOUBLE) n = sendcount*sizeof(double); else if (sendtype == MPI_CHAR) n = sendcount*sizeof(char); else if (sendtype == MPI_BYTE) n = sendcount*sizeof(char); + else if (sendtype == MPI_LONG) n = sendcount*sizeof(long); else if (sendtype == MPI_LONG_LONG) n = sendcount*sizeof(uint64_t); else if (sendtype == MPI_DOUBLE_INT) n = sendcount*sizeof(double_int); @@ -441,6 +450,30 @@ int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, return 0; } +/* ---------------------------------------------------------------------- */ + +/* copy values from data1 to data2 */ + +int MPI_Scatter(void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm) +{ + int n; + if (sendtype == MPI_INT) n = recvcount*sizeof(int); + else if (sendtype == MPI_FLOAT) n = recvcount*sizeof(float); + else if (sendtype == MPI_DOUBLE) n = recvcount*sizeof(double); + else if (sendtype == MPI_CHAR) n = recvcount*sizeof(char); + else if (sendtype == MPI_BYTE) n = recvcount*sizeof(char); + else if (sendtype == MPI_LONG) n = recvcount*sizeof(long); + else if (sendtype == MPI_LONG_LONG) n = recvcount*sizeof(uint64_t); + else if (sendtype == MPI_DOUBLE_INT) n = recvcount*sizeof(double_int); + + if (sendbuf == MPI_IN_PLACE || recvbuf == MPI_IN_PLACE) return 0; + memcpy(recvbuf,sendbuf,n); + return 0; +} + + /* ---------------------------------------------------------------------- */ /* copy values from data1 to data2 */ @@ -455,6 +488,7 @@ int MPI_Scatterv(void *sendbuf, int *sendcounts, int *displs, else if (sendtype == MPI_DOUBLE) n = recvcount*sizeof(double); else if (sendtype == MPI_CHAR) n = recvcount*sizeof(char); else if (sendtype == MPI_BYTE) n = recvcount*sizeof(char); + else if (sendtype == MPI_LONG) n = recvcount*sizeof(long); else if (sendtype == MPI_LONG_LONG) n = recvcount*sizeof(uint64_t); else if (sendtype == MPI_DOUBLE_INT) n = recvcount*sizeof(double_int); diff 
--git a/src/STUBS/mpi.h b/src/STUBS/mpi.h index 752465e224..00db42755d 100644 --- a/src/STUBS/mpi.h +++ b/src/STUBS/mpi.h @@ -16,6 +16,10 @@ #include "stdlib.h" +#ifdef LMP_MPIIO +#error Cannot build serial LAMMPS with MPIIO package +#endif + /* use C bindings for MPI interface */ #ifdef __cplusplus @@ -33,8 +37,9 @@ extern "C" { #define MPI_DOUBLE 3 #define MPI_CHAR 4 #define MPI_BYTE 5 -#define MPI_LONG_LONG 6 -#define MPI_DOUBLE_INT 7 +#define MPI_LONG 6 +#define MPI_LONG_LONG 7 +#define MPI_DOUBLE_INT 8 #define MPI_SUM 1 #define MPI_MAX 2 @@ -52,6 +57,7 @@ extern "C" { #define MPI_Request int #define MPI_Datatype int #define MPI_Op int +#define MPI_Offset long #define MPI_IN_PLACE NULL @@ -133,6 +139,9 @@ int MPI_Gather(void *sendbuf, int sendcount, MPI_Datatype sendtype, int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype, int root, MPI_Comm comm); +int MPI_Scatter(void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm); int MPI_Scatterv(void *sendbuf, int *sendcounts, int *displs, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm); diff --git a/src/mpiio.h b/src/mpiio.h index aa4d659dda..9b67fac532 100644 --- a/src/mpiio.h +++ b/src/mpiio.h @@ -34,9 +34,10 @@ class RestartMPIIO { RestartMPIIO(class LAMMPS *) {mpiio_exists = 0;} ~RestartMPIIO() {} - void open() {} - void write() {} - void read() {} + void openForRead(char *) {} + void openForWrite(char *) {} + void write(MPI_Offset,int,double *) {} + void read(MPI_Offset,long,double *) {} void close() {} }; diff --git a/src/read_restart.cpp b/src/read_restart.cpp index 31549b69dc..2d6db0ec87 100644 --- a/src/read_restart.cpp +++ b/src/read_restart.cpp @@ -98,7 +98,7 @@ void ReadRestart::command(int narg, char **arg) if (strchr(arg[0],'%')) multiproc = 1; else multiproc = 0; - if (strstr(arg[0],".mpi")) mpiioflag = 1; + if (strstr(arg[0],".mpiio")) mpiioflag = 1; else mpiioflag = 0; if (multiproc && mpiioflag) @@ -190,19 +190,13 @@ void ReadRestart::command(int narg, char **arg) // MPI-IO input from single file if (mpiioflag) { - // add calls to RestartMPIIO class - // reopen header file - // perform reads - // allow for different # of procs reading than wrote the file + mpiio->openForRead(file); + memory->create(buf,assignedChunkSize,"read_restart:buf"); + mpiio->read((headerOffset+assignedChunkOffset),assignedChunkSize,buf); + mpiio->close(); - // mpiio->open(file); - // mpiio->read(); - // mpiio->close(); - - // then process atom info as - - //m = 0; - //while (m < n) m += avec->unpack_restart(&buf[m]); + m = 0; + while (m < assignedChunkSize) m += avec->unpack_restart(&buf[m]); } // input of single native file @@ -937,13 +931,81 @@ void ReadRestart::file_layout() error->all(FLERR,"Restart file is a MPI-IO file"); if (mpiioflag && mpiioflag_file == 0) error->all(FLERR,"Restart file is not a MPI-IO file"); + + if (mpiioflag) { + long *nproc_chunk_offsets; + memory->create(nproc_chunk_offsets,nprocs, + "write_restart:nproc_chunk_offsets"); + long *nproc_chunk_sizes; + memory->create(nproc_chunk_sizes,nprocs, + "write_restart:nproc_chunk_sizes"); + + // on rank 0 read in the chunk sizes that were written out + // then consolidate them and compute offsets relative to the + // end of the header info to fit the current partition size + // if the number of ranks that did the writing is different + + if (me == 0) { + int 
*all_written_send_sizes;
+      memory->create(all_written_send_sizes,nprocs_file,
+                     "write_restart:all_written_send_sizes");
+      int *nproc_chunk_number;
+      memory->create(nproc_chunk_number,nprocs,
+                     "write_restart:nproc_chunk_number");
+
+      fread(all_written_send_sizes,sizeof(int),nprocs_file,fp);
+
+      int init_chunk_number = nprocs_file/nprocs;
+      int num_extra_chunks = nprocs_file - (nprocs*init_chunk_number);
+
+      for (int i = 0; i < nprocs; i++) {
+        if (i < num_extra_chunks)
+          nproc_chunk_number[i] = init_chunk_number+1;
+        else
+          nproc_chunk_number[i] = init_chunk_number;
+      }
+
+      // walk the written chunk sizes, assigning each reading rank its
+      // consecutive chunks and the byte offset at which they start
+
+      int all_written_send_sizes_index = 0;
+      long current_offset = 0;
+      for (int i = 0; i < nprocs; i++) {
+        nproc_chunk_offsets[i] = current_offset;
+        nproc_chunk_sizes[i] = 0;
+        for (int j = 0; j < nproc_chunk_number[i]; j++) {
+          nproc_chunk_sizes[i] +=
+            all_written_send_sizes[all_written_send_sizes_index];
+          current_offset +=
+            (all_written_send_sizes[all_written_send_sizes_index] *
+             sizeof(double));
+          all_written_send_sizes_index++;
+        }
+      }
+
+      memory->destroy(all_written_send_sizes);
+      memory->destroy(nproc_chunk_number);
+    }
+
+    // scatter chunk sizes and offsets to all procs
+
+    MPI_Scatter(nproc_chunk_sizes, 1, MPI_LONG,
+                &assignedChunkSize, 1, MPI_LONG, 0,world);
+    MPI_Scatter(nproc_chunk_offsets, 1, MPI_LONG,
+                &assignedChunkOffset, 1, MPI_LONG, 0,world);
+
+    memory->destroy(nproc_chunk_sizes);
+    memory->destroy(nproc_chunk_offsets);
+  }
 }

-  // NOTE: could add reading of MPI-IO specific fields to header here
-  // e.g. read vector of PERPROCSIZE values
-
    flag = read_int();
  }
+
+  // if MPI-IO file, broadcast the end-of-header offset
+  // this allows all ranks to compute the offset to their data
+
+  if (mpiioflag) {
+    if (me == 0) headerOffset = ftell(fp);
+    MPI_Bcast(&headerOffset,1,MPI_LONG,0,world);
+  }
 }

// ----------------------------------------------------------------------
diff --git a/src/read_restart.h b/src/read_restart.h
index e074d46f71..d61140cd4a 100644
--- a/src/read_restart.h
+++ b/src/read_restart.h
@@ -38,8 +38,13 @@ class ReadRestart : protected Pointers {
   int multiproc;             // 0 = proc 0 writes for all
                              // else # of procs writing files

+  // MPI-IO values
+  int mpiioflag;               // 1 for MPIIO input, else 0
   class RestartMPIIO *mpiio;   // MPIIO for restart file input
+  int numChunksAssigned;
+  long assignedChunkSize;
+  MPI_Offset assignedChunkOffset,headerOffset;

   void file_search(char *, char *);
   void header(int);
diff --git a/src/write_restart.cpp b/src/write_restart.cpp
index 541bf45942..2be1cfecfd 100644
--- a/src/write_restart.cpp
+++ b/src/write_restart.cpp
@@ -98,7 +98,7 @@ void WriteRestart::command(int narg, char **arg)
   if (strchr(arg[0],'%')) multiproc = nprocs;
   else multiproc = 0;

-  if (strstr(arg[0],".mpi")) mpiioflag = 1;
+  if (strstr(arg[0],".mpiio")) mpiioflag = 1;
   else mpiioflag = 0;

   // setup output style and process optional args
@@ -380,13 +380,10 @@ void WriteRestart::write(char *file)
   // MPI-IO output to single file

   if (mpiioflag) {
-    // add calls to RestartMPIIO class
-    // reopen header file in append mode
-    // perform writes
-
-    // mpiio->open(file);
-    // mpiio->write(send_size,buf);
-    // mpiio->close();
+    if (me == 0) fclose(fp);
+    mpiio->openForWrite(file);
+    mpiio->write(headerOffset,send_size,buf);
+    mpiio->close();
   }

   // output of one or more native files
@@ -562,8 +559,13 @@ void WriteRestart::file_layout(int send_size)
     write_int(MPIIO,mpiioflag);
   }

-  // NOTE: could add MPI-IO specific fields to header here
-  // e.g. gather send_size across all procs and call write_int_vec()
+  if (mpiioflag) {
+    int *all_send_sizes;
+    memory->create(all_send_sizes,nprocs,"write_restart:all_send_sizes");
+    MPI_Gather(&send_size, 1, MPI_INT, all_send_sizes, 1, MPI_INT, 0,world);
+    if (me == 0) fwrite(all_send_sizes,sizeof(int),nprocs,fp);
+    memory->destroy(all_send_sizes);
+  }

   // -1 flag signals end of file layout info

@@ -571,6 +573,14 @@
     int flag = -1;
     fwrite(&flag,sizeof(int),1,fp);
   }
+
+  // if MPI-IO file, broadcast the end-of-header offset
+  // this allows all ranks to compute the offset to their data
+
+  if (mpiioflag) {
+    if (me == 0) headerOffset = ftell(fp);
+    MPI_Bcast(&headerOffset,1,MPI_LONG,0,world);
+  }
 }

// ----------------------------------------------------------------------
diff --git a/src/write_restart.h b/src/write_restart.h
index be8f716cd5..be8b3aad94 100644
--- a/src/write_restart.h
+++ b/src/write_restart.h
@@ -44,8 +44,11 @@ class WriteRestart : protected Pointers {
   int fileproc;         // ID of proc in my cluster who writes to file
   int icluster;         // which cluster I am in

+  // MPI-IO values
+  int mpiioflag;               // 1 for MPIIO output, else 0
   class RestartMPIIO *mpiio;   // MPIIO for restart file output
+  MPI_Offset headerOffset;

   void header();
   void type_arrays();
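Because MPI transfer counts are plain ints, RestartMPIIO::read() above
falls back to chunking when a rank is assigned more than INT_MAX
doubles: one collective call, which every rank must enter, followed by
independent reads for the remainder.  A condensed sketch of that
pattern with an assumed helper name, not LAMMPS code:

#include <mpi.h>
#include <climits>

// read nitems doubles starting at byte offset 'offset' into buf
void chunked_read(MPI_File fh, MPI_Offset offset, long nitems, double *buf)
{
  int first = (nitems > INT_MAX) ? INT_MAX : (int) nitems;
  long remaining = nitems - first;

  // the collective call must be made exactly once on every rank
  MPI_File_read_at_all(fh,offset,buf,first,MPI_DOUBLE,MPI_STATUS_IGNORE);

  MPI_Offset cur = offset + (MPI_Offset) first*sizeof(double);
  long done = first;
  while (remaining > 0) {
    int count = (remaining > INT_MAX) ? INT_MAX : (int) remaining;
    MPI_File_read_at(fh,cur,&buf[done],count,MPI_DOUBLE,MPI_STATUS_IGNORE);
    cur += (MPI_Offset) count*sizeof(double);
    done += count;
    remaining -= count;
  }
}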