diff --git a/couple/lammps_spparks/image.lammps.1.png b/couple/lammps_spparks/image.lammps.1.png deleted file mode 100644 index 2afdecf84f..0000000000 Binary files a/couple/lammps_spparks/image.lammps.1.png and /dev/null differ diff --git a/couple/lammps_spparks/image.lammps.10.png b/couple/lammps_spparks/image.lammps.10.png deleted file mode 100644 index 6aa584f811..0000000000 Binary files a/couple/lammps_spparks/image.lammps.10.png and /dev/null differ diff --git a/couple/lammps_spparks/image.lammps.20.png b/couple/lammps_spparks/image.lammps.20.png deleted file mode 100644 index 8e0c225ae2..0000000000 Binary files a/couple/lammps_spparks/image.lammps.20.png and /dev/null differ diff --git a/couple/lammps_spparks/image.spparks.1.png b/couple/lammps_spparks/image.spparks.1.png deleted file mode 100644 index 0e0ea0c62d..0000000000 Binary files a/couple/lammps_spparks/image.spparks.1.png and /dev/null differ diff --git a/couple/lammps_spparks/image.spparks.10.png b/couple/lammps_spparks/image.spparks.10.png deleted file mode 100644 index 8d55cc907b..0000000000 Binary files a/couple/lammps_spparks/image.spparks.10.png and /dev/null differ diff --git a/couple/lammps_spparks/image.spparks.20.png b/couple/lammps_spparks/image.spparks.20.png deleted file mode 100644 index ce283569c9..0000000000 Binary files a/couple/lammps_spparks/image.spparks.20.png and /dev/null differ diff --git a/doc/Section_howto.html b/doc/Section_howto.html index ea4e93f11c..2d9cca3028 100644 --- a/doc/Section_howto.html +++ b/doc/Section_howto.html @@ -659,8 +659,8 @@ invoked with minimal overhead (no setup or clean-up) if you wish to do multiple short runs, driven by another program.
Examples of driver codes that call LAMMPS as a library are included in -the "couple" directory of the LAMMPS distribution; see couple/README -for more details: +the examples/COUPLE directory of the LAMMPS distribution; see +examples/COUPLE/README for more details:
The key idea of the library interface is that you can write any functions you wish to define how your code talks to LAMMPS and add them to src/library.cpp and src/library.h, as well as to the Python
-interface. The routines you add can access
-or change any LAMMPS data you wish. The couple and python directories
-have example C++ and C and Python codes which show how a driver code
-can link to LAMMPS as a library, run LAMMPS on a subset of processors,
-grab data from LAMMPS, change it, and put it back into LAMMPS.
+interface. The routines you add can access or
+change any LAMMPS data you wish. The examples/COUPLE and python
+directories have example C++, C, and Python codes which show how a
+driver code can link to LAMMPS as a library, run LAMMPS on a subset of
+processors, grab data from LAMMPS, change it, and put it back into
+LAMMPS.
This section describes how to build and use LAMMPS via a Python interface.
-The LAMMPS distribution includes some Python code in its python -directory which wraps the library interface to LAMMPS. This makes it -is possible to run LAMMPS, invoke LAMMPS commands or give it an input -script, extract LAMMPS results, an modify internal LAMMPS variables, -either from a Python script or interactively from a Python prompt. +
The LAMMPS distribution includes the file python/lammps.py which wraps
+the library interface to LAMMPS. This file makes it possible to
+run LAMMPS, invoke LAMMPS commands or give it an input script, extract
+LAMMPS results, and modify internal LAMMPS variables, either from a
+Python script or interactively from a Python prompt. You can do the
+former in serial or parallel. Running Python interactively in
+parallel does not generally work, unless you have a package installed
+that extends your Python to enable multiple instances of Python to
+read what you type.
Python is a powerful scripting and programming language which can be used to wrap software like LAMMPS and other packages. It can be used to glue multiple pieces of software
-together, e.g. to run a coupled or multiscale model. See this
+together, e.g. to run a coupled or multiscale model. See Section
section of the manual and the couple directory of the distribution for more ideas about coupling LAMMPS to other codes. See Section_start 4 about
-how to build LAMMPS as a library, and this
-section for a description of the library
+how to build LAMMPS as a library, and Section_howto
+19 for a description of the library
interface provided in src/library.cpp and src/library.h and how to extend it for your needs. As described below, that interface is what is exposed to Python. It is designed to be easy to add functions to.
-This has the effect of extending the Python inteface as well. See
-details below.
+This makes it easy to extend the Python interface as well. See details
+below.
-By using the Python interface LAMMPS can also be coupled with a GUI or -visualization tools that display graphs or animations in real time as -LAMMPS runs. Examples of such scripts are inlcluded in the python -directory. +
By using the Python interface, LAMMPS can also be coupled with a GUI
+or other visualization tools that display graphs or animations in real
+time as LAMMPS runs. Examples of such scripts are included in the
+python directory.
-Two advantages of using Python are how concise the language is and +
Two advantages of using Python are how concise the language is, and that it can be run interactively, enabling rapid development and debugging of programs. If you use it to mostly invoke costly operations within LAMMPS, such as running a simulation for a reasonable number of timesteps, then the overhead cost of invoking LAMMPS thru Python will be negligible.
-Before using LAMMPS from a Python script, the Python on your machine -must be "extended" to include an interface to the LAMMPS library. If -your Python script will invoke MPI operations, you will also need to -extend your Python with an interface to MPI itself. +
Before using LAMMPS from a Python script, you have to do two things:
+set two environment variables, and build LAMMPS as a dynamic shared
+library so it can be loaded by Python. Both these steps are discussed
+below. If you wish to run LAMMPS in
+parallel from Python, you also need to extend your Python with MPI.
+This is also discussed below.
-Thus you should first decide how you intend to use LAMMPS from Python. -There are 3 options: -
-(1) Use LAMMPS on a single processor running Python. -
-(2) Use LAMMPS in parallel, where each processor runs Python, but your -Python program does not use MPI. -
-(3) Use LAMMPS in parallel, where each processor runs Python, and your -Python script also makes MPI calls through a Python/MPI interface. -
-Note that for (2) and (3) you will not be able to use Python -interactively by typing commands and getting a response. This is -because you will have multiple instances of Python running (e.g. on a -parallel machine) and they cannot all read what you type. -
-Working in mode (1) does not require your machine to have MPI -installed. You should extend your Python with a serial version of -LAMMPS and the dummy MPI library provided with LAMMPS. See -instructions below on how to do this. -
-Working in mode (2) requires your machine to have an MPI library -installed, but your Python does not need to be extended with MPI -itself. The MPI library must be a shared library (e.g. a *.so file on -Linux) which is not typically created when MPI is built/installed. -See instruction below on how to do this. You should extend your -Python with the a parallel versionn of LAMMPS which will use the -shared MPI system library. See instructions below on how to do this. -
-Working in mode (3) requires your machine to have MPI installed (as a -shared library as in (2)). You must also extend your Python with a -parallel version of LAMMPS (same as in (2)) and with MPI itself, via -one of several available Python/MPI packages. See instructions below -on how to do the latter task. -
-Several of the following sub-sections cover the rest of the Python -setup discussion. The next to last sub-section describes the Python -syntax used to invoke LAMMPS. The last sub-section describes example -Python scripts included in the python directory. -
-Before proceeding, there are 2 items to note. -
-(1) The provided Python wrapper for LAMMPS uses the amazing and -magical (to me) "ctypes" package in Python, which auto-generates the -interface code needed between Python and a set of C interface routines -for a library. Ctypes is part of standard Python for versions 2.5 and -later. You can check which version of Python you have installed, by -simply typing "python" at a shell prompt. -
-(2) Any library wrapped by Python, including LAMMPS, must be built as -a shared library (e.g. a *.so file on Linux and not a *.a file). The -python/setup_serial.py and setup.py scripts do this build for LAMMPS -itself (described below). But if you have LAMMPS configured to use -additional packages that have their own libraries, then those -libraries must also be shared libraries. E.g. MPI, FFTW, or any of -the libraries in lammps/lib. When you build LAMMPS as a stand-alone -code, you are not building shared versions of these libraries. -
-The discussion below describes how to create a shared MPI library. I -suggest you start by configuing LAMMPS without packages installed that -require any libraries besides MPI. See this -section of the manual for a discussion of -LAMMPS packages. E.g. do not use the KSPACE, GPU, MEAM, POEMS, or -REAX packages. -
-If you are successfully follow the steps belwo to build the Python -wrappers and use this version of LAMMPS through Python, you can then -take the next step of adding LAMMPS packages that use additional -libraries. This will require you to build a shared library for that -package's library, similar to what is described below for MPI. It -will also require you to edit the python/setup_serial.py or setup.py -scripts to enable Python to access those libraries when it builds the -LAMMPS wrapper. +
The Python wrapper for LAMMPS uses the amazing and magical (to me) +"ctypes" package in Python, which auto-generates the interface code +needed between Python and a set of C interface routines for a library. +Ctypes is part of standard Python for versions 2.5 and later. You can +check which version of Python you have installed, by simply typing +"python" at a shell prompt.
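As a rough illustration of the ctypes pattern (the library and function names here are hypothetical, not the real LAMMPS interface), this is all it takes to call a C function in any shared library from Python:

from ctypes import CDLL, c_double

# dlopen() a shared library; ctypes generates the Python-to-C glue
lib = CDLL("libexample.so")
# declare the return type of a C function this example library exports
lib.example_energy.restype = c_double
eng = lib.example_energy()   # call the C routine as if it were Python
print "energy =", eng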
From the python directory in the LAMMPS distribution, type +
For Python to use the LAMMPS interface, it needs to find two files. +The paths to these files need to be added to two environment variables +that Python checks.
-python setup_serial.py build +The first is the environment variable PYTHONPATH. It needs +to include the directory where the python/lammps.py file is. +
+For the csh or tcsh shells, you could add something like this to your +~/.cshrc file: +
+setenv PYTHONPATH $PYTHONPATH:/home/sjplimp/lammps/python-and then one of these commands: +
The second is the environment variable LD_LIBRARY_PATH, which is used +by the operating system to find dynamic shared libraries when it loads +them. It needs to include the directory where the shared LAMMPS +library will be. Normally this is the LAMMPS src dir, as explained in +the following section.
-sudo python setup_serial.py install -python setup_serial.py install --home=~/foo +For the csh or tcsh shells, you could add something like this to your +~/.cshrc file: +
+setenv LD_LIBRARY_PATH $LD_LIBRARY_PATH:/home/sjplimp/lammps/src-The "build" command should compile all the needed LAMMPS files, -including its dummy MPI library. The first "install" command will put -the needed files in your Python's site-packages sub-directory, so that -Python can load them. For example, if you installed Python yourself -on a Linux machine, it would typically be somewhere like -/usr/local/lib/python2.5/site-packages. Installing Python packages -this way often requires you to be able to write to the Python -directories, which may require root priveleges, hence the "sudo" -prefix. If this is not the case, you can drop the "sudo". If you use -the "sudo" prefix and you have installed Python yourself, you should -make sure that root uses the same Python as the one you did the -"install" in. E.g. these 2 commands may do the install in different -Python versions: +
As discussed below, if your LAMMPS build includes auxiliary libraries, +they must also be available as shared libraries for Python to +successfully load LAMMPS. If they are not in default places where the +operating system can find them, then you also have to add their paths +to the LD_LIBRARY_PATH environment variable.
-python setup_serial.py install --home=~/foo -python /usr/local/bin/python/setup_serial.py install --home=~/foo +-For example, if you are using the dummy MPI library provided in +src/STUBS, you need to add something like this to your ~/.cshrc file: +
+setenv LD_LIBRARY_PATH $LD_LIBRARY_PATH:/home/sjplimp/lammps/src/STUBS-Alternatively, you can install the LAMMPS files (or any other Python -packages) in your own user space. The second "install" command does -this, where you should replace "foo" with your directory of choice. -
-If these commands are successful, a lammps.py and -_lammps_serial.so file will be put in the appropriate directory. +
If you are using the LAMMPS USER-ATC package, you need to add +something like this to your ~/.cshrc file:
+setenv LD_LIBRARY_PATH $LD_LIBRARY_PATH:/home/sjplimp/lammps/lib/atc +
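A quick way to sanity-check both settings is from Python itself (a sketch; the paths printed will be whatever you set above):

import os, sys

# the PYTHONPATH entries show up in sys.path
print "PYTHONPATH:", os.environ.get("PYTHONPATH", "(not set)")
print "LD_LIBRARY_PATH:", os.environ.get("LD_LIBRARY_PATH", "(not set)")
# lammps.py must be visible in one of the sys.path directories
print [d for d in sys.path if os.path.isfile(os.path.join(d, "lammps.py"))]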
-11.2 Creating a shared MPI library +
11.2 Building LAMMPS as a shared library
-A shared library is one that is dynamically loadable, which is what -Python requires. On Linux this is a library file that ends in ".so", -not ".a". Such a shared library is normally not built if you -installed MPI yourself, but it is easy to do. Here is how to do it -for MPICH, a popular open-source version of MPI, distributed -by Argonne National Labs. From within the mpich directory, type +
Instructions on how to build LAMMPS as a shared library are given in +Section_start 5. A shared library is one +that is dynamically loadable, which is what Python requires. On Linux +this is a library file that ends in ".so", not ".a". +
+From the src directory, type
+
+make makeshlib +make -f Makefile.shlib foo +
+where foo is the machine target name, such as linux or g++ or serial. +This should create the file liblmp_foo.so in the src directory, as +well as a soft link liblmp.so which is what the Python wrapper will +load by default. If you are building multiple machine versions of the +shared library, the soft link is always set to the most recently built +version. +
+Note that as discussed below, a LAMMPS build may depend on several
+auxiliary libraries, which are specified in your low-level
+src/Makefile.foo file. For example, an MPI library, the FFTW library,
+a JPEG library, etc. Depending on what LAMMPS packages you have
+installed, the build may also require additional libraries from the
+lib directories, such as lib/atc/libatc.so or lib/reax/libreax.so.
+
+You must ensure that each of these libraries exists in shared library
+form (*.so file for Linux systems), or either the LAMMPS shared
+library build or the Python load of the library will fail. For the
+load to be successful, all the shared libraries must also be in
+directories that the operating system checks. See the discussion in
+the preceding section about the LD_LIBRARY_PATH environment variable
+for how to ensure this.
+
+Note that some system libraries, such as an MPI library you installed
+yourself, may not be built as shared libraries by default. The build
+instructions for the library should tell you how to do this.
+
+For example, here is how to build and install the MPICH +library, a popular open-source version of MPI, distributed by +Argonne National Labs, as a shared library in the default +/usr/local/lib location:
@@ -190,62 +168,26 @@ by Argonne National Labs. From within the mpich directory, type make make installYou may need to use "sudo make install" in place of the last line. -The end result should be the file libmpich.so in /usr/local/lib. +
You may need to use "sudo make install" in place of the last line if
+you do not have write privileges for /usr/local/lib. The end result
+should be the file /usr/local/lib/libmpich.so.
-IMPORTANT NOTE: If the file libmpich.a already exists in your -installation directory (e.g. /usr/local/lib), you will now have both a -static and shared MPI library. This will be fine for running LAMMPS -from Python since it only uses the shared library. But if you now try -to build LAMMPS by itself as a stand-alone program (cd lammps/src; -make foo) or build other codes that expect to link against libmpich.a, -then those builds may fail if the linker uses libmpich.so instead. If -this happens, it means you will need to remove the file -/usr/local/lib/libmich.so before building LAMMPS again as a -stand-alone code. +
Note that not all of the auxiliary libraries provided with LAMMPS have
+shared-library Makefiles in their lib directories. Typically this
+simply requires a Makefile.foo that adds a -fPIC switch when files are
+compiled and "-fPIC -shared" switches when the library is linked
+with a C++ (or Fortran) compiler, as well as an output target that
+ends in ".so", like libatc.so. As we or others create and contribute
+these Makefiles, we will add them to the LAMMPS distribution.
-11.3 Extending Python with a parallel version of LAMMPS +
11.3 Extending Python with MPI to run in parallel
-From the python directory, type +
If you wish to run LAMMPS in parallel from Python, you need to extend
+your Python with an interface to MPI. This also allows you to
+make MPI calls directly from your Python script, if you desire.
-python setup.py build --and then one of these commands: -
-sudo python setup.py install -python setup.py install --home=~/foo --The "build" command should compile all the needed LAMMPS C++ files, -which will require MPI to be installed on your system. This means it -must find both the header file mpi.h and a shared library file, -e.g. libmpich.so if the MPICH version of MPI is installed. See the -preceding section for how to create a shared library version of MPI if -it does not exist. You may need to adjust the "include_dirs" and -"library_dirs" and "libraries" fields in python/setup.py to -insure the Python build finds all the files it needs. -
-The first "install" command will put the needed files in your Python's -site-packages sub-directory, so that Python can load them. For -example, if you installed Python yourself on a Linux machine, it would -typically be somewhere like /usr/local/lib/python2.5/site-packages. -Installing Python packages this way often requires you to be able to -write to the Python directories, which may require root priveleges, -hence the "sudo" prefix. If this is not the case, you can drop the -"sudo". -
-Alternatively, you can install the LAMMPS files (or any other Python -packages) in your own user space. The second "install" command does -this, where you should replace "foo" with your directory of choice. -
-If these commands are successful, a lammps.py and _lammps.so file -will be put in the appropriate directory. -
-
- -11.4 Extending Python with MPI -
There are several Python packages available that purport to wrap MPI as a library and allow MPI functions to be called from Python.
@@ -260,26 +202,26 @@ as a library and allow MPI functions to be called from Python.All of these except pyMPI work by wrapping the MPI library (which must be available on your system as a shared library, as discussed above), and exposing (some portion of) its interface to your Python script. -This means they cannot be used interactively in parallel, since they +This means Python cannot be used interactively in parallel, since they do not address the issue of interactive input to multiple instances of Python running on different processors. The one exception is pyMPI, which alters the Python interpreter to address this issue, and (I -believe) creates a new alternate executable (in place of python +believe) creates a new alternate executable (in place of "python" itself) as a result.
In principle any of these Python/MPI packages should work to invoke
-both calls to LAMMPS and MPI itself from a Python script running in
-parallel. However, when I downloaded and looked at a few of them,
-their docuemtation was incomplete and I had trouble with their
-installation. It's not clear if some of the packages are still being
-actively developed and supported.
+LAMMPS in parallel, and to make MPI calls directly, from a Python
+script which is itself running in parallel. However, when I downloaded
+and looked at a few of them, their documentation was incomplete and I
+had trouble with their installation. It's not clear if some of the
+packages are still being actively developed and supported.
The one I recommend, since I have successfully used it with LAMMPS, is Pypar. Pypar requires the ubiquitous Numpy package be installed in your Python. After launching python, type
->>> import numpy +import numpyto see if it is installed. If not, here is how to install it (version 1.3.0b1 as of April 2009). Unpack the numpy tarball and from its @@ -303,106 +245,124 @@ your Python distribution's site-packages directory.
If you have successully installed Pypar, you should be able to run python serially and type
->>> import pypar +import pyparwithout error. You should also be able to run python in parallel on a simple test script
-% mpirun -np 4 python test.script +% mpirun -np 4 python test.py-where test.script contains the lines +
where test.py contains the lines
import pypar print "Proc %d out of %d procs" % (pypar.rank(),pypar.size())-and see one line of output for each processor you ran on. +
and see one line of output for each processor you run on.
-11.5 Testing the Python-LAMMPS interface +
11.4 Testing the Python-LAMMPS interface
-Before using LAMMPS in a Python program, one more step is needed. The -interface to LAMMPS is via the Python ctypes package, which loads the -shared LAMMPS library via a CDLL() call, which in turn is a wrapper on -the C-library dlopen(). This command is different than a normal -Python "import" and needs to be able to find the LAMMPS shared -library, which is either in the Python site-packages directory or in a -local directory you specified in the "python setup.py install" -command, as described above. -
-The simplest way to do this is add a line like this to your -.cshrc or other shell start-up file. -
-setenv LD_LIBRARY_PATH -${LD_LIBRARY_PATH}:/usr/local/lib/python2.5/site-packages --and then execute the shell file to insure the path has been updated. -This will extend the path that dlopen() uses to look for shared -libraries. -
-To test if the serial LAMMPS library has been successfully installed -(mode 1 above), launch Python and type +
To test if LAMMPS is callable from Python, launch Python interactively +and type:
>>> from lammps import lammps >>> lmp = lammps()-If you get no errors, you're ready to use serial LAMMPS from Python. +
If you get no errors, you're ready to use LAMMPS from Python. +If the load fails, the most common error to see is
-If you built LAMMPS for parallel use (mode 2 or 3 above), launch -Python in parallel: +
"CDLL: asdfasdfasdf"
-% mpirun -np 4 python test.script +which means Python was unable to load the LAMMPS shared library. This +can occur if it can't find the LAMMMPS library; see the environment +variable discussion above. Or if it can't find one of the +auxiliary libraries that was specified in the LAMMPS build, in a +shared dynamic library format. This includes all libraries needed by +main LAMMPS (e.g. MPI or FFTW or JPEG), system libraries needed by +main LAMMPS (e.g. extra libs needed by MPI), or packages you have +installed that require libraries provided with LAMMPS (e.g. the +USER-ATC package require lib/atc/libatc.so) or system libraries +(e.g. BLAS or Fortran-to-C libraries) listed in the +lib/package/Makefile.lammps file. Again, all of these must be +available as shared libraries, or the Python load will fail. +
+Python (actually the operating system) isn't verbose about telling you +why the load failed, so go through the steps above and in +Section_start 5 carefully. +
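One way to see the operating system's actual complaint is to attempt
the load yourself with ctypes, which reports the underlying dlopen()
error message (a sketch, assuming the liblmp.so soft link described in
the shared-library section above):

from ctypes import CDLL

try:
    CDLL("liblmp.so")
    print "shared library loaded OK"
except OSError, err:
    # the message typically names the missing library or symbol
    print "dlopen failed:", err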
+Test LAMMPS and Python in serial: +
+To run a LAMMPS test in serial, type these lines into Python +interactively from the bench directory: +
+>>> from lammps import lammps +>>> lmp = lammps() +>>> lmp.file("in.lj")-where test.script contains the lines +
Or put the same lines in the file test.py and run it as +
+% python test.py ++Either way, you should see the results of running the in.lj benchmark +on a single processor appear on the screen, the same as if you had +typed something like: +
+lmp_g++ < in.lj ++Test LAMMPS and Python in parallel: +
+To run LAMMPS in parallel, assuming you have installed the +Pypar package as discussed +above, create a test.py file containing these lines:
import pypar from lammps import lammps lmp = lammps() -print "Proc %d out of %d procs has" % (pypar.rank(),pypar.size()), lmp +lmp.file("in.lj") +print "Proc %d out of %d procs has" % (pypar.rank(),pypar.size()),lmp pypar.finalize()-Again, if you get no errors, you're good to go. +
You can then run it in parallel as:
-Note that if you left out the "import pypar" line from this script, -you would instantiate and run LAMMPS independently on each of the P -processors specified in the mpirun command. You can test if Pypar is -enabling true parallel Python and LAMMPS by adding a line to the above -sequence of commands like lmp.file("in.lj") to run an input script and -see if the LAMMPS run says it ran on P processors or if you get output -from P duplicated 1-processor runs written to the screen. In the -latter case, Pypar is not working correctly. -
-Note that this line: -
-from lammps import lammps +% mpirun -np 4 python test.py-will import either the serial or parallel version of the LAMMPS -library, as wrapped by lammps.py. But if you installed both via -setup_serial.py and setup.py, it will always import the parallel -version, since it attempts that first. +
and you should see the same output as if you had typed
-Note that if your Python script imports the Pypar package (as above), -so that it can use MPI calls directly, then Pypar initializes MPI for -you. Thus the last line of your Python script should be -pypar.finalize(), to insure MPI is shut down correctly. +
% mpirun -np 4 lmp_g++ < in.lj
+Note that if you leave out the 3 lines from test.py that specify Pypar
+commands, you will instantiate and run LAMMPS independently on each of
+the P processors specified in the mpirun command. In this case you
+should get 4 sets of output, each showing that a run was made on a
+single processor, instead of one set of output showing that it ran on
+4 processors. If the 1-processor outputs occur, it means that Pypar
+is not working correctly.
-Also note that a Python script can be invoked in one of several ways: +
Also note that once you import the Pypar module, Pypar initializes MPI
+for you, and you can use MPI calls directly in your Python script, as
+described in the Pypar documentation. The last line of your Python
+script should be pypar.finalize(), to ensure MPI is shut down
+correctly.
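Putting these ordering rules together, the skeleton of any parallel
LAMMPS-from-Python script looks like this (a sketch restating the
test.py example above):

import pypar              # importing Pypar initializes MPI
from lammps import lammps

lmp = lammps()            # runs on MPI_COMM_WORLD (see the note below)
# ... LAMMPS commands and/or direct MPI calls go here ...
lmp.close()
pypar.finalize()          # last line: shut MPI down cleanly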
-% python foo.script +
Note that any Python script (not just for LAMMPS) can be invoked in +one of several ways: +
+% python foo.script % python -i foo.script -% foo.script - +% foo.script +The last command requires that the first line of the script be something like this:
-#!/usr/local/bin/python -#!/usr/local/bin/python -i -
+#!/usr/local/bin/python +#!/usr/local/bin/python -i +where the path points to where you have Python installed, and that you have made the script file executable:
-% chmod +x foo.script -
+% chmod +x foo.script +Without the "-i" flag, Python will exit when the script finishes. With the "-i" flag, you will be left in the Python interpreter when the script finishes, so you can type subsequent commands. As @@ -413,14 +373,15 @@ Python on a single processor, not in parallel.
-11.6 Using LAMMPS from Python +
11.5 Using LAMMPS from Python
The Python interface to LAMMPS consists of a Python "lammps" module, the source code for which is in python/lammps.py, which creates a "lammps" object, with a set of methods that can be invoked on that object. The sample Python code below assumes you have first imported
-the "lammps" module in your Python script and its settings as
-follows:
+the "lammps" module in your Python script. You can also include its
+settings as follows, which are useful for testing return values from
+some of the methods described below:
from lammps import lammps from lammps import LMPINT as INT @@ -434,8 +395,10 @@ at the file src/library.cpp you will see that they correspond one-to-one with calls you can make to the LAMMPS library from a C++ or C or Fortran program. -lmp = lammps() # create a LAMMPS object -lmp = lammps(list) # ditto, with command-line args, list = ["-echo","screen"] +lmp = lammps() # create a LAMMPS object using the default liblmp.so library +lmp = lammps("g++") # create a LAMMPS object using the liblmp_g++.so library +lmp = lammps("",list) # ditto, with command-line args, list = ["-echo","screen"] +lmp = lammps("g++",list)lmp.close() # destroy a LAMMPS object@@ -443,16 +406,16 @@ lmp = lammps(list) # ditto, with command-line args, list = ["-echo","scree lmp.command(cmd) # invoke a single LAMMPS command, cmd = "run 100"xlo = lmp.extract_global(name,type) # extract a global quantity - # name = "boxxlo", "nlocal", etc + # name = "boxxlo", "nlocal", etc # type = INT or DOUBLEcoords = lmp.extract_atom(name,type) # extract a per-atom quantity - # name = "x", "type", etc + # name = "x", "type", etc # type = IPTR or DPTR or DPTRPTReng = lmp.extract_compute(id,style,type) # extract value(s) from a compute v3 = lmp.extract_fix(id,style,type,i,j) # extract value(s) from a fix - # id = ID of compute or fix + # id = ID of compute or fix # style = 0 = global data # 1 = per-atom data # 2 = local data @@ -473,12 +436,23 @@ lmp.put_coords(x) # set all atom coords via x
-The creation of a LAMMPS object does not take an MPI communicator as -an argument. There should be a way to do this, so that the LAMMPS -instance runs on a subset of processors, if desired, but I don't yet -know how from Pypar. So for now, it runs on MPI_COMM_WORLD, which is -all the processors. +
IMPORTANT NOTE: Currently, the creation of a LAMMPS object does not +take an MPI communicator as an argument. There should be a way to do +this, so that the LAMMPS instance runs on a subset of processors if +desired, but I don't know how to do it from Pypar. So for now, it +runs on MPI_COMM_WORLD, which is all the processors. If someone +figures out how to do this with one or more of the Python wrappers for +MPI, like Pypar, please let us know and we will amend these doc pages.
+Note that you can create multiple LAMMPS objects in your Python +script, and coordinate and run multiple simulations, e.g. +
+from lammps import lammps +lmp1 = lammps() +lmp2 = lammps() +lmp1.file("in.file1") +lmp2.file("in.file2") +The file() and command() methods allow an input script or single commands to be invoked.
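For example, a script can combine file() and command() with the
data-access methods listed above (a sketch; it assumes the in.lj
benchmark script and that the DOUBLE setting is imported analogously
to INT, as shown earlier):

from lammps import lammps
from lammps import LMPINT as INT
from lammps import LMPDOUBLE as DOUBLE   # assumed name, see lead-in

lmp = lammps()
lmp.file("in.lj")            # set the problem up and run it from a script
for i in range(3):
    lmp.command("run 100")   # then advance it in 100-timestep chunks
nlocal = lmp.extract_global("nlocal",INT)     # atoms owned by this processor
boxxlo = lmp.extract_global("boxxlo",DOUBLE)  # lower x bound of the box
print "nlocal =", nlocal, "boxxlo =", boxxlo
lmp.close()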
@@ -588,15 +562,10 @@ following steps:
These are the Python scripts included as demos in the python/examples
directory of the LAMMPS distribution, to illustrate the kinds of
diff --git a/doc/Section_python.txt b/doc/Section_python.txt
index 89290a4a85..7386e8b53e 100644
--- a/doc/Section_python.txt
+++ b/doc/Section_python.txt
@@ -11,174 +11,152 @@ This section describes how to build and use LAMMPS via a Python
interface.
-11.1 "Extending Python with a serial version of LAMMPS"_#py_1
-11.2 "Creating a shared MPI library"_#py_2
-11.3 "Extending Python with a parallel version of LAMMPS"_#py_3
-11.4 "Extending Python with MPI"_#py_4
-11.5 "Testing the Python-LAMMPS interface"_#py_5
-11.6 "Using LAMMPS from Python"_#py_6
-11.7 "Example Python scripts that use LAMMPS"_#py_7 :ul
+11.1 "Setting necessary environment variables"_#py_1
+11.2 "Building LAMMPS as a shared library"_#py_2
+11.3 "Extending Python with MPI to run in parallel"_#py_3
+11.4 "Testing the Python-LAMMPS interface"_#py_4
+11.5 "Using LAMMPS from Python"_#py_5
+11.6 "Example Python scripts that use LAMMPS"_#py_6 :ul
-The LAMMPS distribution includes some Python code in its python
-directory which wraps the library interface to LAMMPS. This makes it
-is possible to run LAMMPS, invoke LAMMPS commands or give it an input
-script, extract LAMMPS results, an modify internal LAMMPS variables,
-either from a Python script or interactively from a Python prompt.
+The LAMMPS distribution includes the file python/lammps.py which wraps
+the library interface to LAMMPS. This file makes it possible to
+run LAMMPS, invoke LAMMPS commands or give it an input script, extract
+LAMMPS results, and modify internal LAMMPS variables, either from a
+Python script or interactively from a Python prompt. You can do the
+former in serial or parallel. Running Python interactively in
+parallel does not generally work, unless you have a package installed
+that extends your Python to enable multiple instances of Python to
+read what you type.
"Python"_http://www.python.org is a powerful scripting and programming language which can be used to wrap software like LAMMPS and other packages. It can be used to glue multiple pieces of software
-together, e.g. to run a coupled or multiscale model. See "this
+together, e.g. to run a coupled or multiscale model. See "Section
section"_Section_howto.html#howto_10 of the manual and the couple
directory of the distribution for more ideas about coupling LAMMPS
to other codes. See "Section_start 4"_Section_start.html#start_5 about
-how to build LAMMPS as a library, and "this
-section"_Section_howto.html#howto_19 for a description of the library
+how to build LAMMPS as a library, and "Section_howto
+19"_Section_howto.html#howto_19 for a description of the library
interface provided in src/library.cpp and src/library.h and how to
extend it for your needs. As described below, that interface is what
is exposed to Python. It is designed to be easy to add functions to.
-This has the effect of extending the Python inteface as well. See
-details below.
+This makes it easy to extend the Python interface as well. See details
+below.
-By using the Python interface LAMMPS can also be coupled with a GUI or
-visualization tools that display graphs or animations in real time as
-LAMMPS runs. Examples of such scripts are inlcluded in the python
-directory.
+By using the Python interface, LAMMPS can also be coupled with a GUI
+or other visualization tools that display graphs or animations in real
+time as LAMMPS runs. Examples of such scripts are included in the
+python directory.
-Two advantages of using Python are how concise the language is and
+Two advantages of using Python are how concise the language is, and
that it can be run interactively, enabling rapid development and
debugging of programs. If you use it to mostly invoke costly
operations within LAMMPS, such as running a simulation for a
reasonable number of timesteps, then the overhead cost of invoking
LAMMPS thru Python will be negligible.
-Before using LAMMPS from a Python script, the Python on your machine
-must be "extended" to include an interface to the LAMMPS library. If
-your Python script will invoke MPI operations, you will also need to
-extend your Python with an interface to MPI itself.
+Before using LAMMPS from a Python script, you have to do two things:
+set two environment variables, and build LAMMPS as a dynamic shared
+library so it can be loaded by Python. Both these steps are discussed
+below. If you wish to run LAMMPS in
+parallel from Python, you also need to extend your Python with MPI.
+This is also discussed below.
-Thus you should first decide how you intend to use LAMMPS from Python.
-There are 3 options:
-
-(1) Use LAMMPS on a single processor running Python.
-
-(2) Use LAMMPS in parallel, where each processor runs Python, but your
-Python program does not use MPI.
-
-(3) Use LAMMPS in parallel, where each processor runs Python, and your
-Python script also makes MPI calls through a Python/MPI interface.
-
-Note that for (2) and (3) you will not be able to use Python
-interactively by typing commands and getting a response. This is
-because you will have multiple instances of Python running (e.g. on a
-parallel machine) and they cannot all read what you type.
-
-Working in mode (1) does not require your machine to have MPI
-installed. You should extend your Python with a serial version of
-LAMMPS and the dummy MPI library provided with LAMMPS. See
-instructions below on how to do this.
-
-Working in mode (2) requires your machine to have an MPI library
-installed, but your Python does not need to be extended with MPI
-itself. The MPI library must be a shared library (e.g. a *.so file on
-Linux) which is not typically created when MPI is built/installed.
-See instruction below on how to do this. You should extend your
-Python with the a parallel versionn of LAMMPS which will use the
-shared MPI system library. See instructions below on how to do this.
-
-Working in mode (3) requires your machine to have MPI installed (as a
-shared library as in (2)). You must also extend your Python with a
-parallel version of LAMMPS (same as in (2)) and with MPI itself, via
-one of several available Python/MPI packages. See instructions below
-on how to do the latter task.
-
-Several of the following sub-sections cover the rest of the Python
-setup discussion. The next to last sub-section describes the Python
-syntax used to invoke LAMMPS. The last sub-section describes example
-Python scripts included in the python directory.
-
-Before proceeding, there are 2 items to note.
-
-(1) The provided Python wrapper for LAMMPS uses the amazing and
-magical (to me) "ctypes" package in Python, which auto-generates the
-interface code needed between Python and a set of C interface routines
-for a library. Ctypes is part of standard Python for versions 2.5 and
-later. You can check which version of Python you have installed, by
-simply typing "python" at a shell prompt.
-
-(2) Any library wrapped by Python, including LAMMPS, must be built as
-a shared library (e.g.
a *.so file on Linux and not a *.a file). The -python/setup_serial.py and setup.py scripts do this build for LAMMPS -itself (described below). But if you have LAMMPS configured to use -additional packages that have their own libraries, then those -libraries must also be shared libraries. E.g. MPI, FFTW, or any of -the libraries in lammps/lib. When you build LAMMPS as a stand-alone -code, you are not building shared versions of these libraries. - -The discussion below describes how to create a shared MPI library. I -suggest you start by configuing LAMMPS without packages installed that -require any libraries besides MPI. See "this -section"_Section_start.html#start_3 of the manual for a discussion of -LAMMPS packages. E.g. do not use the KSPACE, GPU, MEAM, POEMS, or -REAX packages. - -If you are successfully follow the steps belwo to build the Python -wrappers and use this version of LAMMPS through Python, you can then -take the next step of adding LAMMPS packages that use additional -libraries. This will require you to build a shared library for that -package's library, similar to what is described below for MPI. It -will also require you to edit the python/setup_serial.py or setup.py -scripts to enable Python to access those libraries when it builds the -LAMMPS wrapper. +The Python wrapper for LAMMPS uses the amazing and magical (to me) +"ctypes" package in Python, which auto-generates the interface code +needed between Python and a set of C interface routines for a library. +Ctypes is part of standard Python for versions 2.5 and later. You can +check which version of Python you have installed, by simply typing +"python" at a shell prompt. :line :line -11.1 Extending Python with a serial version of LAMMPS :link(py_1),h4 +11.1 Setting necessary environment variables :link(py_1),h4 -From the python directory in the LAMMPS distribution, type +For Python to use the LAMMPS interface, it needs to find two files. +The paths to these files need to be added to two environment variables +that Python checks. -python setup_serial.py build :pre +The first is the environment variable PYTHONPATH. It needs +to include the directory where the python/lammps.py file is. -and then one of these commands: +For the csh or tcsh shells, you could add something like this to your +~/.cshrc file: -sudo python setup_serial.py install -python setup_serial.py install --home=~/foo :pre +setenv PYTHONPATH ${PYTHONPATH}:/home/sjplimp/lammps/python :pre -The "build" command should compile all the needed LAMMPS files, -including its dummy MPI library. The first "install" command will put -the needed files in your Python's site-packages sub-directory, so that -Python can load them. For example, if you installed Python yourself -on a Linux machine, it would typically be somewhere like -/usr/local/lib/python2.5/site-packages. Installing Python packages -this way often requires you to be able to write to the Python -directories, which may require root priveleges, hence the "sudo" -prefix. If this is not the case, you can drop the "sudo". If you use -the "sudo" prefix and you have installed Python yourself, you should -make sure that root uses the same Python as the one you did the -"install" in. E.g. these 2 commands may do the install in different -Python versions: +The second is the environment variable LD_LIBRARY_PATH, which is used +by the operating system to find dynamic shared libraries when it loads +them. It needs to include the directory where the shared LAMMPS +library will be. 
Normally this is the LAMMPS src dir, as explained in
+the following section.
-python setup_serial.py install --home=~/foo
-python /usr/local/bin/python/setup_serial.py install --home=~/foo :pre
+For the csh or tcsh shells, you could add something like this to your
+~/.cshrc file:
-Alternatively, you can install the LAMMPS files (or any other Python
-packages) in your own user space. The second "install" command does
-this, where you should replace "foo" with your directory of choice.
+setenv LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/home/sjplimp/lammps/src :pre
-If these commands are successful, a {lammps.py} and
-{_lammps_serial.so} file will be put in the appropriate directory.
+As discussed below, if your LAMMPS build includes auxiliary libraries,
+they must also be available as shared libraries for Python to
+successfully load LAMMPS. If they are not in default places where the
+operating system can find them, then you also have to add their paths
+to the LD_LIBRARY_PATH environment variable.
+
+For example, if you are using the dummy MPI library provided in
+src/STUBS, you need to add something like this to your ~/.cshrc file:
+
+setenv LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/home/sjplimp/lammps/src/STUBS :pre
+
+If you are using the LAMMPS USER-ATC package, you need to add
+something like this to your ~/.cshrc file:
+
+setenv LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/home/sjplimp/lammps/lib/atc :pre
:line
-11.2 Creating a shared MPI library :link(py_2),h4
+11.2 Building LAMMPS as a shared library :link(py_2),h4
-A shared library is one that is dynamically loadable, which is what
-Python requires. On Linux this is a library file that ends in ".so",
-not ".a". Such a shared library is normally not built if you
-installed MPI yourself, but it is easy to do. Here is how to do it
-for "MPICH"_mpich, a popular open-source version of MPI, distributed
-by Argonne National Labs. From within the mpich directory, type
+Instructions on how to build LAMMPS as a shared library are given in
+"Section_start 5"_Section_start.html#start_5. A shared library is one
+that is dynamically loadable, which is what Python requires. On Linux
+this is a library file that ends in ".so", not ".a".
+
+From the src directory, type
+
+make makeshlib
+make -f Makefile.shlib foo
+
+where foo is the machine target name, such as linux or g++ or serial.
+This should create the file liblmp_foo.so in the src directory, as
+well as a soft link liblmp.so which is what the Python wrapper will
+load by default. If you are building multiple machine versions of the
+shared library, the soft link is always set to the most recently built
+version.
+
+Note that as discussed below, a LAMMPS build may depend on several
+auxiliary libraries, which are specified in your low-level
+src/Makefile.foo file. For example, an MPI library, the FFTW library,
+a JPEG library, etc. Depending on what LAMMPS packages you have
+installed, the build may also require additional libraries from the
+lib directories, such as lib/atc/libatc.so or lib/reax/libreax.so.
+
+You must ensure that each of these libraries exists in shared library
+form (*.so file for Linux systems), or either the LAMMPS shared
+library build or the Python load of the library will fail. For the
+load to be successful, all the shared libraries must also be in
+directories that the operating system checks. See the discussion in
+the preceding section about the LD_LIBRARY_PATH environment variable
+for how to ensure this.
+
+Note that some system libraries, such as an MPI library you installed
+yourself, may not be built as shared libraries by default. The build
+instructions for the library should tell you how to do this.
+
+For example, here is how to build and install the "MPICH
+library"_mpich, a popular open-source version of MPI, distributed by
+Argonne National Labs, as a shared library in the default
+/usr/local/lib location:
:link(mpich,http://www-unix.mcs.anl.gov/mpi)
@@ -186,61 +164,25 @@ by Argonne National Labs. From within the mpich directory, type
make
make install :pre
-You may need to use "sudo make install" in place of the last line.
-The end result should be the file libmpich.so in /usr/local/lib.
+You may need to use "sudo make install" in place of the last line if
+you do not have write privileges for /usr/local/lib. The end result
+should be the file /usr/local/lib/libmpich.so.
-IMPORTANT NOTE: If the file libmpich.a already exists in your
-installation directory (e.g. /usr/local/lib), you will now have both a
-static and shared MPI library. This will be fine for running LAMMPS
-from Python since it only uses the shared library. But if you now try
-to build LAMMPS by itself as a stand-alone program (cd lammps/src;
-make foo) or build other codes that expect to link against libmpich.a,
-then those builds may fail if the linker uses libmpich.so instead. If
-this happens, it means you will need to remove the file
-/usr/local/lib/libmich.so before building LAMMPS again as a
-stand-alone code.
+Note that not all of the auxiliary libraries provided with LAMMPS have
+shared-library Makefiles in their lib directories. Typically this
+simply requires a Makefile.foo that adds a -fPIC switch when files are
+compiled and "-fPIC -shared" switches when the library is linked
+with a C++ (or Fortran) compiler, as well as an output target that
+ends in ".so", like libatc.so. As we or others create and contribute
+these Makefiles, we will add them to the LAMMPS distribution.
:line
-11.3 Extending Python with a parallel version of LAMMPS :link(py_3),h4
+11.3 Extending Python with MPI to run in parallel :link(py_3),h4
-From the python directory, type
-
-python setup.py build :pre
-
-and then one of these commands:
-
-sudo python setup.py install
-python setup.py install --home=~/foo :pre
-
-The "build" command should compile all the needed LAMMPS C++ files,
-which will require MPI to be installed on your system. This means it
-must find both the header file mpi.h and a shared library file,
-e.g. libmpich.so if the MPICH version of MPI is installed. See the
-preceding section for how to create a shared library version of MPI if
-it does not exist. You may need to adjust the "include_dirs" and
-"library_dirs" and "libraries" fields in python/setup.py to
-insure the Python build finds all the files it needs.
-
-The first "install" command will put the needed files in your Python's
-site-packages sub-directory, so that Python can load them. For
-example, if you installed Python yourself on a Linux machine, it would
-typically be somewhere like /usr/local/lib/python2.5/site-packages.
-Installing Python packages this way often requires you to be able to
-write to the Python directories, which may require root priveleges,
-hence the "sudo" prefix. If this is not the case, you can drop the
-"sudo".
-
-Alternatively, you can install the LAMMPS files (or any other Python
-packages) in your own user space. The second "install" command does
-this, where you should replace "foo" with your directory of choice.
-
-If these commands are successful, a {lammps.py} and {_lammps.so} file
-will be put in the appropriate directory.
-
-:line
-
-11.4 Extending Python with MPI :link(py_4),h4
+If you wish to run LAMMPS in parallel from Python, you need to extend
+your Python with an interface to MPI. This also allows you to
+make MPI calls directly from your Python script, if you desire.
There are several Python packages available that purport to wrap MPI
as a library and allow MPI functions to be called from Python.
@@ -256,26 +198,26 @@ These include
All of these except pyMPI work by wrapping the MPI library (which must
be available on your system as a shared library, as discussed above),
and exposing (some portion of) its interface to your Python script.
-This means they cannot be used interactively in parallel, since they
+This means Python cannot be used interactively in parallel, since they
do not address the issue of interactive input to multiple instances
of Python running on different processors. The one exception is
pyMPI, which alters the Python interpreter to address this issue, and (I
-believe) creates a new alternate executable (in place of python
+believe) creates a new alternate executable (in place of "python"
itself) as a result.
In principle any of these Python/MPI packages should work to invoke
-both calls to LAMMPS and MPI itself from a Python script running in
-parallel. However, when I downloaded and looked at a few of them,
-their docuemtation was incomplete and I had trouble with their
-installation. It's not clear if some of the packages are still being
-actively developed and supported.
+LAMMPS in parallel, and to make MPI calls directly, from a Python
+script which is itself running in parallel. However, when I downloaded
+and looked at a few of them, their documentation was incomplete and I
+had trouble with their installation. It's not clear if some of the
+packages are still being actively developed and supported.
The one I recommend, since I have successfully used it with LAMMPS, is
Pypar. Pypar requires the ubiquitous "Numpy
package"_http://numpy.scipy.org be installed in your Python. After
launching python, type
->>> import numpy :pre
+import numpy :pre
to see if it is installed. If not, here is how to install it (version
1.3.0b1 as of April 2009). Unpack the numpy tarball and from its
@@ -299,105 +241,123 @@ your Python distribution's site-packages directory.
If you have successully installed Pypar, you should be able to run
python serially and type
->>> import pypar :pre
+import pypar :pre
without error. You should also be able to run python in parallel
on a simple test script
-% mpirun -np 4 python test.script :pre
+% mpirun -np 4 python test.py :pre
-where test.script contains the lines
+where test.py contains the lines
import pypar
print "Proc %d out of %d procs" % (pypar.rank(),pypar.size()) :pre
-and see one line of output for each processor you ran on.
+and see one line of output for each processor you run on.
:line
-11.5 Testing the Python-LAMMPS interface :link(py_5),h4
+11.4 Testing the Python-LAMMPS interface :link(py_4),h4
-Before using LAMMPS in a Python program, one more step is needed. The
-interface to LAMMPS is via the Python ctypes package, which loads the
-shared LAMMPS library via a CDLL() call, which in turn is a wrapper on
-the C-library dlopen().
This command is different than a normal
-Python "import" and needs to be able to find the LAMMPS shared
-library, which is either in the Python site-packages directory or in a
-local directory you specified in the "python setup.py install"
-command, as described above.
-
-The simplest way to do this is add a line like this to your
-.cshrc or other shell start-up file.
-
-setenv LD_LIBRARY_PATH
-$\{LD_LIBRARY_PATH\}:/usr/local/lib/python2.5/site-packages :pre
-
-and then execute the shell file to insure the path has been updated.
-This will extend the path that dlopen() uses to look for shared
-libraries.
-
-To test if the serial LAMMPS library has been successfully installed
-(mode 1 above), launch Python and type
+To test if LAMMPS is callable from Python, launch Python interactively
+and type:
>>> from lammps import lammps
>>> lmp = lammps() :pre
-If you get no errors, you're ready to use serial LAMMPS from Python.
+If you get no errors, you're ready to use LAMMPS from Python.
+If the load fails, the most common error to see is
-If you built LAMMPS for parallel use (mode 2 or 3 above), launch
-Python in parallel:
+"CDLL: asdfasdfasdf"
-% mpirun -np 4 python test.script :pre
+which means Python was unable to load the LAMMPS shared library. This
+can occur if it can't find the LAMMPS library; see the environment
+variable discussion "above"_#py_1. It can also occur if it can't find
+one of the auxiliary libraries specified in the LAMMPS build in a
+shared dynamic library format. This includes all libraries needed by
+main LAMMPS (e.g. MPI or FFTW or JPEG), system libraries needed by
+main LAMMPS (e.g. extra libs needed by MPI), or packages you have
+installed that require libraries provided with LAMMPS (e.g. the
+USER-ATC package requires lib/atc/libatc.so) or system libraries
+(e.g. BLAS or Fortran-to-C libraries) listed in the
+lib/package/Makefile.lammps file. Again, all of these must be
+available as shared libraries, or the Python load will fail.
-where test.script contains the lines
+Python (actually the operating system) isn't verbose about telling you
+why the load failed, so go through the steps above and in
+"Section_start 5"_Section_start.html#start_5 carefully.
+
+[Test LAMMPS and Python in serial:] :h5
+
+To run a LAMMPS test in serial, type these lines into Python
+interactively from the bench directory:
+
+>>> from lammps import lammps
+>>> lmp = lammps()
+>>> lmp.file("in.lj") :pre
+
+Or put the same lines in the file test.py and run it as
+
+% python test.py :pre
+
+Either way, you should see the results of running the in.lj benchmark
+on a single processor appear on the screen, the same as if you had
+typed something like:
+
+lmp_g++ < in.lj :pre
+
+[Test LAMMPS and Python in parallel:] :h5
+
+To run LAMMPS in parallel, assuming you have installed the
+"Pypar"_http://datamining.anu.edu.au/~ole/pypar package as discussed
+above, create a test.py file containing these lines:
import pypar
from lammps import lammps
lmp = lammps()
-print "Proc %d out of %d procs has" % (pypar.rank(),pypar.size()), lmp
+lmp.file("in.lj")
+print "Proc %d out of %d procs has" % (pypar.rank(),pypar.size()),lmp
pypar.finalize() :pre
-Again, if you get no errors, you're good to go.
-Note that if you left out the "import pypar" line from this script,
-you would instantiate and run LAMMPS independently on each of the P
-processors specified in the mpirun command.
You can test if Pypar is
-enabling true parallel Python and LAMMPS by adding a line to the above
-sequence of commands like lmp.file("in.lj") to run an input script and
-see if the LAMMPS run says it ran on P processors or if you get output
-from P duplicated 1-processor runs written to the screen. In the
-latter case, Pypar is not working correctly.
+% mpirun -np 4 python test.py :pre
-Note that this line:
+and you should see the same output as if you had typed
-from lammps import lammps :pre
+% mpirun -np 4 lmp_g++ < in.lj :pre
-will import either the serial or parallel version of the LAMMPS
-library, as wrapped by lammps.py. But if you installed both via
-setup_serial.py and setup.py, it will always import the parallel
-version, since it attempts that first.
+Note that if you leave out the 3 lines from test.py that specify Pypar
+commands, you will instantiate and run LAMMPS independently on each of
+the P processors specified in the mpirun command. In this case you
+should get 4 sets of output, each showing that a run was made on a
+single processor, instead of one set of output showing that it ran on
+4 processors. If the 1-processor outputs occur, it means that Pypar
+is not working correctly.
-Note that if your Python script imports the Pypar package (as above),
-so that it can use MPI calls directly, then Pypar initializes MPI for
-you. Thus the last line of your Python script should be
-pypar.finalize(), to insure MPI is shut down correctly.
+Also note that once you import the Pypar module, Pypar initializes MPI
+for you, and you can use MPI calls directly in your Python script, as
+described in the Pypar documentation. The last line of your Python
+script should be pypar.finalize(), to ensure MPI is shut down
+correctly.
-Also note that a Python script can be invoked in one of several ways:
+Note that any Python script (not just for LAMMPS) can be invoked in
+one of several ways:
% python foo.script
% python -i foo.script
-% foo.script
+% foo.script :pre
The last command requires that the first line of the script be
something like this:
#!/usr/local/bin/python
-#!/usr/local/bin/python -i
+#!/usr/local/bin/python -i :pre
where the path points to where you have Python installed, and that
you have made the script file executable:
-% chmod +x foo.script
+% chmod +x foo.script :pre
Without the "-i" flag, Python will exit when the script finishes.
With the "-i" flag, you will be left in the Python interpreter when
the script finishes, so you can type subsequent commands. As
@@ -408,14 +368,15 @@ Python on a single processor, not in parallel.
:line
:line
-11.6 Using LAMMPS from Python :link(py_6),h4
+11.5 Using LAMMPS from Python :link(py_5),h4
The Python interface to LAMMPS consists of a Python "lammps" module,
the source code for which is in python/lammps.py, which creates a
"lammps" object, with a set of methods that can be invoked on that
object. The sample Python code below assumes you have first imported
-the "lammps" module in your Python script and its settings as
-follows:
+the "lammps" module in your Python script. You can also include its
+settings as follows, which are useful for testing return values from
+some of the methods described below:
from lammps import lammps
from lammps import LMPINT as INT
@@ -429,8 +390,10 @@ at the file src/library.cpp
you will see that they correspond one-to-one with calls you can make
to the LAMMPS library from a C++ or C or Fortran program.
-lmp = lammps() # create a LAMMPS object -lmp = lammps(list) # ditto, with command-line args, list = \["-echo","screen"\] :pre +lmp = lammps() # create a LAMMPS object using the default liblmp.so library +lmp = lammps("g++") # create a LAMMPS object using the liblmp_g++.so library +lmp = lammps("",list) # ditto, with command-line args, list = \["-echo","screen"\] +lmp = lammps("g++",list) :pre lmp.close() # destroy a LAMMPS object :pre @@ -438,16 +401,16 @@ lmp.file(file) # run an entire input script, file = "in.lj" lmp.command(cmd) # invoke a single LAMMPS command, cmd = "run 100" :pre xlo = lmp.extract_global(name,type) # extract a global quantity - # name = "boxxlo", "nlocal", etc + # name = "boxxlo", "nlocal", etc # type = INT or DOUBLE :pre coords = lmp.extract_atom(name,type) # extract a per-atom quantity - # name = "x", "type", etc + # name = "x", "type", etc # type = IPTR or DPTR or DPTRPTR :pre eng = lmp.extract_compute(id,style,type) # extract value(s) from a compute v3 = lmp.extract_fix(id,style,type,i,j) # extract value(s) from a fix - # id = ID of compute or fix + # id = ID of compute or fix # style = 0 = global data # 1 = per-atom data # 2 = local data @@ -468,11 +431,22 @@ lmp.put_coords(x) # set all atom coords via x :pre :line -The creation of a LAMMPS object does not take an MPI communicator as -an argument. There should be a way to do this, so that the LAMMPS -instance runs on a subset of processors, if desired, but I don't yet -know how from Pypar. So for now, it runs on MPI_COMM_WORLD, which is -all the processors. +IMPORTANT NOTE: Currently, the creation of a LAMMPS object does not +take an MPI communicator as an argument. There should be a way to do +this, so that the LAMMPS instance runs on a subset of processors if +desired, but I don't know how to do it from Pypar. So for now, it +runs on MPI_COMM_WORLD, which is all the processors. If someone +figures out how to do this with one or more of the Python wrappers for +MPI, like Pypar, please let us know and we will amend these doc pages. + +Note that you can create multiple LAMMPS objects in your Python +script, and coordinate and run multiple simulations, e.g. + +from lammps import lammps +lmp1 = lammps() +lmp2 = lammps() +lmp1.file("in.file1") +lmp2.file("in.file2") :pre The file() and command() methods allow an input script or single commands to be invoked. @@ -583,15 +557,10 @@ following steps: Add a new interface function to src/library.cpp and src/library.h. :ulb,l -Verify the new function is syntactically correct by building LAMMPS as -a library - see "Section_start 4"_Section_start.html#start_5 of the -manual. :l +Rebuild LAMMPS as a shared library. :l -Add a wrapper method in the Python LAMMPS module to python/lammps.py -for this interface function. :l - -Rebuild the Python wrapper via python/setup_serial.py or -python/setup.py. :l +Add a wrapper method to python/lammps.py for this interface +function. :l You should now be able to invoke the new interface function from a Python script. Isn't ctypes amazing? :l,ule @@ -599,7 +568,7 @@ Python script. Isn't ctypes amazing? 
:l,ule :line :line -11.7 Example Python scripts that use LAMMPS :link(py_7),h4 +11.6 Example Python scripts that use LAMMPS :link(py_6),h4 These are the Python scripts included as demos in the python/examples directory of the LAMMPS distribution, to illustrate the kinds of diff --git a/doc/Section_start.html b/doc/Section_start.html index eef8e51a83..f16a963e6b 100644 --- a/doc/Section_start.html +++ b/doc/Section_start.html @@ -44,7 +44,6 @@ sub-directories:
LAMMPS itself can be built as a library, which can then be called from -another application or a scripting language. See this -section for more info on coupling LAMMPS -to other codes. Building LAMMPS as a library is done by typing +
LAMMPS can be built as either a static or shared library, which can +then be called from another application or a scripting language. See +this section for more info on coupling +LAMMPS to other codes. See this section for +more info on wrapping and running LAMMPS from Python. +
+To build LAMMPS as a static library (*.a file on Linux), type
make makelib make -f Makefile.lib foo-
where foo is the machine name. Note that inclusion or exclusion of -any desired optional packages should be done before typing "make -makelib". The first "make" command will create a current Makefile.lib -with all the file names in your src dir. The 2nd "make" command will -use it to build LAMMPS as a library. This requires that Makefile.foo -have a library target (lib) and system-specific settings for ARCHIVE -and ARFLAGS. See Makefile.linux for an example. The build will -create the file liblmp_foo.a which another application can link to. +
where foo is the machine name. This kind of library is typically used +to statically link a driver application to all of LAMMPS, so that you +can insure all dependencies are satisfied at compile time. Note that +inclusion or exclusion of any desired optional packages should be done +before typing "make makelib". The first "make" command will create a +current Makefile.lib with all the file names in your src dir. The 2nd +"make" command will use it to build LAMMPS as a static library, using +the ARCHIVE and ARFLAGS settings in src/MAKE/Makefile.foo. The build +will create the file liblmp_foo.a which another application can link +to.
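As a concrete illustration, these are the link lines from
examples/COUPLE/simple/README (reproduced later in this diff) for
building a C++ driver against the static library; the paths and the
extra MPICH libs (-lmpich -lmpl) are machine-specific assumptions:

g++ -I/home/sjplimp/lammps/src -c simple.cpp
g++ -L/home/sjplimp/lammps/src simple.o \
    -llmp_g++ -lfftw -lmpich -lmpl -lpthread -o simpleCC :pre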
-When used from a C++ program, the library allows one or more LAMMPS -objects to be instantiated. All of LAMMPS is wrapped in a LAMMPS_NS +
To build LAMMPS as a shared library (*.so file on Linux), which can be +dynamically loaded, type +
+make makeshlib +make -f Makefile.shlib foo ++
where foo is the machine name. This kind of library is required when
+wrapping LAMMPS with Python; see Section_python
+for details. Again, note that inclusion or exclusion of any desired
+optional packages should be done before typing "make makeshlib". The
+first "make" command will create a current Makefile.shlib with all the
+file names in your src dir. The 2nd "make" command will use it to
+build LAMMPS as a shared library, using the SHFLAGS and SHLIBFLAGS
+settings in src/MAKE/Makefile.foo. The build will create the file
+liblmp_foo.so which another application can link to dynamically, as
+well as a soft link liblmp.so, which the Python wrapper uses by
+default.
+
+
Note that for a shared library to be usable by a calling program, all
+the auxiliary libraries it depends on must also exist as shared
+libraries, and be find-able by the operating system. Else you will
+get a run-time error when the shared library is loaded. For LAMMPS,
+this includes all libraries needed by main LAMMPS (e.g. MPI or FFTW or
+JPEG), system libraries needed by main LAMMPS (e.g. extra libs needed
+by MPI), or packages you have installed that require libraries
+provided with LAMMPS (e.g. the USER-ATC package requires
+lib/atc/libatc.so) or system libraries (e.g. BLAS or Fortran-to-C
+libraries) listed in the lib/package/Makefile.lammps file. See the
+discussion about the LAMMPS shared library in
+Section_python for details about how to build
+shared versions of these libraries, and how to insure the operating
+system can find them, by setting the LD_LIBRARY_PATH environment
+variable correctly.
+
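One way to check this requirement on Linux is the ldd tool, which
lists the shared libraries a *.so file depends on; this is a sketch,
assuming a g++ build and the csh-style syntax used elsewhere in these
docs:

% ldd src/liblmp_g++.so
% setenv LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/home/sjplimp/lammps/src :pre

Any dependency that ldd reports as "not found" must be built as a
shared library and made visible via LD_LIBRARY_PATH before the load
will succeed.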
+Either flavor of library allows one or more LAMMPS objects to be +instantiated from the calling program. +
+When used from a C++ program, all of LAMMPS is wrapped in a LAMMPS_NS namespace; you can safely use any of its classes and methods from -within your application code, as needed. +within the calling code, as needed.
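For example, a minimal C++ driver in the style of
examples/COUPLE/simple/simple.cpp might look like the sketch below;
the constructor arguments and the input->file() call are assumptions
modeled on that example, not a definitive API reference:

#include "mpi.h"
#include "lammps.h"
#include "input.h"

using namespace LAMMPS_NS;

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);

  // instantiate LAMMPS on all the processors of MPI_COMM_WORLD
  LAMMPS *lmp = new LAMMPS(0,NULL,MPI_COMM_WORLD);
  lmp->input->file("in.lj");     // run an input script
  delete lmp;                    // shut the instance down cleanly

  MPI_Finalize();
  return 0;
} :pre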
-When used from a C or Fortran program or a scripting language, the -library has a simple function-style interface, provided in +
When used from a C or Fortran program or a scripting language like +Python, the library has a simple function-style interface, provided in src/library.cpp and src/library.h.
-See the sample codes couple/simple/simple.cpp and simple.c as examples -of C++ and C codes that invoke LAMMPS thru its library interface. -There are other examples as well in the couple directory which are -discussed in Section_howto 10 of the -manual. See Section_python of the manual for a -description of the Python wrapper provided with LAMMPS that operates -through the LAMMPS library interface. +
See the sample codes in examples/COUPLE/simple for examples of C++ and +C codes that invoke LAMMPS thru its library interface. There are +other examples as well in the COUPLE directory which are discussed in +Section_howto 10 of the manual. See +Section_python of the manual for a description +of the Python wrapper provided with LAMMPS that operates through the +LAMMPS library interface.
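For reference, a bare-bones C driver using the function-style
interface might look like this sketch; lammps_open_no_mpi() appears in
the python/lammps.py changes later in this diff, and the other calls
mirror the file/command/close methods, but treat the exact signatures
shown here as assumptions to be checked against src/library.h:

#include <mpi.h>
#include "library.h"    /* the C-style interface to LAMMPS */

int main(int argc, char **argv)
{
  void *lmp;

  MPI_Init(&argc,&argv);
  lammps_open(0,NULL,MPI_COMM_WORLD,&lmp);  /* or lammps_open_no_mpi() */
  lammps_file(lmp,"in.lj");                 /* run an input script */
  lammps_command(lmp,"run 100");            /* invoke a single command */
  lammps_close(lmp);
  MPI_Finalize();
  return 0;
} :pre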
The files src/library.cpp and library.h contain the C-style interface
to LAMMPS. See Section_howto 19 of the
diff --git a/doc/Section_start.txt b/doc/Section_start.txt
index 8469a82dc1..5cd6d9febf 100644
--- a/doc/Section_start.txt
+++ b/doc/Section_start.txt
@@ -39,7 +39,6 @@ sub-directories:
README: text file
LICENSE: the GNU General Public License (GPL)
bench: benchmark problems
-couple: code coupling examples, using LAMMPS as a library
doc: documentation
examples: simple test problems
potentials: embedded atom method (EAM) potential files
@@ -768,39 +767,80 @@ input scripts.
2.5 Building LAMMPS as a library :h4,link(start_5)
-LAMMPS itself can be built as a library, which can then be called from
-another application or a scripting language. See "this
-section"_Section_howto.html#howto_10 for more info on coupling LAMMPS
-to other codes. Building LAMMPS as a library is done by typing
+LAMMPS can be built as either a static or shared library, which can
+then be called from another application or a scripting language. See
+"this section"_Section_howto.html#howto_10 for more info on coupling
+LAMMPS to other codes. See "this section"_Section_python.html for
+more info on wrapping and running LAMMPS from Python.
+
+To build LAMMPS as a static library (*.a file on Linux), type
make makelib
make -f Makefile.lib foo :pre
-where foo is the machine name. Note that inclusion or exclusion of
-any desired optional packages should be done before typing "make
-makelib". The first "make" command will create a current Makefile.lib
-with all the file names in your src dir. The 2nd "make" command will
-use it to build LAMMPS as a library. This requires that Makefile.foo
-have a library target (lib) and system-specific settings for ARCHIVE
-and ARFLAGS. See Makefile.linux for an example. The build will
-create the file liblmp_foo.a which another application can link to.
+where foo is the machine name. This kind of library is typically used
+to statically link a driver application to all of LAMMPS, so that you
+can insure all dependencies are satisfied at compile time. Note that
+inclusion or exclusion of any desired optional packages should be done
+before typing "make makelib". The first "make" command will create a
+current Makefile.lib with all the file names in your src dir. The 2nd
+"make" command will use it to build LAMMPS as a static library, using
+the ARCHIVE and ARFLAGS settings in src/MAKE/Makefile.foo. The build
+will create the file liblmp_foo.a which another application can link
+to.
-When used from a C++ program, the library allows one or more LAMMPS
-objects to be instantiated. All of LAMMPS is wrapped in a LAMMPS_NS
+To build LAMMPS as a shared library (*.so file on Linux), which can be
+dynamically loaded, type
+
+make makeshlib
+make -f Makefile.shlib foo :pre
+
+where foo is the machine name. This kind of library is required when
+wrapping LAMMPS with Python; see "Section_python"_Section_python.html
+for details. Again, note that inclusion or exclusion of any desired
+optional packages should be done before typing "make makeshlib". The
+first "make" command will create a current Makefile.shlib with all the
+file names in your src dir. The 2nd "make" command will use it to
+build LAMMPS as a shared library, using the SHFLAGS and SHLIBFLAGS
+settings in src/MAKE/Makefile.foo. The build will create the file
+liblmp_foo.so which another application can link to dynamically, as
+well as a soft link liblmp.so, which the Python wrapper uses by
+default.
+
+Note that for a shared library to be usable by a calling program, all
+the auxiliary libraries it depends on must also exist as shared
+libraries, and be find-able by the operating system. Else you will
+get a run-time error when the shared library is loaded. For LAMMPS,
+this includes all libraries needed by main LAMMPS (e.g. MPI or FFTW or
+JPEG), system libraries needed by main LAMMPS (e.g. extra libs needed
+by MPI), or packages you have installed that require libraries
+provided with LAMMPS (e.g. the USER-ATC package requires
+lib/atc/libatc.so) or system libraries (e.g. BLAS or Fortran-to-C
+libraries) listed in the lib/package/Makefile.lammps file. See the
+discussion about the LAMMPS shared library in
+"Section_python"_Section_python.html for details about how to build
+shared versions of these libraries, and how to insure the operating
+system can find them, by setting the LD_LIBRARY_PATH environment
+variable correctly.
+
+Either flavor of library allows one or more LAMMPS objects to be
+instantiated from the calling program.
+
+When used from a C++ program, all of LAMMPS is wrapped in a LAMMPS_NS
namespace; you can safely use any of its classes and methods from
-within your application code, as needed.
+within the calling code, as needed.
-When used from a C or Fortran program or a scripting language, the
-library has a simple function-style interface, provided in
+When used from a C or Fortran program or a scripting language like
+Python, the library has a simple function-style interface, provided in
src/library.cpp and src/library.h.
-See the sample codes couple/simple/simple.cpp and simple.c as examples
-of C++ and C codes that invoke LAMMPS thru its library interface.
-There are other examples as well in the couple directory which are
-discussed in "Section_howto 10"_Section_howto.html#howto_10 of the
-manual. See "Section_python"_Section_python.html of the manual for a
-description of the Python wrapper provided with LAMMPS that operates
-through the LAMMPS library interface.
+See the sample codes in examples/COUPLE/simple for examples of C++ and
+C codes that invoke LAMMPS thru its library interface. There are
+other examples as well in the COUPLE directory which are discussed in
+"Section_howto 10"_Section_howto.html#howto_10 of the manual. See
+"Section_python"_Section_python.html of the manual for a description
+of the Python wrapper provided with LAMMPS that operates through the
+LAMMPS library interface.
The files src/library.cpp and library.h contain the C-style interface
to LAMMPS. See "Section_howto 19"_Section_howto.html#howto_19 of the
diff --git a/doc/compute_property_local.html b/doc/compute_property_local.html
index f7c1a32b1c..70b0fd7556 100644
--- a/doc/compute_property_local.html
+++ b/doc/compute_property_local.html
@@ -21,15 +21,17 @@
possible attributes = natom1 natom2 - patom1 patom2 +possible attributes = natom1 natom2 ntype1 ntype2 + patom1 patom2 ptype1 ptype2 batom1 batom2 btype aatom1 aatom2 aatom3 atype datom1 datom2 datom3 dtype iatom1 iatom2 iatom3 itypenatom1, natom2 = IDs of 2 atoms in each pair (within neighbor cutoff) + ntype1, ntype2 = type of 2 atoms in each pair (within neighbor cutoff) patom1, patom2 = IDs of 2 atoms in each pair (within force cutoff) + ptype1, ptype2 = type of 2 atoms in each pair (within force cutoff) batom1, batom2 = IDs of 2 atoms in each bond btype = bond type of each bond aatom1, aatom2, aatom3 = IDs of 3 atoms in each angle @@ -91,7 +93,9 @@ local command in a consistent way.The natom1 and natom2, or patom1 and patom2 attributes refer to the atom IDs of the 2 atoms in each pairwise interaction computed -by the pair_style command. +by the pair_style command. The ntype1 and +ntype2, or ptype1 and ptype2 attributes refer to the atom types +of the 2 atoms in each pairwise interaction.
IMPORTANT NOTE: For pairs, if two atoms I,J are involved in 1-2, 1-3, 1-4 interactions within the molecular topology, their pairwise @@ -107,9 +111,11 @@ command. atoms in each bond. The btype attribute refers to the type of the bond, from 1 to Nbtypes = # of bond types. The number of bond types is defined in the data file read by the -read_data command. The attributes that start with -"a", "d", "i", refer to similar values for angles, -dihedrals, and impropers. +read_data command. +
+The attributes that start with "a", "d", "i", refer to similar values +for angles, dihedrals, and +impropers.
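As a hypothetical usage sketch, the new type attributes can be written
out alongside the pair atom IDs via the dump local command; the
compute ID and file name here are arbitrary:

compute 1 all property/local patom1 patom2 ptype1 ptype2
dump 1 all local 100 tmp.dump index c_1[1] c_1[2] c_1[3] c_1[4] :pre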
Output info:
diff --git a/doc/compute_property_local.txt b/doc/compute_property_local.txt index 33da51c08e..ffc25d4e8a 100644 --- a/doc/compute_property_local.txt +++ b/doc/compute_property_local.txt @@ -15,15 +15,17 @@ compute ID group-ID property/local input1 input2 ... :pre ID, group-ID are documented in "compute"_compute.html command :ulb,l property/local = style name of this compute command :l input = one or more attributes :l - possible attributes = natom1 natom2 - patom1 patom2 + possible attributes = natom1 natom2 ntype1 ntype2 + patom1 patom2 ptype1 ptype2 batom1 batom2 btype aatom1 aatom2 aatom3 atype datom1 datom2 datom3 dtype iatom1 iatom2 iatom3 itype :pre natom1, natom2 = IDs of 2 atoms in each pair (within neighbor cutoff) + ntype1, ntype2 = type of 2 atoms in each pair (within neighbor cutoff) patom1, patom2 = IDs of 2 atoms in each pair (within force cutoff) + ptype1, ptype2 = type of 2 atoms in each pair (within force cutoff) batom1, batom2 = IDs of 2 atoms in each bond btype = bond type of each bond aatom1, aatom2, aatom3 = IDs of 3 atoms in each angle @@ -84,7 +86,9 @@ local"_dump.html command in a consistent way. The {natom1} and {natom2}, or {patom1} and {patom2} attributes refer to the atom IDs of the 2 atoms in each pairwise interaction computed -by the "pair_style"_pair_style.html command. +by the "pair_style"_pair_style.html command. The {ntype1} and +{ntype2}, or {ptype1} and {ptype2} attributes refer to the atom types +of the 2 atoms in each pairwise interaction. IMPORTANT NOTE: For pairs, if two atoms I,J are involved in 1-2, 1-3, 1-4 interactions within the molecular topology, their pairwise @@ -100,9 +104,11 @@ The {batom1} and {batom2} attributes refer to the atom IDs of the 2 atoms in each "bond"_bond_style.html. The {btype} attribute refers to the type of the bond, from 1 to Nbtypes = # of bond types. The number of bond types is defined in the data file read by the -"read_data"_read_data.html command. The attributes that start with -"a", "d", "i", refer to similar values for "angles"_angle_style.html, -"dihedrals"_dihedral_style.html, and "impropers"_improper_style.html. +"read_data"_read_data.html command. + +The attributes that start with "a", "d", "i", refer to similar values +for "angles"_angle_style.html, "dihedrals"_dihedral_style.html, and +"impropers"_improper_style.html. [Output info:] diff --git a/couple/README b/examples/COUPLE/README similarity index 91% rename from couple/README rename to examples/COUPLE/README index 93c0a48080..727d2b5367 100644 --- a/couple/README +++ b/examples/COUPLE/README @@ -8,8 +8,8 @@ model a realistic problem. See these sections of the LAMMPS manaul for details: -2.4 Building LAMMPS as a library (doc/Section_start.html#2_4) -4.10 Coupling LAMMPS to other codes (doc/Section_howto.html#4_10) +2.5 Building LAMMPS as a library (doc/Section_start.html#start_5) +6.10 Coupling LAMMPS to other codes (doc/Section_howto.html#howto_10) In all of the examples included here, LAMMPS must first be built as a library. 
Basically, you type something like diff --git a/couple/fortran/README b/examples/COUPLE/fortran/README similarity index 100% rename from couple/fortran/README rename to examples/COUPLE/fortran/README diff --git a/couple/fortran/libfwrapper.c b/examples/COUPLE/fortran/libfwrapper.c similarity index 100% rename from couple/fortran/libfwrapper.c rename to examples/COUPLE/fortran/libfwrapper.c diff --git a/couple/lammps_quest/Makefile.g++ b/examples/COUPLE/lammps_quest/Makefile.g++ similarity index 100% rename from couple/lammps_quest/Makefile.g++ rename to examples/COUPLE/lammps_quest/Makefile.g++ diff --git a/couple/lammps_quest/README b/examples/COUPLE/lammps_quest/README similarity index 100% rename from couple/lammps_quest/README rename to examples/COUPLE/lammps_quest/README diff --git a/couple/lammps_quest/in.lammps b/examples/COUPLE/lammps_quest/in.lammps similarity index 100% rename from couple/lammps_quest/in.lammps rename to examples/COUPLE/lammps_quest/in.lammps diff --git a/couple/lammps_quest/lmppath.h b/examples/COUPLE/lammps_quest/lmppath.h similarity index 100% rename from couple/lammps_quest/lmppath.h rename to examples/COUPLE/lammps_quest/lmppath.h diff --git a/couple/lammps_quest/lmpqst.cpp b/examples/COUPLE/lammps_quest/lmpqst.cpp similarity index 100% rename from couple/lammps_quest/lmpqst.cpp rename to examples/COUPLE/lammps_quest/lmpqst.cpp diff --git a/couple/lammps_quest/log.lammps.1 b/examples/COUPLE/lammps_quest/log.lammps.1 similarity index 100% rename from couple/lammps_quest/log.lammps.1 rename to examples/COUPLE/lammps_quest/log.lammps.1 diff --git a/couple/lammps_quest/log.lammps.4 b/examples/COUPLE/lammps_quest/log.lammps.4 similarity index 100% rename from couple/lammps_quest/log.lammps.4 rename to examples/COUPLE/lammps_quest/log.lammps.4 diff --git a/couple/lammps_quest/qstexe.h b/examples/COUPLE/lammps_quest/qstexe.h similarity index 100% rename from couple/lammps_quest/qstexe.h rename to examples/COUPLE/lammps_quest/qstexe.h diff --git a/couple/lammps_quest/si_111.in b/examples/COUPLE/lammps_quest/si_111.in similarity index 100% rename from couple/lammps_quest/si_111.in rename to examples/COUPLE/lammps_quest/si_111.in diff --git a/couple/lammps_spparks/Makefile.g++ b/examples/COUPLE/lammps_spparks/Makefile.g++ similarity index 100% rename from couple/lammps_spparks/Makefile.g++ rename to examples/COUPLE/lammps_spparks/Makefile.g++ diff --git a/couple/lammps_spparks/README b/examples/COUPLE/lammps_spparks/README similarity index 100% rename from couple/lammps_spparks/README rename to examples/COUPLE/lammps_spparks/README diff --git a/examples/COUPLE/lammps_spparks/in.lammps b/examples/COUPLE/lammps_spparks/in.lammps new file mode 100644 index 0000000000..892ee492eb --- /dev/null +++ b/examples/COUPLE/lammps_spparks/in.lammps @@ -0,0 +1,114 @@ +units lj +dimension 2 +atom_style atomic + +read_data data.lammps +mass * 1.0 + +pair_style lj/cut 2.5 +pair_coeff * * 1.0 1.2 +pair_coeff 1 1 1.0 1.0 +pair_coeff 2 2 1.0 1.0 +pair_coeff 3 3 1.0 1.0 +pair_coeff 4 4 1.0 1.0 +pair_coeff 5 5 1.0 1.0 +pair_coeff 6 6 1.0 1.0 +pair_coeff 7 7 1.0 1.0 +pair_coeff 8 8 1.0 1.0 +pair_coeff 9 9 1.0 1.0 +pair_coeff 10 10 1.0 1.0 +pair_coeff 11 11 1.0 1.0 +pair_coeff 12 12 1.0 1.0 +pair_coeff 13 13 1.0 1.0 +pair_coeff 14 14 1.0 1.0 +pair_coeff 15 15 1.0 1.0 +pair_coeff 16 16 1.0 1.0 +pair_coeff 17 17 1.0 1.0 +pair_coeff 18 18 1.0 1.0 +pair_coeff 19 19 1.0 1.0 +pair_coeff 20 20 1.0 1.0 +pair_coeff 21 21 1.0 1.0 +pair_coeff 22 22 1.0 1.0 +pair_coeff 23 23 1.0 1.0 +pair_coeff 24 24 
1.0 1.0 +pair_coeff 25 25 1.0 1.0 +pair_coeff 26 26 1.0 1.0 +pair_coeff 27 27 1.0 1.0 +pair_coeff 28 28 1.0 1.0 +pair_coeff 29 29 1.0 1.0 +pair_coeff 30 30 1.0 1.0 +pair_coeff 31 31 1.0 1.0 +pair_coeff 32 32 1.0 1.0 +pair_coeff 33 33 1.0 1.0 +pair_coeff 34 34 1.0 1.0 +pair_coeff 35 35 1.0 1.0 +pair_coeff 36 36 1.0 1.0 +pair_coeff 37 37 1.0 1.0 +pair_coeff 38 38 1.0 1.0 +pair_coeff 39 39 1.0 1.0 +pair_coeff 40 40 1.0 1.0 +pair_coeff 41 41 1.0 1.0 +pair_coeff 42 42 1.0 1.0 +pair_coeff 43 43 1.0 1.0 +pair_coeff 44 44 1.0 1.0 +pair_coeff 45 45 1.0 1.0 +pair_coeff 46 46 1.0 1.0 +pair_coeff 47 47 1.0 1.0 +pair_coeff 48 48 1.0 1.0 +pair_coeff 49 49 1.0 1.0 +pair_coeff 50 50 1.0 1.0 +pair_coeff 51 51 1.0 1.0 +pair_coeff 52 52 1.0 1.0 +pair_coeff 53 53 1.0 1.0 +pair_coeff 54 54 1.0 1.0 +pair_coeff 55 55 1.0 1.0 +pair_coeff 56 56 1.0 1.0 +pair_coeff 57 57 1.0 1.0 +pair_coeff 58 58 1.0 1.0 +pair_coeff 59 59 1.0 1.0 +pair_coeff 60 60 1.0 1.0 +pair_coeff 61 61 1.0 1.0 +pair_coeff 62 62 1.0 1.0 +pair_coeff 63 63 1.0 1.0 +pair_coeff 64 64 1.0 1.0 +pair_coeff 65 65 1.0 1.0 +pair_coeff 66 66 1.0 1.0 +pair_coeff 67 67 1.0 1.0 +pair_coeff 68 68 1.0 1.0 +pair_coeff 69 69 1.0 1.0 +pair_coeff 70 70 1.0 1.0 +pair_coeff 71 71 1.0 1.0 +pair_coeff 72 72 1.0 1.0 +pair_coeff 73 73 1.0 1.0 +pair_coeff 74 74 1.0 1.0 +pair_coeff 75 75 1.0 1.0 +pair_coeff 76 76 1.0 1.0 +pair_coeff 77 77 1.0 1.0 +pair_coeff 78 78 1.0 1.0 +pair_coeff 79 79 1.0 1.0 +pair_coeff 80 80 1.0 1.0 +pair_coeff 81 81 1.0 1.0 +pair_coeff 82 82 1.0 1.0 +pair_coeff 83 83 1.0 1.0 +pair_coeff 84 84 1.0 1.0 +pair_coeff 85 85 1.0 1.0 +pair_coeff 86 86 1.0 1.0 +pair_coeff 87 87 1.0 1.0 +pair_coeff 88 88 1.0 1.0 +pair_coeff 89 89 1.0 1.0 +pair_coeff 90 90 1.0 1.0 +pair_coeff 91 91 1.0 1.0 +pair_coeff 92 92 1.0 1.0 +pair_coeff 93 93 1.0 1.0 +pair_coeff 94 94 1.0 1.0 +pair_coeff 95 95 1.0 1.0 +pair_coeff 96 96 1.0 1.0 +pair_coeff 97 97 1.0 1.0 +pair_coeff 98 98 1.0 1.0 +pair_coeff 99 99 1.0 1.0 +pair_coeff 100 100 1.0 1.0 + +compute da all displace/atom + +dump 1 all atom 10 dump.md +thermo 1 diff --git a/couple/lammps_spparks/in.spparks b/examples/COUPLE/lammps_spparks/in.spparks similarity index 100% rename from couple/lammps_spparks/in.spparks rename to examples/COUPLE/lammps_spparks/in.spparks diff --git a/couple/lammps_spparks/lmppath.h b/examples/COUPLE/lammps_spparks/lmppath.h similarity index 100% rename from couple/lammps_spparks/lmppath.h rename to examples/COUPLE/lammps_spparks/lmppath.h diff --git a/couple/lammps_spparks/lmpspk.cpp b/examples/COUPLE/lammps_spparks/lmpspk.cpp similarity index 100% rename from couple/lammps_spparks/lmpspk.cpp rename to examples/COUPLE/lammps_spparks/lmpspk.cpp diff --git a/couple/lammps_spparks/log.lammps.1 b/examples/COUPLE/lammps_spparks/log.lammps.1 similarity index 100% rename from couple/lammps_spparks/log.lammps.1 rename to examples/COUPLE/lammps_spparks/log.lammps.1 diff --git a/couple/lammps_spparks/log.lammps.4 b/examples/COUPLE/lammps_spparks/log.lammps.4 similarity index 100% rename from couple/lammps_spparks/log.lammps.4 rename to examples/COUPLE/lammps_spparks/log.lammps.4 diff --git a/couple/lammps_spparks/log.spparks.1 b/examples/COUPLE/lammps_spparks/log.spparks.1 similarity index 100% rename from couple/lammps_spparks/log.spparks.1 rename to examples/COUPLE/lammps_spparks/log.spparks.1 diff --git a/couple/lammps_spparks/log.spparks.4 b/examples/COUPLE/lammps_spparks/log.spparks.4 similarity index 100% rename from couple/lammps_spparks/log.spparks.4 rename to 
examples/COUPLE/lammps_spparks/log.spparks.4 diff --git a/couple/lammps_spparks/spkpath.h b/examples/COUPLE/lammps_spparks/spkpath.h similarity index 100% rename from couple/lammps_spparks/spkpath.h rename to examples/COUPLE/lammps_spparks/spkpath.h diff --git a/couple/library/Makefile.g++ b/examples/COUPLE/library/Makefile.g++ similarity index 100% rename from couple/library/Makefile.g++ rename to examples/COUPLE/library/Makefile.g++ diff --git a/couple/library/README b/examples/COUPLE/library/README similarity index 100% rename from couple/library/README rename to examples/COUPLE/library/README diff --git a/couple/library/error.cpp b/examples/COUPLE/library/error.cpp similarity index 100% rename from couple/library/error.cpp rename to examples/COUPLE/library/error.cpp diff --git a/couple/library/error.h b/examples/COUPLE/library/error.h similarity index 100% rename from couple/library/error.h rename to examples/COUPLE/library/error.h diff --git a/couple/library/files.cpp b/examples/COUPLE/library/files.cpp similarity index 100% rename from couple/library/files.cpp rename to examples/COUPLE/library/files.cpp diff --git a/couple/library/files.h b/examples/COUPLE/library/files.h similarity index 100% rename from couple/library/files.h rename to examples/COUPLE/library/files.h diff --git a/couple/library/irregular.cpp b/examples/COUPLE/library/irregular.cpp similarity index 100% rename from couple/library/irregular.cpp rename to examples/COUPLE/library/irregular.cpp diff --git a/couple/library/irregular.h b/examples/COUPLE/library/irregular.h similarity index 100% rename from couple/library/irregular.h rename to examples/COUPLE/library/irregular.h diff --git a/couple/library/lammps_data_write.cpp b/examples/COUPLE/library/lammps_data_write.cpp similarity index 100% rename from couple/library/lammps_data_write.cpp rename to examples/COUPLE/library/lammps_data_write.cpp diff --git a/couple/library/lammps_data_write.h b/examples/COUPLE/library/lammps_data_write.h similarity index 100% rename from couple/library/lammps_data_write.h rename to examples/COUPLE/library/lammps_data_write.h diff --git a/couple/library/many2many.cpp b/examples/COUPLE/library/many2many.cpp similarity index 100% rename from couple/library/many2many.cpp rename to examples/COUPLE/library/many2many.cpp diff --git a/couple/library/many2many.h b/examples/COUPLE/library/many2many.h similarity index 100% rename from couple/library/many2many.h rename to examples/COUPLE/library/many2many.h diff --git a/couple/library/many2one.cpp b/examples/COUPLE/library/many2one.cpp similarity index 100% rename from couple/library/many2one.cpp rename to examples/COUPLE/library/many2one.cpp diff --git a/couple/library/many2one.h b/examples/COUPLE/library/many2one.h similarity index 100% rename from couple/library/many2one.h rename to examples/COUPLE/library/many2one.h diff --git a/couple/library/memory.cpp b/examples/COUPLE/library/memory.cpp similarity index 100% rename from couple/library/memory.cpp rename to examples/COUPLE/library/memory.cpp diff --git a/couple/library/memory.h b/examples/COUPLE/library/memory.h similarity index 100% rename from couple/library/memory.h rename to examples/COUPLE/library/memory.h diff --git a/couple/library/one2many.cpp b/examples/COUPLE/library/one2many.cpp similarity index 100% rename from couple/library/one2many.cpp rename to examples/COUPLE/library/one2many.cpp diff --git a/couple/library/one2many.h b/examples/COUPLE/library/one2many.h similarity index 100% rename from couple/library/one2many.h rename to 
examples/COUPLE/library/one2many.h
diff --git a/couple/library/send2one.cpp b/examples/COUPLE/library/send2one.cpp
similarity index 100%
rename from couple/library/send2one.cpp
rename to examples/COUPLE/library/send2one.cpp
diff --git a/couple/library/send2one.h b/examples/COUPLE/library/send2one.h
similarity index 100%
rename from couple/library/send2one.h
rename to examples/COUPLE/library/send2one.h
diff --git a/couple/simple/README b/examples/COUPLE/simple/README
similarity index 96%
rename from couple/simple/README
rename to examples/COUPLE/simple/README
index 9ab2186f0c..a1d5e2034e 100644
--- a/couple/simple/README
+++ b/examples/COUPLE/simple/README
@@ -26,13 +26,13 @@ This builds the C++ driver with the LAMMPS library
using a C++ compiler:
g++ -I/home/sjplimp/lammps/src -c simple.cpp
g++ -L/home/sjplimp/lammps/src simple.o \
- -llmp_g++ -lfftw -lmpich -lpthread -o simpleCC
+ -llmp_g++ -lfftw -lmpich -lmpl -lpthread -o simpleCC
This builds the C driver with the LAMMPS library using a C compiler:
gcc -I/home/sjplimp/lammps/src -c simple.c
gcc -L/home/sjplimp/lammps/src simple.o \
- -llmp_g++ -lfftw -lmpich -lpthread -lstdc++ -o simpleC
+ -llmp_g++ -lfftw -lmpich -lmpl -lpthread -lstdc++ -o simpleC
This builds the Fortran wrapper and driver with the LAMMPS library
using a Fortran and C compiler:
diff --git a/couple/simple/in.lj b/examples/COUPLE/simple/in.lj
similarity index 100%
rename from couple/simple/in.lj
rename to examples/COUPLE/simple/in.lj
diff --git a/couple/simple/log.simple.c++.1 b/examples/COUPLE/simple/log.simple.c++.1
similarity index 100%
rename from couple/simple/log.simple.c++.1
rename to examples/COUPLE/simple/log.simple.c++.1
diff --git a/couple/simple/log.simple.c++.4 b/examples/COUPLE/simple/log.simple.c++.4
similarity index 100%
rename from couple/simple/log.simple.c++.4
rename to examples/COUPLE/simple/log.simple.c++.4
diff --git a/couple/simple/simple.c b/examples/COUPLE/simple/simple.c
similarity index 100%
rename from couple/simple/simple.c
rename to examples/COUPLE/simple/simple.c
diff --git a/couple/simple/simple.cpp b/examples/COUPLE/simple/simple.cpp
similarity index 100%
rename from couple/simple/simple.cpp
rename to examples/COUPLE/simple/simple.cpp
diff --git a/couple/simple/simple.f90 b/examples/COUPLE/simple/simple.f90
similarity index 100%
rename from couple/simple/simple.f90
rename to examples/COUPLE/simple/simple.f90
diff --git a/examples/README b/examples/README
index 5f614b9688..825faf2c82 100644
--- a/examples/README
+++ b/examples/README
@@ -1,12 +1,26 @@ LAMMPS example problems
+There are 3 flavors of sub-directories in this directory, each with sample
+problems you can run with LAMMPS.
+
+lower-case directories = simple test problems for LAMMPS and its packages
+upper-case directories = more complex problems
+USER directory with its own sub-directories = tests for USER packages
+
+Each is discussed below.
+
+------------------------------------------
+
+Lower-case directories
+
Each of these sub-directories contains a sample problem you can run
with LAMMPS. Most are 2d models so that they run quickly, requiring a
few seconds to a few minutes to run on a desktop machine. Each
problem has an input script (in.*) and produces a log file (log.*) and
(optionally) a dump file (dump.*) or image files (image.*) when it
runs. Some use a data file (data.*) of initial coordinates as
-additional input.
+additional input. Some require that you install one or more optional
+LAMMPS packages.
A few sample log file outputs on different machines and different
numbers of processors are included in the directories to compare your
@@ -77,12 +91,22 @@ create a GIF file suitable for viewing in a browser.
------------------------------------------
-There is also an ELASTIC directory with an example script for
-computing elastic constants, using a zero temperature Si example. See
-the in.elastic file for more info.
+Upper-case directories
-There is also a USER directory which contains subdirectories of
-user-provided examples for user packages. See the README files in
-those directories for more info. See the doc/Section_start.html file
-for more info about user packages.
+The COUPLE directory has examples of how to use LAMMPS as a library,
+either by itself or in tandem with another code or library. See the
+COUPLE/README file to get started.
+
+The ELASTIC directory has an example script for computing elastic
+constants, using a zero temperature Si example. See the
+ELASTIC/in.elastic file for more info.
+
+------------------------------------------
+
+USER directory
+
+The USER directory contains subdirectories of user-provided example
+scripts for user packages. See the README files in those directories
+for more info. See the doc/Section_start.html file for more info
+about installing and building user packages.
diff --git a/python/README b/python/README
index 49465b3596..2b51b5f710 100644
--- a/python/README
+++ b/python/README
@@ -1,26 +1,48 @@ This directory contains Python code which wraps LAMMPS as a library
-and allows the library interface to be invoked from a Python, either
-from a script or interactively.
+and allows the LAMMPS library interface to be invoked from Python,
+either from a script or interactively.
-Details on how to build and use this Python interface are given in
+Details on the Python interface to LAMMPS and how to build LAMMPS as a
+shared library for use with Python are given in
doc/Section_python.html.
-Basically you have to extend the Python on your box to include the
-LAMMPS wrappers:
+Basically you need to follow these 3 steps:
-python setup_serial.py build # for serial LAMMPS and Python
-sudo python setup_serial.py install
+a) Add paths to environment variables in your shell start-up script
-python setup.py build # for parallel LAMMPS and Python
-sudo python setup.py install
+For example, for csh or tcsh, add something like this to ~/.cshrc:
-but there are several issues to be aware of, as discussed in the doc
-pages.
+setenv PYTHONPATH ${PYTHONPATH}:/home/sjplimp/lammps/python
+setenv LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/home/sjplimp/lammps/src
+setenv LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/home/sjplimp/lammps/src/STUBS
+
+The latter is only necessary if you will use the MPI stubs library
+instead of an MPI installed on your machine.
+
+b) Build LAMMPS as a dynamic library, including dynamic versions of
+any libraries it needs for the packages you have installed,
+e.g. STUBS, MPI, FFTW, JPEG, package libs.
+
+From the src directory:
+
+% make makeshlib
+% make -f Makefile.shlib g++
+
+If successful, this results in the file src/liblmp_g++.so
+
+c) Launch Python and import the LAMMPS wrapper
+
+% python
+>>> from lammps import lammps
+>>> lmp = lammps()
+
+If that gives no errors, you have successfully wrapped LAMMPS with
+Python.
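For bash users, the equivalents of the csh setenv lines in step (a)
above would be the following exports (same assumed paths):

export PYTHONPATH=${PYTHONPATH}:/home/sjplimp/lammps/python
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/home/sjplimp/lammps/src
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/home/sjplimp/lammps/src/STUBS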
------------------------------------------------------------------- -Once you have successfully built and tested the wrappers, you can run -the Python scripts in the examples sub-directory: +Once you have successfully wrapped LAMMPS, you can run the Python +scripts in the examples sub-directory: trivial.py read/run a LAMMPS input script thru Python demo.py invoke various LAMMPS library interface routines diff --git a/python/lammps.py b/python/lammps.py index 42cdfa496d..aa889e0233 100644 --- a/python/lammps.py +++ b/python/lammps.py @@ -26,28 +26,26 @@ LMPDPTRPTR = 4 LOCATION = os.path.dirname(__file__) class lammps: - def __init__(self,args=None): + def __init__(self,name="",cmdlineargs=None): - # attempt to load parallel library first, serial library next - # could provide caller a flag to choose which library to load + # load liblmp.so by default + # if name = "g++", load liblmp_g++.so try: - self.lib = CDLL(os.path.join(LOCATION, "_lammps.so")) + if not name: self.lib = CDLL("liblmp.so") + else: self.lib = CDLL("liblmp_%s.so" % name) except: - try: - self.lib = CDLL(os.path.join(LOCATION, "_lammps_serial.so")) - except: - raise OSError,"Could not load LAMMPS dynamic library" + raise OSError,"Could not load LAMMPS dynamic library" # create an instance of LAMMPS # don't know how to pass an MPI communicator from PyPar # no_mpi call lets LAMMPS use MPI_COMM_WORLD # cargs = array of C strings from args - if args: - args.insert(0,"lammps.py") - narg = len(args) - cargs = (c_char_p*narg)(*args) + if cmdlineargs: + cmdlineargs.insert(0,"lammps.py") + narg = len(cmdlineargs) + cargs = (c_char_p*narg)(*cmdlineargs) self.lmp = c_void_p() self.lib.lammps_open_no_mpi(narg,cargs,byref(self.lmp)) else: diff --git a/python/setup.py b/python/setup.py deleted file mode 100755 index 072673c027..0000000000 --- a/python/setup.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/local/bin/python - -""" -setup.py file for LAMMPS with system MPICH library -""" - -from distutils.core import setup, Extension - -import os, glob -path = os.path.dirname(os.getcwd()) - -# list of src files for LAMMPS - -libfiles = glob.glob("%s/src/*.cpp" % path) - -lammps_library = Extension("_lammps", - sources = libfiles, - define_macros = [("MPICH_IGNORE_CXX_SEEK",1), - ("LAMMPS_GZIP",1), - ("FFT_NONE",1),], - # src files for LAMMPS - include_dirs = ["../src"], - # additional libs for MPICH on Linux - libraries = ["mpich","mpl","pthread"], - # where to find the MPICH lib on Linux - library_dirs = ["/usr/local/lib"], - # additional libs for MPI on Mac - # libraries = ["mpi"], - ) - -setup(name = "lammps", - version = "28Nov11", - author = "Steve Plimpton", - author_email = "sjplimp@sandia.gov", - url = "http://lammps.sandia.gov", - description = """LAMMPS molecular dynamics library - parallel""", - py_modules = ["lammps"], - ext_modules = [lammps_library] - ) diff --git a/python/setup_serial.py b/python/setup_serial.py deleted file mode 100755 index 2aa242b19a..0000000000 --- a/python/setup_serial.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/local/bin/python - -""" -setup_serial.py file for LAMMPS with dummy serial MPI library -""" - -from distutils.core import setup, Extension - -import os, glob -path = os.path.dirname(os.getcwd()) - -# list of src files for LAMMPS and MPI STUBS - -libfiles = glob.glob("%s/src/*.cpp" % path) + \ - glob.glob("%s/src/STUBS/*.c" % path) - -lammps_library = Extension("_lammps_serial", - sources = libfiles, - define_macros = [("MPICH_IGNORE_CXX_SEEK",1), - ("LAMMPS_GZIP",1), - ("FFT_NONE",1),], - # src files for 
LAMMPS and MPI STUBS - include_dirs = ["../src", "../src/STUBS"] - ) - -setup(name = "lammps_serial", - version = "28Nov11", - author = "Steve Plimpton", - author_email = "sjplimp@sandia.gov", - url = "http://lammps.sandia.gov", - description = """LAMMPS molecular dynamics library - serial""", - py_modules = ["lammps"], - ext_modules = [lammps_library] - ) diff --git a/src/MAKE/Makefile.altix b/src/MAKE/Makefile.altix index 39d5d26779..f51239a179 100644 --- a/src/MAKE/Makefile.altix +++ b/src/MAKE/Makefile.altix @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = icc CCFLAGS = -O2 +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = icc LINKFLAGS = -O2 LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.bgl b/src/MAKE/Makefile.bgl index ad3a64d468..08a520d1ce 100644 --- a/src/MAKE/Makefile.bgl +++ b/src/MAKE/Makefile.bgl @@ -9,7 +9,9 @@ SHELL = /bin/sh CC = /opt/ibmcmp/vacpp/7.0/bin/blrts_xlC CCFLAGS = -O3 +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = /opt/ibmcmp/vacpp/7.0/bin/blrts_xlC LINKFLAGS = -O \ -L/opt/ibmcmp/xlf/9.1/blrts_lib \ @@ -18,9 +20,11 @@ LINKFLAGS = -O \ -L/bgl/local/bglfftwgel-2.1.5.pre5/lib LIB = -lxlopt -lxlomp_ser -lxl -lxlfmath -lm \ -lmsglayer.rts -lrts.rts -ldevices.rts -lmassv +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -82,15 +86,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.cygwin b/src/MAKE/Makefile.cygwin index 493ae48f66..14600d9882 100644 --- a/src/MAKE/Makefile.cygwin +++ b/src/MAKE/Makefile.cygwin @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = mpicxx CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpicxx LINKFLAGS = -O LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.encanto 
b/src/MAKE/Makefile.encanto index 8140f73337..cf8ba64ee3 100644 --- a/src/MAKE/Makefile.encanto +++ b/src/MAKE/Makefile.encanto @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = mpicxx CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpicxx LINKFLAGS = -O LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.fink b/src/MAKE/Makefile.fink index 67d81497e7..24ccb46afb 100644 --- a/src/MAKE/Makefile.fink +++ b/src/MAKE/Makefile.fink @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = c++ CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = c++ LINKFLAGS = -O LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -34,8 +38,8 @@ LMP_INC = -DLAMMPS_GZIP # LIB = name of MPI library MPI_INC = -I../STUBS -MPI_PATH = -MPI_LIB = ../STUBS/libmpi.a +MPI_PATH = -L../STUBS +MPI_LIB = -lmpi_stubs # FFT library, OPTIONAL # see discussion in doc/Section_start.html#2_2 (step 6) @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.g++ b/src/MAKE/Makefile.g++ index 62f82e372a..a884c91fd7 100644 --- a/src/MAKE/Makefile.g++ +++ b/src/MAKE/Makefile.g++ @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = g++ CCFLAGS = -g -O # -Wunused +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = g++ LINKFLAGS = -g -O -LIB = +LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.g++3 b/src/MAKE/Makefile.g++3 index 490f47b7be..ddc7f0488a 100644 --- a/src/MAKE/Makefile.g++3 +++ b/src/MAKE/Makefile.g++3 @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = g++ CCFLAGS = -g -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = g++ LINKFLAGS = -g -O LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # 
---------------------------------------------------------------------
# LAMMPS-specific settings
@@ -76,15 +80,19 @@ $(EXE): $(OBJ)
$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
$(SIZE) $(EXE)
-# Library target
+# Library targets
lib: $(OBJ)
$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+shlib: $(OBJ)
+ $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+ $(OBJ) $(EXTRA_LIB) $(LIB)
+
# Compilation rules
%.o:%.cpp
- $(CC) $(CCFLAGS) $(EXTRA_INC) -c $<
+ $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
%.d:%.cpp
$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
diff --git a/src/MAKE/Makefile.glory b/src/MAKE/Makefile.glory
index 689d95cae5..e7dc23de51 100644
--- a/src/MAKE/Makefile.glory
+++ b/src/MAKE/Makefile.glory
@@ -25,13 +25,17 @@ SHELL = /bin/sh
CC = mpicxx
CCFLAGS = -O
+SHFLAGS = -fPIC
DEPFLAGS = -M
+
LINK = mpicxx
LINKFLAGS = -O
LIB = -lstdc++ -lm
+SIZE = size
+
ARCHIVE = ar
ARFLAGS = -rc
-SIZE = size
+SHLIBFLAGS = -shared
# ---------------------------------------------------------------------
# LAMMPS-specific settings
@@ -93,15 +97,19 @@ $(EXE): $(OBJ)
$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
$(SIZE) $(EXE)
-# Library target
+# Library targets
lib: $(OBJ)
$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+shlib: $(OBJ)
+ $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+ $(OBJ) $(EXTRA_LIB) $(LIB)
+
# Compilation rules
%.o:%.cpp
- $(CC) $(CCFLAGS) $(EXTRA_INC) -c $<
+ $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
%.d:%.cpp
$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
diff --git a/src/MAKE/Makefile.jaguar b/src/MAKE/Makefile.jaguar
index cc88d37e60..14e2d4479b 100644
--- a/src/MAKE/Makefile.jaguar
+++ b/src/MAKE/Makefile.jaguar
@@ -8,13 +8,17 @@ SHELL = /bin/sh
CXX = CC
CCFLAGS = -g -O
+SHFLAGS = -fPIC
DEPFLAGS = -M
+
LINK = $(CXX)
LINKFLAGS = -g -O
LIB =
+SIZE = size
+
ARCHIVE = ar
ARFLAGS = -rc
-SIZE = size
+SHLIBFLAGS = -shared
# ---------------------------------------------------------------------
# LAMMPS-specific settings
@@ -76,18 +80,22 @@ $(EXE): $(OBJ)
$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
$(SIZE) $(EXE)
-# Library target
+# Library targets
lib: $(OBJ)
$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+shlib: $(OBJ)
+ $(CXX) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+ $(OBJ) $(EXTRA_LIB) $(LIB)
+
# Compilation rules
%.o:%.cpp
- $(CXX) $(CCFLAGS) $(EXTRA_INC) -c $<
+ $(CXX) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
%.d:%.cpp
- $(CXX) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
+ $(CXX) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
# Individual dependencies
diff --git a/src/MAKE/Makefile.lam b/src/MAKE/Makefile.lam
index 2c86ad752f..fed9d8f601 100644
--- a/src/MAKE/Makefile.lam
+++ b/src/MAKE/Makefile.lam
@@ -8,13 +8,17 @@ SHELL = /bin/sh
CC = mpic++
CCFLAGS = -O3
+SHFLAGS = -fPIC
DEPFLAGS = -M
+
LINK = mpic++
LINKFLAGS = -O3
LIB = -lstdc++
+SIZE = size
+
ARCHIVE = ar
ARFLAGS = -rc
-SIZE = size
+SHLIBFLAGS = -shared
# ---------------------------------------------------------------------
# LAMMPS-specific settings
@@ -76,15 +80,19 @@ $(EXE): $(OBJ)
$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
$(SIZE) $(EXE)
-# Library target
+# Library targets
lib: $(OBJ)
$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+shlib: $(OBJ)
+ $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+ $(OBJ) $(EXTRA_LIB) $(LIB)
+
# Compilation rules
%.o:%.cpp
- $(CC) $(CCFLAGS) $(EXTRA_INC) -c $<
+ $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
%.d:%.cpp
$(CC)
$(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.linux index cf39084f2d..5378247c36 100644 --- a/src/MAKE/Makefile.linux +++ b/src/MAKE/Makefile.linux @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = icc CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = icc LINKFLAGS = -O LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.mac b/src/MAKE/Makefile.mac index 0742fc945a..ccefa59262 100644 --- a/src/MAKE/Makefile.mac +++ b/src/MAKE/Makefile.mac @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = c++ CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = c++ LINKFLAGS = -O LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -34,8 +38,8 @@ LMP_INC = -DLAMMPS_GZIP # LIB = name of MPI library MPI_INC = -I../STUBS -MPI_PATH = -MPI_LIB = ../STUBS/libmpi.a +MPI_PATH = -L../STUBS +MPI_LIB = -lmpi_stubs # FFT library, OPTIONAL # see discussion in doc/Section_start.html#2_2 (step 6) @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.mac_mpi b/src/MAKE/Makefile.mac_mpi index e4a7e31a32..52825c84c2 100644 --- a/src/MAKE/Makefile.mac_mpi +++ b/src/MAKE/Makefile.mac_mpi @@ -9,13 +9,17 @@ SHELL = /bin/sh CC = ${MPI_GCC46_PATH}/mpic++ CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = ${MPI_GCC46_PATH}/mpic++ LINKFLAGS = -O LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -79,15 +83,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.mingw b/src/MAKE/Makefile.mingw index 2a553f4275..70ed3649f9 100644 --- a/src/MAKE/Makefile.mingw +++ b/src/MAKE/Makefile.mingw @@ -9,13 +9,17 @@ SHELL = /bin/sh CC = i686-pc-mingw32-g++ CCFLAGS = -O3 -march=i686 -mtune=generic -mfpmath=387 -mpc64 \ -ffast-math -funroll-loops -fstrict-aliasing -Wall 
-W -Wno-uninitialized +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = i686-pc-mingw32-g++ LINKFLAGS = -O LIB = -lwsock32 # -lwsock32 is needed for USER-IMD which uses tcp/ip sockets. +SIZE = i686-pc-mingw32-size + ARCHIVE = ar ARFLAGS = -rcsv -SIZE = i686-pc-mingw32-size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -34,9 +38,9 @@ LMP_INC = -DLAMMPS_XDR # -DLAMMPS_GZIP -DMALLOC_MEMALIGN=64 # PATH = path for MPI library # LIB = name of MPI library -MPI_INC = -I../STUBS +MPI_INC = -I../STUBS MPI_PATH = -MPI_LIB = mpi.o +MPI_LIB = mpi.o # FFT library, OPTIONAL # see discussion in doc/Section_start.html#2_2 (step 6) diff --git a/src/MAKE/Makefile.mkl b/src/MAKE/Makefile.mkl index 043390f1c8..c2c0397b2f 100644 --- a/src/MAKE/Makefile.mkl +++ b/src/MAKE/Makefile.mkl @@ -14,13 +14,17 @@ SHELL = /bin/sh CC = mpiicc CCFLAGS = -O3 -fno-alias -ip -unroll0 +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpiicc LINKFLAGS = -O -L/opt/intel/mkl/10.0.011/lib/em64t LIB = -lstdc++ -lpthread -lmkl_em64t -lguide +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -82,15 +86,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.odin b/src/MAKE/Makefile.odin index 479ef95ea9..fd0dd11f78 100644 --- a/src/MAKE/Makefile.odin +++ b/src/MAKE/Makefile.odin @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = g++ CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = G++ LINKFLAGS = -O LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.openmpi b/src/MAKE/Makefile.openmpi index 4b1c7e42a9..898e85dacb 100644 --- a/src/MAKE/Makefile.openmpi +++ b/src/MAKE/Makefile.openmpi @@ -10,13 +10,17 @@ CC = mpic++ CCFLAGS = -O2 -fomit-frame-pointer -fno-rtti -fno-exceptions -g \ -march=native -ffast-math -mpc64 -finline-functions \ -funroll-loops -fstrict-aliasing -Wall -W -Wno-uninitialized +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpic++ LINKFLAGS = -O -g -fno-rtti -fno-exceptions -mpc64 LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rcsv -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -78,15 +82,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) 
$(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.pgi b/src/MAKE/Makefile.pgi index 67db1a0fe2..0ed1544a35 100644 --- a/src/MAKE/Makefile.pgi +++ b/src/MAKE/Makefile.pgi @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = pgCC CCFLAGS = -fast +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = pgCC LINKFLAGS = LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.power5 b/src/MAKE/Makefile.power5 index 26d87dc92e..d3d0af6512 100644 --- a/src/MAKE/Makefile.power5 +++ b/src/MAKE/Makefile.power5 @@ -9,13 +9,17 @@ SHELL = /bin/sh CC = mpCC_r CCFLAGS = -O3 -qnoipa -qlanglvl=oldmath +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpCC_r LINKFLAGS = -O -qnoipa -qlanglvl=oldmath -bmaxdata:0x70000000 LIB = -lm +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -77,15 +81,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.qed b/src/MAKE/Makefile.qed index 49349d45d2..be4fd03bbf 100644 --- a/src/MAKE/Makefile.qed +++ b/src/MAKE/Makefile.qed @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = mpiCC CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpiCC LINKFLAGS = -O LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.redsky b/src/MAKE/Makefile.redsky index 3316dcb6e1..ea9e5bd148 100644 --- a/src/MAKE/Makefile.redsky +++ b/src/MAKE/Makefile.redsky @@ -36,13 +36,17 @@ SHELL = /bin/sh CC = mpic++ CCFLAGS = -O2 -xsse4.2 -funroll-loops -fstrict-aliasing +SHFLAGS = 
-fPIC DEPFLAGS = -M + LINK = mpic++ LINKFLAGS = -O -xsse4.2 LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rcsv -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -104,15 +108,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.sdsc b/src/MAKE/Makefile.sdsc index 7ef819b34e..6384149990 100644 --- a/src/MAKE/Makefile.sdsc +++ b/src/MAKE/Makefile.sdsc @@ -10,16 +10,20 @@ SHELL = /bin/sh CC = blrts_xlC CCFLAGS = -I/bgl/BlueLight/ppcfloor/bglsys/include \ -O2 -qarch=440 -qtune=440 +SHFLAGS = -fPIC DEPFLAGS = -M -qmakedep=gcc + LINK = blrts_xlC LINKFLAGS = -O \ -L/bgl/BlueLight/ppcfloor/bglsys/lib \ -L/opt/ibmcmp/xlf/bg/10.1/blrts_lib \ -L/opt/ibmcmp/vacpp/bg/8.0/blrts_lib LIB = -lm +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -81,15 +85,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.seaborg b/src/MAKE/Makefile.seaborg index e05116dcd4..dc17f2c242 100644 --- a/src/MAKE/Makefile.seaborg +++ b/src/MAKE/Makefile.seaborg @@ -9,13 +9,17 @@ SHELL = /bin/sh CC = mpCC_r CCFLAGS = -O2 -qnoipa +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpCC_r LINKFLAGS = -O -L/usr/lib LIB = -lm +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -77,15 +81,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.serial b/src/MAKE/Makefile.serial index f49464993a..b0426b3e8e 100644 --- a/src/MAKE/Makefile.serial +++ b/src/MAKE/Makefile.serial @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = g++ CCFLAGS = -O -g +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = g++ LINKFLAGS = -O -g LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -34,8 +38,8 @@ LMP_INC = #-DLAMMPS_GZIP -DMALLOC_MEMALIGN=64 # LIB = name of MPI library MPI_INC = -I../STUBS -MPI_PATH = -MPI_LIB = ../STUBS/libmpi.a +MPI_PATH = 
-L../STUBS +MPI_LIB = -lmpi_stubs # FFT library, OPTIONAL # see discussion in doc/Section_start.html#2_2 (step 6) @@ -44,7 +48,7 @@ MPI_LIB = ../STUBS/libmpi.a # PATH = path for FFT library # LIB = name of FFT library -FFT_INC = +FFT_INC = FFT_PATH = FFT_LIB = -lfftw3f @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.serial_debug b/src/MAKE/Makefile.serial_debug index 18898b3d32..1841175b55 100644 --- a/src/MAKE/Makefile.serial_debug +++ b/src/MAKE/Makefile.serial_debug @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = g++ CCFLAGS = -O0 -g -Wall -W -fstrict-aliasing +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = g++ LINKFLAGS = -O0 -g LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rcsv -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -34,8 +38,8 @@ LMP_INC = -DLAMMPS_GZIP # LIB = name of MPI library MPI_INC = -I../STUBS -MPI_PATH = -MPI_LIB = ../STUBS/libmpi.a +MPI_PATH = -L../STUBS +MPI_LIB = -lmpi_stubs # FFT library, OPTIONAL # see discussion in doc/Section_start.html#2_2 (step 6) @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.sgi b/src/MAKE/Makefile.sgi index 8b928717af..4c17975de6 100644 --- a/src/MAKE/Makefile.sgi +++ b/src/MAKE/Makefile.sgi @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = CC CCFLAGS = -64 -O -mp +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = CC LINKFLAGS = -O LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.solaris b/src/MAKE/Makefile.solaris index bdee7ad9fc..c24455fa8c 100644 --- a/src/MAKE/Makefile.solaris +++ b/src/MAKE/Makefile.solaris @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = c++ CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = c++ LINKFLAGS = -O LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -34,8 +38,8 @@ LMP_INC = -DLAMMPS_GZIP # LIB = name of MPI library MPI_INC 
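Note the companion change in the STUBS-based machine Makefiles (mac, serial, serial_debug, solaris, tesla): the MPI stubs are now pulled in with -L../STUBS -lmpi_stubs instead of naming the archive ../STUBS/libmpi.a directly. With -l the linker can resolve either the static libmpi_stubs.a or, for a shared LAMMPS build, the libmpi_stubs.so produced by the STUBS Makefile changes further below; a hard-coded .a path can only ever yield the static archive. A sketch of the two equivalent serial link lines, assuming the g++ settings of Makefile.serial:

  g++ -O -g main.o ... ../STUBS/libmpi.a -o ../lmp_serial        # old form, archive named directly
  g++ -O -g main.o ... -L../STUBS -lmpi_stubs -o ../lmp_serial   # new form, resolved by the linker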
= -I../STUBS -MPI_PATH = -MPI_LIB = ../STUBS/libmpi.a +MPI_PATH = -L../STUBS +MPI_LIB = -lmpi_stubs # FFT library, OPTIONAL # see discussion in doc/Section_start.html#2_2 (step 6) @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.spirit b/src/MAKE/Makefile.spirit index 0c89ed3e71..c6272cd982 100644 --- a/src/MAKE/Makefile.spirit +++ b/src/MAKE/Makefile.spirit @@ -14,13 +14,17 @@ SHELL = /bin/sh CC = mpicxx CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpicxx LINKFLAGS = -O LIB = -lm +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -84,15 +88,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.storm b/src/MAKE/Makefile.storm index 3f18ac6124..485b1fde2e 100644 --- a/src/MAKE/Makefile.storm +++ b/src/MAKE/Makefile.storm @@ -9,13 +9,17 @@ SHELL = /bin/sh CC = CC CCFLAGS = -fastsse +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = CC LINKFLAGS = -O LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -77,11 +81,15 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules .cpp.o: diff --git a/src/MAKE/Makefile.tacc b/src/MAKE/Makefile.tacc index 512c44988e..86674bbaad 100644 --- a/src/MAKE/Makefile.tacc +++ b/src/MAKE/Makefile.tacc @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = mpiCC CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpiCC LINKFLAGS = -O LIB = +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -79,15 +83,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.tbird b/src/MAKE/Makefile.tbird index 3ee125d876..a2836ecd50 100644 --- 
a/src/MAKE/Makefile.tbird +++ b/src/MAKE/Makefile.tbird @@ -27,13 +27,17 @@ SHELL = /bin/sh CC = mpicxx CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpicxx LINKFLAGS = -O LIB = -lstdc++ -lm +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -95,15 +99,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.tesla b/src/MAKE/Makefile.tesla index 1fa76e6b2f..cf8323c2cb 100644 --- a/src/MAKE/Makefile.tesla +++ b/src/MAKE/Makefile.tesla @@ -8,13 +8,17 @@ SHELL = /bin/sh CC = g++ CCFLAGS = -O +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = g++ LINKFLAGS = -O LIB = -lm -lcomplib.sgimath +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -34,8 +38,8 @@ LMP_INC = -DLAMMPS_GZIP # LIB = name of MPI library MPI_INC = -I../STUBS -MPI_PATH = -MPI_LIB = ../STUBS/libmpi.a +MPI_PATH = -L../STUBS +MPI_LIB = -lmpi_stubs # FFT library, OPTIONAL # see discussion in doc/Section_start.html#2_2 (step 6) @@ -76,15 +80,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.tunnison b/src/MAKE/Makefile.tunnison index c8b1e38fe6..e5486b8ac8 100644 --- a/src/MAKE/Makefile.tunnison +++ b/src/MAKE/Makefile.tunnison @@ -18,13 +18,17 @@ SHELL = /bin/sh CC = mpic++ CCFLAGS = -O2 \ -funroll-loops -fstrict-aliasing -W -Wno-uninitialized +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = mpic++ LINKFLAGS = -O LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rcsv -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -86,15 +90,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.xe6 b/src/MAKE/Makefile.xe6 index ee2f866836..690ad0b938 100644 --- a/src/MAKE/Makefile.xe6 +++ b/src/MAKE/Makefile.xe6 @@ -9,13 +9,17 @@ SHELL = /bin/sh CC = CC CCFLAGS = -fastsse +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = CC LINKFLAGS = -O LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # 
--------------------------------------------------------------------- # LAMMPS-specific settings @@ -77,15 +81,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.xt3 b/src/MAKE/Makefile.xt3 index f5cdc9f437..e58a10d2f8 100644 --- a/src/MAKE/Makefile.xt3 +++ b/src/MAKE/Makefile.xt3 @@ -8,15 +8,19 @@ SHELL = /bin/sh CC = CC CCFLAGS = -O3 --target=catamount \ -CCFLAGS = -fomit-frame-pointer -finline-functions \ + -fomit-frame-pointer -finline-functions \ -Wall -Wno-unused -funroll-all-loops +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = CC LINKFLAGS = --target=catamount -O LIB = -lgmalloc +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -78,15 +82,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/MAKE/Makefile.xt5 b/src/MAKE/Makefile.xt5 index f5000f0029..c1fd295bcf 100644 --- a/src/MAKE/Makefile.xt5 +++ b/src/MAKE/Makefile.xt5 @@ -9,13 +9,17 @@ SHELL = /bin/sh CC = CC CCFLAGS = -fastsse +SHFLAGS = -fPIC DEPFLAGS = -M + LINK = CC LINKFLAGS = -O LIB = -lstdc++ +SIZE = size + ARCHIVE = ar ARFLAGS = -rc -SIZE = size +SHLIBFLAGS = -shared # --------------------------------------------------------------------- # LAMMPS-specific settings @@ -77,15 +81,19 @@ $(EXE): $(OBJ) $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) $(SIZE) $(EXE) -# Library target +# Library targets lib: $(OBJ) $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + # Compilation rules %.o:%.cpp - $(CC) $(CCFLAGS) $(EXTRA_INC) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< %.d:%.cpp $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ diff --git a/src/Make.sh b/src/Make.sh index 0dd0a95183..a2c8976b36 100644 --- a/src/Make.sh +++ b/src/Make.sh @@ -64,7 +64,7 @@ if (test $1 = "style") then style READER_CLASS reader_ reader read_dump style REGION_CLASS region_ region domain -# edit Makefile.lib +# edit Makefile.lib, for creating non-shared lib # called by "make makelib" # use current list of *.cpp and *.h files in src dir w/out main.cpp @@ -75,6 +75,17 @@ elif (test $1 = "Makefile.lib") then list=`ls -1 *.cpp | sed s/^main\.cpp// | tr "[:cntrl:]" " "` sed -i -e "s/INC = .*/INC = $list/" Makefile.lib +# edit Makefile.shlib, for creating shared lib +# called by "make makeshlib" +# use current list of *.cpp and *.h files in src dir w/out main.cpp + +elif (test $1 = "Makefile.shlib") then + list=`ls -1 *.cpp | sed s/^main\.cpp// | tr "[:cntrl:]" " "` + sed -i -e "s/SRC = .*/SRC = $list/" Makefile.shlib + 
list=`ls -1 *.h | tr "[:cntrl:]" " "` + sed -i -e "s/INC = .*/INC = $list/" Makefile.shlib + # edit Makefile.list # called by "make makelist" # use current list of *.cpp and *.h files in src dir diff --git a/src/Makefile b/src/Makefile index 4f48daa918..97a506698b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -39,7 +39,8 @@ help: @echo 'make clean-all delete all object files' @echo 'make clean-machine delete object files for one machine' @echo 'make tar lmp_src.tar.gz of src dir and packages' - @echo 'make makelib update Makefile.lib for library build' + @echo 'make makelib update Makefile.lib for static library build' + @echo 'make makeshlib update Makefile.shlib for shared library build' @echo 'make makelist update Makefile.list used by old makes' @echo '' @echo 'make package list available packages' @@ -78,7 +79,7 @@ help: then cp Makefile.package.settings.empty Makefile.package.settings; fi @cp Makefile.package Makefile.package.settings Obj_$@ @cd Obj_$@; \ - $(MAKE) $(MFLAGS) "OBJ = $(OBJ)" "INC = $(INC)" \ + $(MAKE) $(MFLAGS) "OBJ = $(OBJ)" "INC = $(INC)" "SHFLAGS =" \ "EXE = ../$(EXE)" ../$(EXE) @if [ -d Obj_$@ ]; then cd Obj_$@; rm -f $(SRC) $(INC) Makefile*; fi @@ -116,6 +117,10 @@ makelib: @$(SHELL) Make.sh style @$(SHELL) Make.sh Makefile.lib +makeshlib: + @$(SHELL) Make.sh style + @$(SHELL) Make.sh Makefile.shlib + makelist: @$(SHELL) Make.sh style @$(SHELL) Make.sh Makefile.list diff --git a/src/Makefile.lib b/src/Makefile.lib index 4eca6a8706..bee6f7c3b4 100644 --- a/src/Makefile.lib +++ b/src/Makefile.lib @@ -1,4 +1,4 @@ -# LAMMPS library multiple-machine Makefile +# LAMMPS static library multiple-machine Makefile SHELL = /bin/sh @@ -7,9 +7,9 @@ SHELL = /bin/sh ROOT = lmp EXE = lib$(ROOT)_$@.a -SRC = angle.cpp angle_charmm.cpp angle_class2.cpp angle_cosine.cpp angle_cosine_delta.cpp angle_cosine_periodic.cpp angle_cosine_squared.cpp angle_harmonic.cpp angle_hybrid.cpp angle_table.cpp atom.cpp atom_vec.cpp atom_vec_angle.cpp atom_vec_atomic.cpp atom_vec_bond.cpp atom_vec_charge.cpp atom_vec_dipole.cpp atom_vec_ellipsoid.cpp atom_vec_full.cpp atom_vec_hybrid.cpp atom_vec_line.cpp atom_vec_molecular.cpp atom_vec_peri.cpp atom_vec_sphere.cpp atom_vec_tri.cpp balance.cpp bond.cpp bond_class2.cpp bond_fene.cpp bond_fene_expand.cpp bond_harmonic.cpp bond_hybrid.cpp bond_morse.cpp bond_nonlinear.cpp bond_quartic.cpp bond_table.cpp change_box.cpp comm.cpp compute.cpp compute_angle_local.cpp compute_atom_molecule.cpp compute_bond_local.cpp compute_centro_atom.cpp compute_cluster_atom.cpp compute_cna_atom.cpp compute_com.cpp compute_com_molecule.cpp compute_coord_atom.cpp compute_damage_atom.cpp compute_dihedral_local.cpp compute_displace_atom.cpp compute_erotate_asphere.cpp compute_erotate_sphere.cpp compute_event_displace.cpp compute_group_group.cpp compute_gyration.cpp compute_gyration_molecule.cpp compute_heat_flux.cpp compute_improper_local.cpp compute_ke.cpp compute_ke_atom.cpp compute_msd.cpp compute_msd_molecule.cpp compute_pair.cpp compute_pair_local.cpp compute_pe.cpp compute_pe_atom.cpp compute_pressure.cpp compute_property_atom.cpp compute_property_local.cpp compute_property_molecule.cpp compute_rdf.cpp compute_reduce.cpp compute_reduce_region.cpp compute_slice.cpp compute_stress_atom.cpp compute_temp.cpp compute_temp_asphere.cpp compute_temp_com.cpp compute_temp_deform.cpp compute_temp_partial.cpp compute_temp_profile.cpp compute_temp_ramp.cpp compute_temp_region.cpp compute_temp_sphere.cpp compute_ti.cpp create_atoms.cpp create_box.cpp delete_atoms.cpp delete_bonds.cpp 
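With the Make.sh and src/Makefile additions above, building the shared library mirrors the existing static workflow. A hypothetical session from the src directory, using linux as the machine name:

  make makeshlib                 # regenerate the SRC/INC lists in Makefile.shlib
  make -f Makefile.shlib linux   # build PIC objects in Obj_shlib_linux, produce liblmp_linux.so
                                 # and symlink liblmp.so to it

Note also that the normal executable target now passes "SHFLAGS =" down to the machine Makefile, so -fPIC is added only when a library build actually asks for it. The updated Makefile.lib file lists and the new Makefile.shlib continue below.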
dihedral.cpp dihedral_charmm.cpp dihedral_class2.cpp dihedral_harmonic.cpp dihedral_helix.cpp dihedral_hybrid.cpp dihedral_multi_harmonic.cpp dihedral_opls.cpp displace_atoms.cpp domain.cpp dump.cpp dump_atom.cpp dump_cfg.cpp dump_custom.cpp dump_dcd.cpp dump_image.cpp dump_local.cpp dump_xtc.cpp dump_xyz.cpp error.cpp ewald.cpp fft3d.cpp fft3d_wrap.cpp finish.cpp fix.cpp fix_adapt.cpp fix_addforce.cpp fix_append_atoms.cpp fix_ave_atom.cpp fix_ave_correlate.cpp fix_ave_histo.cpp fix_ave_spatial.cpp fix_ave_time.cpp fix_aveforce.cpp fix_balance.cpp fix_bond_break.cpp fix_bond_create.cpp fix_bond_swap.cpp fix_box_relax.cpp fix_deform.cpp fix_deposit.cpp fix_drag.cpp fix_dt_reset.cpp fix_efield.cpp fix_enforce2d.cpp fix_evaporate.cpp fix_event.cpp fix_event_prd.cpp fix_event_tad.cpp fix_external.cpp fix_freeze.cpp fix_gcmc.cpp fix_gravity.cpp fix_heat.cpp fix_indent.cpp fix_langevin.cpp fix_lineforce.cpp fix_minimize.cpp fix_momentum.cpp fix_move.cpp fix_msst.cpp fix_neb.cpp fix_nh.cpp fix_nh_asphere.cpp fix_nh_sphere.cpp fix_nph.cpp fix_nph_asphere.cpp fix_nph_sphere.cpp fix_nphug.cpp fix_npt.cpp fix_npt_asphere.cpp fix_npt_sphere.cpp fix_nve.cpp fix_nve_asphere.cpp fix_nve_asphere_noforce.cpp fix_nve_limit.cpp fix_nve_line.cpp fix_nve_noforce.cpp fix_nve_sphere.cpp fix_nve_tri.cpp fix_nvt.cpp fix_nvt_asphere.cpp fix_nvt_sllod.cpp fix_nvt_sphere.cpp fix_orient_fcc.cpp fix_peri_neigh.cpp fix_planeforce.cpp fix_pour.cpp fix_press_berendsen.cpp fix_print.cpp fix_qeq_comb.cpp fix_read_restart.cpp fix_recenter.cpp fix_respa.cpp fix_restrain.cpp fix_rigid.cpp fix_rigid_nve.cpp fix_rigid_nvt.cpp fix_setforce.cpp fix_shake.cpp fix_shear_history.cpp fix_spring.cpp fix_spring_rg.cpp fix_spring_self.cpp fix_srd.cpp fix_store_force.cpp fix_store_state.cpp fix_temp_berendsen.cpp fix_temp_rescale.cpp fix_thermal_conductivity.cpp fix_tmd.cpp fix_ttm.cpp fix_viscosity.cpp fix_viscous.cpp fix_wall.cpp fix_wall_colloid.cpp fix_wall_gran.cpp fix_wall_harmonic.cpp fix_wall_lj126.cpp fix_wall_lj93.cpp fix_wall_piston.cpp fix_wall_reflect.cpp fix_wall_region.cpp fix_wall_srd.cpp force.cpp group.cpp image.cpp improper.cpp improper_class2.cpp improper_cvff.cpp improper_harmonic.cpp improper_hybrid.cpp improper_umbrella.cpp input.cpp integrate.cpp irregular.cpp kspace.cpp lammps.cpp lattice.cpp library.cpp math_extra.cpp memory.cpp min.cpp min_cg.cpp min_fire.cpp min_hftn.cpp min_linesearch.cpp min_quickmin.cpp min_sd.cpp minimize.cpp modify.cpp neb.cpp neigh_bond.cpp neigh_derive.cpp neigh_full.cpp neigh_gran.cpp neigh_half_bin.cpp neigh_half_multi.cpp neigh_half_nsq.cpp neigh_list.cpp neigh_request.cpp neigh_respa.cpp neigh_stencil.cpp neighbor.cpp output.cpp pair.cpp pair_adp.cpp pair_aeam.cpp pair_airebo.cpp pair_beck.cpp pair_born.cpp pair_born_coul_long.cpp pair_born_coul_wolf.cpp pair_brownian.cpp pair_brownian_poly.cpp pair_buck.cpp pair_buck_coul_cut.cpp pair_buck_coul_long.cpp pair_colloid.cpp pair_comb.cpp pair_coul_cut.cpp pair_coul_debye.cpp pair_coul_long.cpp pair_coul_wolf.cpp pair_dipole_cut.cpp pair_dpd.cpp pair_dpd_tstat.cpp pair_dsmc.cpp pair_eam.cpp pair_eam_alloy.cpp pair_eam_alloy_opt.cpp pair_eam_fs.cpp pair_eam_fs_opt.cpp pair_eam_opt.cpp pair_eim.cpp pair_gauss.cpp pair_gayberne.cpp pair_gran_hertz_history.cpp pair_gran_hooke.cpp pair_gran_hooke_history.cpp pair_hbond_dreiding_lj.cpp pair_hbond_dreiding_morse.cpp pair_hybrid.cpp pair_hybrid_overlay.cpp pair_lcbop.cpp pair_line_lj.cpp pair_lj96_cut.cpp pair_lj_charmm_coul_charmm.cpp pair_lj_charmm_coul_charmm_implicit.cpp 
pair_lj_charmm_coul_long.cpp pair_lj_charmm_coul_long_opt.cpp pair_lj_class2.cpp pair_lj_class2_coul_cut.cpp pair_lj_class2_coul_long.cpp pair_lj_cubic.cpp pair_lj_cut.cpp pair_lj_cut_coul_cut.cpp pair_lj_cut_coul_debye.cpp pair_lj_cut_coul_long.cpp pair_lj_cut_coul_long_opt.cpp pair_lj_cut_coul_long_tip4p.cpp pair_lj_cut_coul_long_tip4p_opt.cpp pair_lj_cut_opt.cpp pair_lj_expand.cpp pair_lj_gromacs.cpp pair_lj_gromacs_coul_gromacs.cpp pair_lj_smooth.cpp pair_lj_smooth_linear.cpp pair_lubricate.cpp pair_lubricateU.cpp pair_lubricateU_poly.cpp pair_lubricate_poly.cpp pair_meam.cpp pair_morse.cpp pair_morse_opt.cpp pair_peri_lps.cpp pair_peri_pmb.cpp pair_rebo.cpp pair_resquared.cpp pair_soft.cpp pair_sw.cpp pair_table.cpp pair_tersoff.cpp pair_tersoff_zbl.cpp pair_tri_lj.cpp pair_yukawa.cpp pair_yukawa_colloid.cpp pppm.cpp pppm_cg.cpp pppm_tip4p.cpp prd.cpp procmap.cpp random_mars.cpp random_park.cpp read_data.cpp read_restart.cpp region.cpp region_block.cpp region_cone.cpp region_cylinder.cpp region_intersect.cpp region_plane.cpp region_prism.cpp region_sphere.cpp region_union.cpp remap.cpp remap_wrap.cpp replicate.cpp respa.cpp run.cpp set.cpp special.cpp tad.cpp temper.cpp thermo.cpp timer.cpp universe.cpp update.cpp variable.cpp velocity.cpp verlet.cpp verlet_split.cpp write_restart.cpp xdr_compat.cpp +SRC = angle.cpp angle_charmm.cpp angle_class2.cpp angle_cosine.cpp angle_cosine_delta.cpp angle_cosine_periodic.cpp angle_cosine_squared.cpp angle_harmonic.cpp angle_hybrid.cpp angle_table.cpp atom.cpp atom_map.cpp atom_vec.cpp atom_vec_angle.cpp atom_vec_atomic.cpp atom_vec_bond.cpp atom_vec_charge.cpp atom_vec_dipole.cpp atom_vec_ellipsoid.cpp atom_vec_full.cpp atom_vec_hybrid.cpp atom_vec_line.cpp atom_vec_molecular.cpp atom_vec_peri.cpp atom_vec_sphere.cpp atom_vec_tri.cpp balance.cpp bond.cpp bond_class2.cpp bond_fene.cpp bond_fene_expand.cpp bond_harmonic.cpp bond_hybrid.cpp bond_morse.cpp bond_nonlinear.cpp bond_quartic.cpp bond_table.cpp change_box.cpp comm.cpp compute.cpp compute_angle_local.cpp compute_atom_molecule.cpp compute_bond_local.cpp compute_centro_atom.cpp compute_cluster_atom.cpp compute_cna_atom.cpp compute_com.cpp compute_com_molecule.cpp compute_contact_atom.cpp compute_coord_atom.cpp compute_damage_atom.cpp compute_dihedral_local.cpp compute_displace_atom.cpp compute_erotate_asphere.cpp compute_erotate_sphere.cpp compute_erotate_sphere_atom.cpp compute_event_displace.cpp compute_group_group.cpp compute_gyration.cpp compute_gyration_molecule.cpp compute_heat_flux.cpp compute_improper_local.cpp compute_ke.cpp compute_ke_atom.cpp compute_msd.cpp compute_msd_molecule.cpp compute_pair.cpp compute_pair_local.cpp compute_pe.cpp compute_pe_atom.cpp compute_pressure.cpp compute_property_atom.cpp compute_property_local.cpp compute_property_molecule.cpp compute_rdf.cpp compute_reduce.cpp compute_reduce_region.cpp compute_slice.cpp compute_stress_atom.cpp compute_temp.cpp compute_temp_asphere.cpp compute_temp_com.cpp compute_temp_deform.cpp compute_temp_partial.cpp compute_temp_profile.cpp compute_temp_ramp.cpp compute_temp_region.cpp compute_temp_sphere.cpp compute_ti.cpp create_atoms.cpp create_box.cpp delete_atoms.cpp delete_bonds.cpp dihedral.cpp dihedral_charmm.cpp dihedral_class2.cpp dihedral_harmonic.cpp dihedral_helix.cpp dihedral_hybrid.cpp dihedral_multi_harmonic.cpp dihedral_opls.cpp displace_atoms.cpp domain.cpp dump.cpp dump_atom.cpp dump_cfg.cpp dump_custom.cpp dump_dcd.cpp dump_image.cpp dump_local.cpp dump_xtc.cpp dump_xyz.cpp error.cpp ewald.cpp fft3d.cpp 
fft3d_wrap.cpp finish.cpp fix.cpp fix_adapt.cpp fix_addforce.cpp fix_append_atoms.cpp fix_ave_atom.cpp fix_ave_correlate.cpp fix_ave_histo.cpp fix_ave_spatial.cpp fix_ave_time.cpp fix_aveforce.cpp fix_balance.cpp fix_bond_break.cpp fix_bond_create.cpp fix_bond_swap.cpp fix_box_relax.cpp fix_deform.cpp fix_deposit.cpp fix_drag.cpp fix_dt_reset.cpp fix_efield.cpp fix_enforce2d.cpp fix_evaporate.cpp fix_event.cpp fix_event_prd.cpp fix_event_tad.cpp fix_external.cpp fix_freeze.cpp fix_gcmc.cpp fix_gravity.cpp fix_heat.cpp fix_indent.cpp fix_langevin.cpp fix_lineforce.cpp fix_minimize.cpp fix_momentum.cpp fix_move.cpp fix_msst.cpp fix_neb.cpp fix_nh.cpp fix_nh_asphere.cpp fix_nh_sphere.cpp fix_nph.cpp fix_nph_asphere.cpp fix_nph_sphere.cpp fix_nphug.cpp fix_npt.cpp fix_npt_asphere.cpp fix_npt_sphere.cpp fix_nve.cpp fix_nve_asphere.cpp fix_nve_asphere_noforce.cpp fix_nve_limit.cpp fix_nve_line.cpp fix_nve_noforce.cpp fix_nve_sphere.cpp fix_nve_tri.cpp fix_nvt.cpp fix_nvt_asphere.cpp fix_nvt_sllod.cpp fix_nvt_sphere.cpp fix_orient_fcc.cpp fix_peri_neigh.cpp fix_planeforce.cpp fix_pour.cpp fix_press_berendsen.cpp fix_print.cpp fix_qeq_comb.cpp fix_read_restart.cpp fix_recenter.cpp fix_respa.cpp fix_restrain.cpp fix_rigid.cpp fix_rigid_nve.cpp fix_rigid_nvt.cpp fix_setforce.cpp fix_shake.cpp fix_shear_history.cpp fix_spring.cpp fix_spring_rg.cpp fix_spring_self.cpp fix_srd.cpp fix_store_force.cpp fix_store_state.cpp fix_temp_berendsen.cpp fix_temp_rescale.cpp fix_thermal_conductivity.cpp fix_tmd.cpp fix_ttm.cpp fix_viscosity.cpp fix_viscous.cpp fix_wall.cpp fix_wall_colloid.cpp fix_wall_gran.cpp fix_wall_harmonic.cpp fix_wall_lj126.cpp fix_wall_lj93.cpp fix_wall_piston.cpp fix_wall_reflect.cpp fix_wall_region.cpp fix_wall_srd.cpp force.cpp group.cpp image.cpp improper.cpp improper_class2.cpp improper_cvff.cpp improper_harmonic.cpp improper_hybrid.cpp improper_umbrella.cpp input.cpp integrate.cpp irregular.cpp kspace.cpp lammps.cpp lattice.cpp library.cpp math_extra.cpp memory.cpp min.cpp min_cg.cpp min_fire.cpp min_hftn.cpp min_linesearch.cpp min_quickmin.cpp min_sd.cpp minimize.cpp modify.cpp neb.cpp neigh_bond.cpp neigh_derive.cpp neigh_full.cpp neigh_gran.cpp neigh_half_bin.cpp neigh_half_multi.cpp neigh_half_nsq.cpp neigh_list.cpp neigh_request.cpp neigh_respa.cpp neigh_stencil.cpp neighbor.cpp output.cpp pair.cpp pair_adp.cpp pair_airebo.cpp pair_beck.cpp pair_bop.cpp pair_born.cpp pair_born_coul_long.cpp pair_born_coul_wolf.cpp pair_brownian.cpp pair_brownian_poly.cpp pair_buck.cpp pair_buck_coul_cut.cpp pair_buck_coul_long.cpp pair_colloid.cpp pair_comb.cpp pair_coul_cut.cpp pair_coul_debye.cpp pair_coul_long.cpp pair_coul_wolf.cpp pair_dipole_cut.cpp pair_dpd.cpp pair_dpd_tstat.cpp pair_dsmc.cpp pair_eam.cpp pair_eam_alloy.cpp pair_eam_alloy_opt.cpp pair_eam_fs.cpp pair_eam_fs_opt.cpp pair_eam_opt.cpp pair_eim.cpp pair_gauss.cpp pair_gayberne.cpp pair_gran_hertz_history.cpp pair_gran_hooke.cpp pair_gran_hooke_history.cpp pair_hbond_dreiding_lj.cpp pair_hbond_dreiding_morse.cpp pair_hybrid.cpp pair_hybrid_overlay.cpp pair_lcbop.cpp pair_line_lj.cpp pair_lj96_cut.cpp pair_lj_charmm_coul_charmm.cpp pair_lj_charmm_coul_charmm_implicit.cpp pair_lj_charmm_coul_long.cpp pair_lj_charmm_coul_long_opt.cpp pair_lj_class2.cpp pair_lj_class2_coul_cut.cpp pair_lj_class2_coul_long.cpp pair_lj_cubic.cpp pair_lj_cut.cpp pair_lj_cut_coul_cut.cpp pair_lj_cut_coul_debye.cpp pair_lj_cut_coul_long.cpp pair_lj_cut_coul_long_opt.cpp pair_lj_cut_coul_long_tip4p.cpp pair_lj_cut_coul_long_tip4p_opt.cpp 
pair_lj_cut_opt.cpp pair_lj_expand.cpp pair_lj_gromacs.cpp pair_lj_gromacs_coul_gromacs.cpp pair_lj_smooth.cpp pair_lj_smooth_linear.cpp pair_lubricate.cpp pair_lubricateU.cpp pair_lubricateU_poly.cpp pair_lubricate_poly.cpp pair_morse.cpp pair_morse_opt.cpp pair_peri_lps.cpp pair_peri_pmb.cpp pair_rebo.cpp pair_resquared.cpp pair_soft.cpp pair_sw.cpp pair_table.cpp pair_tersoff.cpp pair_tersoff_zbl.cpp pair_tri_lj.cpp pair_yukawa.cpp pair_yukawa_colloid.cpp pppm.cpp pppm_cg.cpp pppm_old.cpp pppm_tip4p.cpp prd.cpp procmap.cpp random_mars.cpp random_park.cpp read_data.cpp read_dump.cpp read_restart.cpp reader.cpp reader_native.cpp reader_xyz.cpp region.cpp region_block.cpp region_cone.cpp region_cylinder.cpp region_intersect.cpp region_plane.cpp region_prism.cpp region_sphere.cpp region_union.cpp remap.cpp remap_wrap.cpp replicate.cpp rerun.cpp respa.cpp run.cpp set.cpp special.cpp tad.cpp temper.cpp thermo.cpp timer.cpp universe.cpp update.cpp variable.cpp velocity.cpp verlet.cpp verlet_split.cpp write_restart.cpp xdr_compat.cpp -INC = accelerator_cuda.h accelerator_omp.h angle.h angle_charmm.h angle_class2.h angle_cosine.h angle_cosine_delta.h angle_cosine_periodic.h angle_cosine_squared.h angle_harmonic.h angle_hybrid.h angle_table.h atom.h atom_vec.h atom_vec_angle.h atom_vec_atomic.h atom_vec_bond.h atom_vec_charge.h atom_vec_dipole.h atom_vec_ellipsoid.h atom_vec_full.h atom_vec_hybrid.h atom_vec_line.h atom_vec_molecular.h atom_vec_peri.h atom_vec_sphere.h atom_vec_tri.h balance.h bond.h bond_class2.h bond_fene.h bond_fene_expand.h bond_harmonic.h bond_hybrid.h bond_morse.h bond_nonlinear.h bond_quartic.h bond_table.h change_box.h comm.h compute.h compute_angle_local.h compute_atom_molecule.h compute_bond_local.h compute_centro_atom.h compute_cluster_atom.h compute_cna_atom.h compute_com.h compute_com_molecule.h compute_coord_atom.h compute_damage_atom.h compute_dihedral_local.h compute_displace_atom.h compute_erotate_asphere.h compute_erotate_sphere.h compute_event_displace.h compute_group_group.h compute_gyration.h compute_gyration_molecule.h compute_heat_flux.h compute_improper_local.h compute_ke.h compute_ke_atom.h compute_msd.h compute_msd_molecule.h compute_pair.h compute_pair_local.h compute_pe.h compute_pe_atom.h compute_pressure.h compute_property_atom.h compute_property_local.h compute_property_molecule.h compute_rdf.h compute_reduce.h compute_reduce_region.h compute_slice.h compute_stress_atom.h compute_temp.h compute_temp_asphere.h compute_temp_com.h compute_temp_deform.h compute_temp_partial.h compute_temp_profile.h compute_temp_ramp.h compute_temp_region.h compute_temp_sphere.h compute_ti.h create_atoms.h create_box.h delete_atoms.h delete_bonds.h dihedral.h dihedral_charmm.h dihedral_class2.h dihedral_harmonic.h dihedral_helix.h dihedral_hybrid.h dihedral_multi_harmonic.h dihedral_opls.h displace_atoms.h domain.h dump.h dump_atom.h dump_cfg.h dump_custom.h dump_dcd.h dump_image.h dump_local.h dump_xtc.h dump_xyz.h error.h ewald.h fft3d.h fft3d_wrap.h finish.h fix.h fix_adapt.h fix_addforce.h fix_append_atoms.h fix_ave_atom.h fix_ave_correlate.h fix_ave_histo.h fix_ave_spatial.h fix_ave_time.h fix_aveforce.h fix_balance.h fix_bond_break.h fix_bond_create.h fix_bond_swap.h fix_box_relax.h fix_deform.h fix_deposit.h fix_drag.h fix_dt_reset.h fix_efield.h fix_enforce2d.h fix_evaporate.h fix_event.h fix_event_prd.h fix_event_tad.h fix_external.h fix_freeze.h fix_gcmc.h fix_gravity.h fix_heat.h fix_indent.h fix_langevin.h fix_lineforce.h fix_minimize.h fix_momentum.h 
fix_move.h fix_msst.h fix_neb.h fix_nh.h fix_nh_asphere.h fix_nh_sphere.h fix_nph.h fix_nph_asphere.h fix_nph_sphere.h fix_nphug.h fix_npt.h fix_npt_asphere.h fix_npt_sphere.h fix_nve.h fix_nve_asphere.h fix_nve_asphere_noforce.h fix_nve_limit.h fix_nve_line.h fix_nve_noforce.h fix_nve_sphere.h fix_nve_tri.h fix_nvt.h fix_nvt_asphere.h fix_nvt_sllod.h fix_nvt_sphere.h fix_orient_fcc.h fix_peri_neigh.h fix_planeforce.h fix_pour.h fix_press_berendsen.h fix_print.h fix_qeq_comb.h fix_read_restart.h fix_recenter.h fix_respa.h fix_restrain.h fix_rigid.h fix_rigid_nve.h fix_rigid_nvt.h fix_setforce.h fix_shake.h fix_shear_history.h fix_spring.h fix_spring_rg.h fix_spring_self.h fix_srd.h fix_store_force.h fix_store_state.h fix_temp_berendsen.h fix_temp_rescale.h fix_thermal_conductivity.h fix_tmd.h fix_ttm.h fix_viscosity.h fix_viscous.h fix_wall.h fix_wall_colloid.h fix_wall_gran.h fix_wall_harmonic.h fix_wall_lj126.h fix_wall_lj93.h fix_wall_piston.h fix_wall_reflect.h fix_wall_region.h fix_wall_srd.h force.h group.h image.h improper.h improper_class2.h improper_cvff.h improper_harmonic.h improper_hybrid.h improper_umbrella.h input.h integrate.h irregular.h kissfft.h kspace.h lammps.h lattice.h library.h lmptype.h lmpwindows.h math_const.h math_extra.h memory.h min.h min_cg.h min_fire.h min_hftn.h min_linesearch.h min_quickmin.h min_sd.h minimize.h modify.h neb.h neigh_bond.h neigh_derive.h neigh_full.h neigh_gran.h neigh_half_bin.h neigh_half_multi.h neigh_half_nsq.h neigh_list.h neigh_request.h neigh_respa.h neighbor.h output.h pack.h pair.h pair_adp.h pair_aeam.h pair_airebo.h pair_beck.h pair_born.h pair_born_coul_long.h pair_born_coul_wolf.h pair_brownian.h pair_brownian_poly.h pair_buck.h pair_buck_coul_cut.h pair_buck_coul_long.h pair_colloid.h pair_comb.h pair_coul_cut.h pair_coul_debye.h pair_coul_long.h pair_coul_wolf.h pair_dipole_cut.h pair_dpd.h pair_dpd_tstat.h pair_dsmc.h pair_eam.h pair_eam_alloy.h pair_eam_alloy_opt.h pair_eam_fs.h pair_eam_fs_opt.h pair_eam_opt.h pair_eim.h pair_gauss.h pair_gayberne.h pair_gran_hertz_history.h pair_gran_hooke.h pair_gran_hooke_history.h pair_hbond_dreiding_lj.h pair_hbond_dreiding_morse.h pair_hybrid.h pair_hybrid_overlay.h pair_lcbop.h pair_line_lj.h pair_lj96_cut.h pair_lj_charmm_coul_charmm.h pair_lj_charmm_coul_charmm_implicit.h pair_lj_charmm_coul_long.h pair_lj_charmm_coul_long_opt.h pair_lj_class2.h pair_lj_class2_coul_cut.h pair_lj_class2_coul_long.h pair_lj_cubic.h pair_lj_cut.h pair_lj_cut_coul_cut.h pair_lj_cut_coul_debye.h pair_lj_cut_coul_long.h pair_lj_cut_coul_long_opt.h pair_lj_cut_coul_long_tip4p.h pair_lj_cut_coul_long_tip4p_opt.h pair_lj_cut_opt.h pair_lj_expand.h pair_lj_gromacs.h pair_lj_gromacs_coul_gromacs.h pair_lj_smooth.h pair_lj_smooth_linear.h pair_lubricate.h pair_lubricateU.h pair_lubricateU_poly.h pair_lubricate_poly.h pair_meam.h pair_morse.h pair_morse_opt.h pair_peri_lps.h pair_peri_pmb.h pair_rebo.h pair_resquared.h pair_soft.h pair_sw.h pair_table.h pair_tersoff.h pair_tersoff_zbl.h pair_tri_lj.h pair_yukawa.h pair_yukawa_colloid.h pointers.h pppm.h pppm_cg.h pppm_tip4p.h prd.h procmap.h random_mars.h random_park.h read_data.h read_restart.h region.h region_block.h region_cone.h region_cylinder.h region_intersect.h region_plane.h region_prism.h region_sphere.h region_union.h remap.h remap_wrap.h replicate.h respa.h run.h set.h special.h style_angle.h style_atom.h style_bond.h style_command.h style_compute.h style_dihedral.h style_dump.h style_fix.h style_improper.h style_integrate.h style_kspace.h 
style_minimize.h style_pair.h style_region.h suffix.h tad.h temper.h thermo.h timer.h universe.h update.h variable.h velocity.h verlet.h verlet_split.h version.h write_restart.h xdr_compat.h +INC = accelerator_cuda.h accelerator_omp.h angle.h angle_charmm.h angle_class2.h angle_cosine.h angle_cosine_delta.h angle_cosine_periodic.h angle_cosine_squared.h angle_harmonic.h angle_hybrid.h angle_table.h atom.h atom_map.h atom_vec.h atom_vec_angle.h atom_vec_atomic.h atom_vec_bond.h atom_vec_charge.h atom_vec_dipole.h atom_vec_ellipsoid.h atom_vec_full.h atom_vec_hybrid.h atom_vec_line.h atom_vec_molecular.h atom_vec_peri.h atom_vec_sphere.h atom_vec_tri.h balance.h bond.h bond_class2.h bond_fene.h bond_fene_expand.h bond_harmonic.h bond_hybrid.h bond_morse.h bond_nonlinear.h bond_quartic.h bond_table.h change_box.h comm.h compute.h compute_angle_local.h compute_atom_molecule.h compute_bond_local.h compute_centro_atom.h compute_cluster_atom.h compute_cna_atom.h compute_com.h compute_com_molecule.h compute_contact_atom.h compute_coord_atom.h compute_damage_atom.h compute_dihedral_local.h compute_displace_atom.h compute_erotate_asphere.h compute_erotate_sphere.h compute_erotate_sphere_atom.h compute_event_displace.h compute_group_group.h compute_gyration.h compute_gyration_molecule.h compute_heat_flux.h compute_improper_local.h compute_ke.h compute_ke_atom.h compute_msd.h compute_msd_molecule.h compute_pair.h compute_pair_local.h compute_pe.h compute_pe_atom.h compute_pressure.h compute_property_atom.h compute_property_local.h compute_property_molecule.h compute_rdf.h compute_reduce.h compute_reduce_region.h compute_slice.h compute_stress_atom.h compute_temp.h compute_temp_asphere.h compute_temp_com.h compute_temp_deform.h compute_temp_partial.h compute_temp_profile.h compute_temp_ramp.h compute_temp_region.h compute_temp_sphere.h compute_ti.h create_atoms.h create_box.h delete_atoms.h delete_bonds.h dihedral.h dihedral_charmm.h dihedral_class2.h dihedral_harmonic.h dihedral_helix.h dihedral_hybrid.h dihedral_multi_harmonic.h dihedral_opls.h displace_atoms.h domain.h dump.h dump_atom.h dump_cfg.h dump_custom.h dump_dcd.h dump_image.h dump_local.h dump_xtc.h dump_xyz.h error.h ewald.h fft3d.h fft3d_wrap.h finish.h fix.h fix_adapt.h fix_addforce.h fix_append_atoms.h fix_ave_atom.h fix_ave_correlate.h fix_ave_histo.h fix_ave_spatial.h fix_ave_time.h fix_aveforce.h fix_balance.h fix_bond_break.h fix_bond_create.h fix_bond_swap.h fix_box_relax.h fix_deform.h fix_deposit.h fix_drag.h fix_dt_reset.h fix_efield.h fix_enforce2d.h fix_evaporate.h fix_event.h fix_event_prd.h fix_event_tad.h fix_external.h fix_freeze.h fix_gcmc.h fix_gravity.h fix_heat.h fix_indent.h fix_langevin.h fix_lineforce.h fix_minimize.h fix_momentum.h fix_move.h fix_msst.h fix_neb.h fix_nh.h fix_nh_asphere.h fix_nh_sphere.h fix_nph.h fix_nph_asphere.h fix_nph_sphere.h fix_nphug.h fix_npt.h fix_npt_asphere.h fix_npt_sphere.h fix_nve.h fix_nve_asphere.h fix_nve_asphere_noforce.h fix_nve_limit.h fix_nve_line.h fix_nve_noforce.h fix_nve_sphere.h fix_nve_tri.h fix_nvt.h fix_nvt_asphere.h fix_nvt_sllod.h fix_nvt_sphere.h fix_orient_fcc.h fix_peri_neigh.h fix_planeforce.h fix_pour.h fix_press_berendsen.h fix_print.h fix_qeq_comb.h fix_read_restart.h fix_recenter.h fix_respa.h fix_restrain.h fix_rigid.h fix_rigid_nve.h fix_rigid_nvt.h fix_setforce.h fix_shake.h fix_shear_history.h fix_spring.h fix_spring_rg.h fix_spring_self.h fix_srd.h fix_store_force.h fix_store_state.h fix_temp_berendsen.h fix_temp_rescale.h fix_thermal_conductivity.h 
fix_tmd.h fix_ttm.h fix_viscosity.h fix_viscous.h fix_wall.h fix_wall_colloid.h fix_wall_gran.h fix_wall_harmonic.h fix_wall_lj126.h fix_wall_lj93.h fix_wall_piston.h fix_wall_reflect.h fix_wall_region.h fix_wall_srd.h force.h group.h image.h improper.h improper_class2.h improper_cvff.h improper_harmonic.h improper_hybrid.h improper_umbrella.h input.h integrate.h irregular.h kissfft.h kspace.h lammps.h lattice.h library.h lmptype.h lmpwindows.h math_const.h math_extra.h memory.h min.h min_cg.h min_fire.h min_hftn.h min_linesearch.h min_quickmin.h min_sd.h minimize.h modify.h neb.h neigh_bond.h neigh_derive.h neigh_full.h neigh_gran.h neigh_half_bin.h neigh_half_multi.h neigh_half_nsq.h neigh_list.h neigh_request.h neigh_respa.h neighbor.h output.h pack.h pair.h pair_adp.h pair_airebo.h pair_beck.h pair_bop.h pair_born.h pair_born_coul_long.h pair_born_coul_wolf.h pair_brownian.h pair_brownian_poly.h pair_buck.h pair_buck_coul_cut.h pair_buck_coul_long.h pair_colloid.h pair_comb.h pair_coul_cut.h pair_coul_debye.h pair_coul_long.h pair_coul_wolf.h pair_dipole_cut.h pair_dpd.h pair_dpd_tstat.h pair_dsmc.h pair_eam.h pair_eam_alloy.h pair_eam_alloy_opt.h pair_eam_fs.h pair_eam_fs_opt.h pair_eam_opt.h pair_eim.h pair_gauss.h pair_gayberne.h pair_gran_hertz_history.h pair_gran_hooke.h pair_gran_hooke_history.h pair_hbond_dreiding_lj.h pair_hbond_dreiding_morse.h pair_hybrid.h pair_hybrid_overlay.h pair_lcbop.h pair_line_lj.h pair_lj96_cut.h pair_lj_charmm_coul_charmm.h pair_lj_charmm_coul_charmm_implicit.h pair_lj_charmm_coul_long.h pair_lj_charmm_coul_long_opt.h pair_lj_class2.h pair_lj_class2_coul_cut.h pair_lj_class2_coul_long.h pair_lj_cubic.h pair_lj_cut.h pair_lj_cut_coul_cut.h pair_lj_cut_coul_debye.h pair_lj_cut_coul_long.h pair_lj_cut_coul_long_opt.h pair_lj_cut_coul_long_tip4p.h pair_lj_cut_coul_long_tip4p_opt.h pair_lj_cut_opt.h pair_lj_expand.h pair_lj_gromacs.h pair_lj_gromacs_coul_gromacs.h pair_lj_smooth.h pair_lj_smooth_linear.h pair_lubricate.h pair_lubricateU.h pair_lubricateU_poly.h pair_lubricate_poly.h pair_morse.h pair_morse_opt.h pair_peri_lps.h pair_peri_pmb.h pair_rebo.h pair_resquared.h pair_soft.h pair_sw.h pair_table.h pair_tersoff.h pair_tersoff_zbl.h pair_tri_lj.h pair_yukawa.h pair_yukawa_colloid.h pointers.h pppm.h pppm_cg.h pppm_old.h pppm_tip4p.h prd.h procmap.h random_mars.h random_park.h read_data.h read_dump.h read_restart.h reader.h reader_native.h reader_xyz.h region.h region_block.h region_cone.h region_cylinder.h region_intersect.h region_plane.h region_prism.h region_sphere.h region_union.h remap.h remap_wrap.h replicate.h rerun.h respa.h run.h set.h special.h style_angle.h style_atom.h style_bond.h style_command.h style_compute.h style_dihedral.h style_dump.h style_fix.h style_improper.h style_integrate.h style_kspace.h style_minimize.h style_pair.h style_reader.h style_region.h suffix.h tad.h temper.h thermo.h timer.h universe.h update.h variable.h velocity.h verlet.h verlet_split.h version.h write_restart.h xdr_compat.h OBJ = $(SRC:.cpp=.o) @@ -35,5 +35,6 @@ clean: then cp Makefile.package.settings.empty Makefile.package.settings; fi @cp Makefile.package Makefile.package.settings Obj_$@ @cd Obj_$@; \ - $(MAKE) $(MFLAGS) "OBJ = $(OBJ)" "INC = $(INC)" "EXE = ../$(EXE)" lib + $(MAKE) $(MFLAGS) "OBJ = $(OBJ)" "INC = $(INC)" "SHFLAGS =" \ + "EXE = ../$(EXE)" lib @if [ -d Obj_$@ ]; then cd Obj_$@; rm -f $(SRC) $(INC) Makefile*; fi diff --git a/src/Makefile.shlib b/src/Makefile.shlib new file mode 100644 index 0000000000..791530acc2 --- /dev/null +++ 
b/src/Makefile.shlib @@ -0,0 +1,43 @@ +# LAMMPS shared library multiple-machine Makefile + +SHELL = /bin/sh + +# Definitions + +ROOT = lmp +EXE = lib$(ROOT)_$@.so + +SRC = angle.cpp angle_charmm.cpp angle_class2.cpp angle_cosine.cpp angle_cosine_delta.cpp angle_cosine_periodic.cpp angle_cosine_squared.cpp angle_harmonic.cpp angle_hybrid.cpp angle_table.cpp atom.cpp atom_map.cpp atom_vec.cpp atom_vec_angle.cpp atom_vec_atomic.cpp atom_vec_bond.cpp atom_vec_charge.cpp atom_vec_dipole.cpp atom_vec_ellipsoid.cpp atom_vec_full.cpp atom_vec_hybrid.cpp atom_vec_line.cpp atom_vec_molecular.cpp atom_vec_peri.cpp atom_vec_sphere.cpp atom_vec_tri.cpp balance.cpp bond.cpp bond_class2.cpp bond_fene.cpp bond_fene_expand.cpp bond_harmonic.cpp bond_hybrid.cpp bond_morse.cpp bond_nonlinear.cpp bond_quartic.cpp bond_table.cpp change_box.cpp comm.cpp compute.cpp compute_angle_local.cpp compute_atom_molecule.cpp compute_bond_local.cpp compute_centro_atom.cpp compute_cluster_atom.cpp compute_cna_atom.cpp compute_com.cpp compute_com_molecule.cpp compute_contact_atom.cpp compute_coord_atom.cpp compute_damage_atom.cpp compute_dihedral_local.cpp compute_displace_atom.cpp compute_erotate_asphere.cpp compute_erotate_sphere.cpp compute_erotate_sphere_atom.cpp compute_event_displace.cpp compute_group_group.cpp compute_gyration.cpp compute_gyration_molecule.cpp compute_heat_flux.cpp compute_improper_local.cpp compute_ke.cpp compute_ke_atom.cpp compute_msd.cpp compute_msd_molecule.cpp compute_pair.cpp compute_pair_local.cpp compute_pe.cpp compute_pe_atom.cpp compute_pressure.cpp compute_property_atom.cpp compute_property_local.cpp compute_property_molecule.cpp compute_rdf.cpp compute_reduce.cpp compute_reduce_region.cpp compute_slice.cpp compute_stress_atom.cpp compute_temp.cpp compute_temp_asphere.cpp compute_temp_com.cpp compute_temp_deform.cpp compute_temp_partial.cpp compute_temp_profile.cpp compute_temp_ramp.cpp compute_temp_region.cpp compute_temp_sphere.cpp compute_ti.cpp create_atoms.cpp create_box.cpp delete_atoms.cpp delete_bonds.cpp dihedral.cpp dihedral_charmm.cpp dihedral_class2.cpp dihedral_harmonic.cpp dihedral_helix.cpp dihedral_hybrid.cpp dihedral_multi_harmonic.cpp dihedral_opls.cpp displace_atoms.cpp domain.cpp dump.cpp dump_atom.cpp dump_cfg.cpp dump_custom.cpp dump_dcd.cpp dump_image.cpp dump_local.cpp dump_xtc.cpp dump_xyz.cpp error.cpp finish.cpp fix.cpp fix_adapt.cpp fix_addforce.cpp fix_append_atoms.cpp fix_ave_atom.cpp fix_ave_correlate.cpp fix_ave_histo.cpp fix_ave_spatial.cpp fix_ave_time.cpp fix_aveforce.cpp fix_balance.cpp fix_bond_break.cpp fix_bond_create.cpp fix_bond_swap.cpp fix_box_relax.cpp fix_deform.cpp fix_deposit.cpp fix_drag.cpp fix_dt_reset.cpp fix_efield.cpp fix_enforce2d.cpp fix_evaporate.cpp fix_event.cpp fix_event_prd.cpp fix_event_tad.cpp fix_external.cpp fix_freeze.cpp fix_gcmc.cpp fix_gravity.cpp fix_heat.cpp fix_indent.cpp fix_langevin.cpp fix_lineforce.cpp fix_minimize.cpp fix_momentum.cpp fix_move.cpp fix_msst.cpp fix_neb.cpp fix_nh.cpp fix_nh_asphere.cpp fix_nh_sphere.cpp fix_nph.cpp fix_nph_asphere.cpp fix_nph_sphere.cpp fix_nphug.cpp fix_npt.cpp fix_npt_asphere.cpp fix_npt_sphere.cpp fix_nve.cpp fix_nve_asphere.cpp fix_nve_asphere_noforce.cpp fix_nve_limit.cpp fix_nve_line.cpp fix_nve_noforce.cpp fix_nve_sphere.cpp fix_nve_tri.cpp fix_nvt.cpp fix_nvt_asphere.cpp fix_nvt_sllod.cpp fix_nvt_sphere.cpp fix_orient_fcc.cpp fix_peri_neigh.cpp fix_planeforce.cpp fix_pour.cpp fix_press_berendsen.cpp fix_print.cpp fix_qeq_comb.cpp fix_read_restart.cpp fix_recenter.cpp 
fix_respa.cpp fix_restrain.cpp fix_rigid.cpp fix_rigid_nve.cpp fix_rigid_nvt.cpp fix_setforce.cpp fix_shake.cpp fix_shear_history.cpp fix_spring.cpp fix_spring_rg.cpp fix_spring_self.cpp fix_srd.cpp fix_store_force.cpp fix_store_state.cpp fix_temp_berendsen.cpp fix_temp_rescale.cpp fix_thermal_conductivity.cpp fix_tmd.cpp fix_ttm.cpp fix_viscosity.cpp fix_viscous.cpp fix_wall.cpp fix_wall_colloid.cpp fix_wall_gran.cpp fix_wall_harmonic.cpp fix_wall_lj126.cpp fix_wall_lj93.cpp fix_wall_piston.cpp fix_wall_reflect.cpp fix_wall_region.cpp fix_wall_srd.cpp force.cpp group.cpp image.cpp improper.cpp improper_class2.cpp improper_cvff.cpp improper_harmonic.cpp improper_hybrid.cpp improper_umbrella.cpp input.cpp integrate.cpp irregular.cpp kspace.cpp lammps.cpp lattice.cpp library.cpp math_extra.cpp memory.cpp min.cpp min_cg.cpp min_fire.cpp min_hftn.cpp min_linesearch.cpp min_quickmin.cpp min_sd.cpp minimize.cpp modify.cpp neb.cpp neigh_bond.cpp neigh_derive.cpp neigh_full.cpp neigh_gran.cpp neigh_half_bin.cpp neigh_half_multi.cpp neigh_half_nsq.cpp neigh_list.cpp neigh_request.cpp neigh_respa.cpp neigh_stencil.cpp neighbor.cpp output.cpp pair.cpp pair_adp.cpp pair_airebo.cpp pair_beck.cpp pair_bop.cpp pair_born.cpp pair_born_coul_wolf.cpp pair_brownian.cpp pair_brownian_poly.cpp pair_buck.cpp pair_buck_coul_cut.cpp pair_colloid.cpp pair_comb.cpp pair_coul_cut.cpp pair_coul_debye.cpp pair_coul_wolf.cpp pair_dipole_cut.cpp pair_dpd.cpp pair_dpd_tstat.cpp pair_dsmc.cpp pair_eam.cpp pair_eam_alloy.cpp pair_eam_alloy_opt.cpp pair_eam_fs.cpp pair_eam_fs_opt.cpp pair_eam_opt.cpp pair_eim.cpp pair_gauss.cpp pair_gayberne.cpp pair_gran_hertz_history.cpp pair_gran_hooke.cpp pair_gran_hooke_history.cpp pair_hbond_dreiding_lj.cpp pair_hbond_dreiding_morse.cpp pair_hybrid.cpp pair_hybrid_overlay.cpp pair_lcbop.cpp pair_line_lj.cpp pair_lj96_cut.cpp pair_lj_charmm_coul_charmm.cpp pair_lj_charmm_coul_charmm_implicit.cpp pair_lj_class2.cpp pair_lj_class2_coul_cut.cpp pair_lj_class2_coul_long.cpp pair_lj_cubic.cpp pair_lj_cut.cpp pair_lj_cut_coul_cut.cpp pair_lj_cut_coul_debye.cpp pair_lj_cut_opt.cpp pair_lj_expand.cpp pair_lj_gromacs.cpp pair_lj_gromacs_coul_gromacs.cpp pair_lj_smooth.cpp pair_lj_smooth_linear.cpp pair_lubricate.cpp pair_lubricateU.cpp pair_lubricateU_poly.cpp pair_lubricate_poly.cpp pair_morse.cpp pair_morse_opt.cpp pair_peri_lps.cpp pair_peri_pmb.cpp pair_rebo.cpp pair_resquared.cpp pair_soft.cpp pair_sw.cpp pair_table.cpp pair_tersoff.cpp pair_tersoff_zbl.cpp pair_tri_lj.cpp pair_yukawa.cpp pair_yukawa_colloid.cpp prd.cpp procmap.cpp random_mars.cpp random_park.cpp read_data.cpp read_dump.cpp read_restart.cpp reader.cpp reader_native.cpp reader_xyz.cpp region.cpp region_block.cpp region_cone.cpp region_cylinder.cpp region_intersect.cpp region_plane.cpp region_prism.cpp region_sphere.cpp region_union.cpp replicate.cpp rerun.cpp respa.cpp run.cpp set.cpp special.cpp tad.cpp temper.cpp thermo.cpp timer.cpp universe.cpp update.cpp variable.cpp velocity.cpp verlet.cpp verlet_split.cpp write_restart.cpp xdr_compat.cpp + +INC = accelerator_cuda.h accelerator_omp.h angle.h angle_charmm.h angle_class2.h angle_cosine.h angle_cosine_delta.h angle_cosine_periodic.h angle_cosine_squared.h angle_harmonic.h angle_hybrid.h angle_table.h atom.h atom_map.h atom_vec.h atom_vec_angle.h atom_vec_atomic.h atom_vec_bond.h atom_vec_charge.h atom_vec_dipole.h atom_vec_ellipsoid.h atom_vec_full.h atom_vec_hybrid.h atom_vec_line.h atom_vec_molecular.h atom_vec_peri.h atom_vec_sphere.h atom_vec_tri.h balance.h bond.h 
bond_class2.h bond_fene.h bond_fene_expand.h bond_harmonic.h bond_hybrid.h bond_morse.h bond_nonlinear.h bond_quartic.h bond_table.h change_box.h comm.h compute.h compute_angle_local.h compute_atom_molecule.h compute_bond_local.h compute_centro_atom.h compute_cluster_atom.h compute_cna_atom.h compute_com.h compute_com_molecule.h compute_contact_atom.h compute_coord_atom.h compute_damage_atom.h compute_dihedral_local.h compute_displace_atom.h compute_erotate_asphere.h compute_erotate_sphere.h compute_erotate_sphere_atom.h compute_event_displace.h compute_group_group.h compute_gyration.h compute_gyration_molecule.h compute_heat_flux.h compute_improper_local.h compute_ke.h compute_ke_atom.h compute_msd.h compute_msd_molecule.h compute_pair.h compute_pair_local.h compute_pe.h compute_pe_atom.h compute_pressure.h compute_property_atom.h compute_property_local.h compute_property_molecule.h compute_rdf.h compute_reduce.h compute_reduce_region.h compute_slice.h compute_stress_atom.h compute_temp.h compute_temp_asphere.h compute_temp_com.h compute_temp_deform.h compute_temp_partial.h compute_temp_profile.h compute_temp_ramp.h compute_temp_region.h compute_temp_sphere.h compute_ti.h create_atoms.h create_box.h delete_atoms.h delete_bonds.h dihedral.h dihedral_charmm.h dihedral_class2.h dihedral_harmonic.h dihedral_helix.h dihedral_hybrid.h dihedral_multi_harmonic.h dihedral_opls.h displace_atoms.h domain.h dump.h dump_atom.h dump_cfg.h dump_custom.h dump_dcd.h dump_image.h dump_local.h dump_xtc.h dump_xyz.h error.h finish.h fix.h fix_adapt.h fix_addforce.h fix_append_atoms.h fix_ave_atom.h fix_ave_correlate.h fix_ave_histo.h fix_ave_spatial.h fix_ave_time.h fix_aveforce.h fix_balance.h fix_bond_break.h fix_bond_create.h fix_bond_swap.h fix_box_relax.h fix_deform.h fix_deposit.h fix_drag.h fix_dt_reset.h fix_efield.h fix_enforce2d.h fix_evaporate.h fix_event.h fix_event_prd.h fix_event_tad.h fix_external.h fix_freeze.h fix_gcmc.h fix_gravity.h fix_heat.h fix_indent.h fix_langevin.h fix_lineforce.h fix_minimize.h fix_momentum.h fix_move.h fix_msst.h fix_neb.h fix_nh.h fix_nh_asphere.h fix_nh_sphere.h fix_nph.h fix_nph_asphere.h fix_nph_sphere.h fix_nphug.h fix_npt.h fix_npt_asphere.h fix_npt_sphere.h fix_nve.h fix_nve_asphere.h fix_nve_asphere_noforce.h fix_nve_limit.h fix_nve_line.h fix_nve_noforce.h fix_nve_sphere.h fix_nve_tri.h fix_nvt.h fix_nvt_asphere.h fix_nvt_sllod.h fix_nvt_sphere.h fix_orient_fcc.h fix_peri_neigh.h fix_planeforce.h fix_pour.h fix_press_berendsen.h fix_print.h fix_qeq_comb.h fix_read_restart.h fix_recenter.h fix_respa.h fix_restrain.h fix_rigid.h fix_rigid_nve.h fix_rigid_nvt.h fix_setforce.h fix_shake.h fix_shear_history.h fix_spring.h fix_spring_rg.h fix_spring_self.h fix_srd.h fix_store_force.h fix_store_state.h fix_temp_berendsen.h fix_temp_rescale.h fix_thermal_conductivity.h fix_tmd.h fix_ttm.h fix_viscosity.h fix_viscous.h fix_wall.h fix_wall_colloid.h fix_wall_gran.h fix_wall_harmonic.h fix_wall_lj126.h fix_wall_lj93.h fix_wall_piston.h fix_wall_reflect.h fix_wall_region.h fix_wall_srd.h force.h group.h image.h improper.h improper_class2.h improper_cvff.h improper_harmonic.h improper_hybrid.h improper_umbrella.h input.h integrate.h irregular.h kspace.h lammps.h lattice.h library.h lmptype.h lmpwindows.h math_const.h math_extra.h memory.h min.h min_cg.h min_fire.h min_hftn.h min_linesearch.h min_quickmin.h min_sd.h minimize.h modify.h neb.h neigh_bond.h neigh_derive.h neigh_full.h neigh_gran.h neigh_half_bin.h neigh_half_multi.h neigh_half_nsq.h neigh_list.h 
neigh_request.h neigh_respa.h neighbor.h output.h pack.h pair.h pair_adp.h pair_airebo.h pair_beck.h pair_bop.h pair_born.h pair_born_coul_wolf.h pair_brownian.h pair_brownian_poly.h pair_buck.h pair_buck_coul_cut.h pair_colloid.h pair_comb.h pair_coul_cut.h pair_coul_debye.h pair_coul_wolf.h pair_dipole_cut.h pair_dpd.h pair_dpd_tstat.h pair_dsmc.h pair_eam.h pair_eam_alloy.h pair_eam_alloy_opt.h pair_eam_fs.h pair_eam_fs_opt.h pair_eam_opt.h pair_eim.h pair_gauss.h pair_gayberne.h pair_gran_hertz_history.h pair_gran_hooke.h pair_gran_hooke_history.h pair_hbond_dreiding_lj.h pair_hbond_dreiding_morse.h pair_hybrid.h pair_hybrid_overlay.h pair_lcbop.h pair_line_lj.h pair_lj96_cut.h pair_lj_charmm_coul_charmm.h pair_lj_charmm_coul_charmm_implicit.h pair_lj_class2.h pair_lj_class2_coul_cut.h pair_lj_class2_coul_long.h pair_lj_cubic.h pair_lj_cut.h pair_lj_cut_coul_cut.h pair_lj_cut_coul_debye.h pair_lj_cut_opt.h pair_lj_expand.h pair_lj_gromacs.h pair_lj_gromacs_coul_gromacs.h pair_lj_smooth.h pair_lj_smooth_linear.h pair_lubricate.h pair_lubricateU.h pair_lubricateU_poly.h pair_lubricate_poly.h pair_morse.h pair_morse_opt.h pair_peri_lps.h pair_peri_pmb.h pair_rebo.h pair_resquared.h pair_soft.h pair_sw.h pair_table.h pair_tersoff.h pair_tersoff_zbl.h pair_tri_lj.h pair_yukawa.h pair_yukawa_colloid.h pointers.h prd.h procmap.h random_mars.h random_park.h read_data.h read_dump.h read_restart.h reader.h reader_native.h reader_xyz.h region.h region_block.h region_cone.h region_cylinder.h region_intersect.h region_plane.h region_prism.h region_sphere.h region_union.h replicate.h rerun.h respa.h run.h set.h special.h style_angle.h style_atom.h style_bond.h style_command.h style_compute.h style_dihedral.h style_dump.h style_fix.h style_improper.h style_integrate.h style_kspace.h style_minimize.h style_pair.h style_reader.h style_region.h suffix.h tad.h temper.h thermo.h timer.h universe.h update.h variable.h velocity.h verlet.h verlet_split.h version.h write_restart.h xdr_compat.h + +OBJ = $(SRC:.cpp=.o) + +# Targets + +help: + @echo 'Type "make target" where target is one of:' + @echo '' + @files="`ls MAKE/Makefile.*`"; \ + for file in $$files; do head -1 $$file; done + +clean: + rm -rf Obj_* + +.DEFAULT: + @test -f MAKE/Makefile.$@ + @if [ ! -d Obj_shlib_$@ ]; then mkdir Obj_shlib_$@; fi + @cp -p $(SRC) $(INC) Obj_shlib_$@ + @cp MAKE/Makefile.$@ Obj_shlib_$@/Makefile + @if [ ! -e Makefile.package ]; \ + then cp Makefile.package.empty Makefile.package; fi + @if [ ! 
-e Makefile.package.settings ]; \ + then cp Makefile.package.settings.empty Makefile.package.settings; fi + @cp Makefile.package Makefile.package.settings Obj_shlib_$@ + @cd Obj_shlib_$@; \ + $(MAKE) $(MFLAGS) "OBJ = $(OBJ)" \ + "INC = $(INC)" "EXE = ../$(EXE)" shlib + @rm -f liblmp.so + @ln -s $(EXE) liblmp.so + @if [ -d Obj_shlib_$@ ]; then cd Obj_shlib_$@; \ + rm -f $(SRC) $(INC) Makefile*; fi diff --git a/src/STUBS/Makefile b/src/STUBS/Makefile index 491e2e75c6..d3a75b6f81 100644 --- a/src/STUBS/Makefile +++ b/src/STUBS/Makefile @@ -1,4 +1,11 @@ -# Makefile for MPI stubs - edit this for your platform +# Makefile for MPI stubs library + +# Syntax: +# make # build static lib as libmpi_stubs.a +# make shlib # build shared lib as libmpi_stubs.so +# make clean # remove *.o and lib files + +# edit System-specific settings as needed for your platform SHELL = /bin/sh .IGNORE: @@ -10,30 +17,35 @@ INC = mpi.h # Definitions -EXE = libmpi.a +EXE = libmpi_stubs.a +SHLIB = libmpi_stubs.so OBJ = $(SRC:.c=.o) # System-specific settings -CC = cc -CCFLAGS = -O # -fPIC +CC = g++ +CCFLAGS = -O +SHFLAGS = -fPIC + ARCHIVE = ar ARCHFLAG = rs +SHLIBFLAGS = -shared -# Target +# Targets -$(EXE): $(OBJ) +lib: $(OBJ) $(ARCHIVE) $(ARCHFLAG) $(EXE) $(OBJ) -# Clean +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) -o $(SHLIB) $(OBJ) clean: - rm *.o libmpi.a + rm -f *.o libmpi_stubs.a libmpi_stubs.so # Compilation rules .c.o: - $(CC) $(CCFLAGS) -c $< + $(CC) $(CCFLAGS) $(SHFLAGS) -c $< # Individual dependencies diff --git a/src/USER-CUDA/pppm_cuda.cpp b/src/USER-CUDA/pppm_cuda.cpp index 8b3ff4548d..9d5236ab39 100644 --- a/src/USER-CUDA/pppm_cuda.cpp +++ b/src/USER-CUDA/pppm_cuda.cpp @@ -57,6 +57,7 @@ #include "remap_wrap.h" #include "memory.h" #include "error.h" +#include "update.h" #include//crmadd #include "cuda_wrapper_cu.h" #include "pppm_cuda_cu.h" @@ -1391,6 +1392,10 @@ void PPPMCuda::fieldforce() int PPPMCuda::timing(int n, double &time3d, double &time1d) { + time3d = cuda->shared_data.cuda_timings.pppm_poisson/update->nsteps*n; + time1d = cuda->shared_data.cuda_timings.pppm_poisson/update->nsteps/4*n; + return 4; + double time1,time2; for (int i = 0; i < 2*nfft_both; i++) work1[i] = 0.0; diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp index d2b1a414c6..217bd371b1 100644 --- a/src/USER-CUDA/verlet_cuda.cpp +++ b/src/USER-CUDA/verlet_cuda.cpp @@ -58,14 +58,17 @@ using namespace LAMMPS_NS; #define MAKETIMEING -VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, arg) { +VerletCuda::VerletCuda(LAMMPS* lmp, int narg, char** arg) : Verlet(lmp, narg, arg) +{ cuda = lmp->cuda; - if(cuda == NULL) - error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS.."); - modify_cuda=(ModifyCuda*) modify; - int ifix = modify->find_fix("package_omp"); - if (ifix >= 0) external_force_clear = 1; + if(cuda == NULL) + error->all(FLERR, "You cannot use a /cuda class without activating 'cuda' acceleration. 
Provide '-c on' as a command-line argument to LAMMPS."); + + modify_cuda = (ModifyCuda*) modify; + int ifix = modify->find_fix("package_omp"); + + if(ifix >= 0) external_force_clear = 1; } /* ---------------------------------------------------------------------- @@ -74,263 +77,309 @@ VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, ar void VerletCuda::setup() { - //debug related variables - cuda->debugdata[0]=0; - cuda->cu_debugdata->upload(); - dotestatom=cuda->dotestatom; - int testatom=cuda->testatom;//48267; - if(atom->nlocal==0) - error->warning(FLERR,"# CUDA: There are currently no atoms on one of the MPI processes. This is known to cause errors with the USER-CUDA package. Please use the 'processors' keyword to enforce more balanced processor layout."); - MYDBG(printf("# CUDA VerletCuda::setup start\n"); ) + //debug related variables + cuda->debugdata[0] = 0; + cuda->cu_debugdata->upload(); + dotestatom = cuda->dotestatom; + int testatom = cuda->testatom; //48267; - cuda->oncpu = true; - cuda->begin_setup = true; - cuda->finished_run = false; + if(atom->nlocal == 0) + error->warning(FLERR, "# CUDA: There are currently no atoms on one of the MPI processes. This is known to cause errors with the USER-CUDA package. Please use the 'processors' keyword to enforce a more balanced processor layout."); - time_pair=0; - time_kspace=0; - time_comm=0; - time_modify=0; - time_fulliterate=0; + MYDBG(printf("# CUDA VerletCuda::setup start\n");) - atom->setup(); + cuda->oncpu = true; + cuda->begin_setup = true; + cuda->finished_run = false; - cuda_shared_atom* cu_atom = & cuda->shared_data.atom; - cuda_shared_domain* cu_domain = & cuda->shared_data.domain; - cuda_shared_pair* cu_pair = & cuda->shared_data.pair; - cu_atom->update_nlocal=1; - cu_atom->update_nmax=1; + time_pair = 0; + time_kspace = 0; + time_comm = 0; + time_modify = 0; + time_fulliterate = 0; - if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true; + atom->setup(); - cuda->setDomainParams(); + cuda_shared_atom* cu_atom = & cuda->shared_data.atom; + cuda_shared_domain* cu_domain = & cuda->shared_data.domain; + cuda_shared_pair* cu_pair = & cuda->shared_data.pair; + cu_atom->update_nlocal = 1; + cu_atom->update_nmax = 1; + + if(atom->molecular || (force->kspace && (not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true; + + cuda->setDomainParams(); - if(cuda->shared_data.me==0) - printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax); - if(cuda->shared_data.me==0) - printf("# CUDA: Using precision: Global: %u X: %u V: %u F: %u PPPM: %u \n", CUDA_PRECISION==1?4:8,static_cast<int>(sizeof(X_FLOAT)),static_cast<int>(sizeof(V_FLOAT)),static_cast<int>(sizeof(F_FLOAT)),static_cast<int>(sizeof(PPPM_FLOAT))); - cuda->allocate(); + if(cuda->shared_data.me == 0) + printf("# CUDA: Using precision: Global: %u X: %u V: %u F: %u PPPM: %u \n", CUDA_PRECISION == 1 ? 
4 : 8, (int) sizeof(X_FLOAT), (int) sizeof(V_FLOAT), (int) sizeof(F_FLOAT), (int) sizeof(PPPM_FLOAT)); + + cuda->allocate(); - if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n"); + if(comm->me == 0 && screen) fprintf(screen, "Setting up run ...\n"); // setup domain, communication and neighboring // acquire ghosts // build neighbor lists modify->setup_pre_exchange(); - if (triclinic) domain->x2lamda(atom->nlocal); + if(triclinic) domain->x2lamda(atom->nlocal); + domain->pbc(); domain->reset_box(); comm->setup(); - if (neighbor->style) neighbor->setup_bins(); + + if(neighbor->style) neighbor->setup_bins(); + comm->exchange(); - if (atom->sortfreq > 0) atom->sort(); + + if(atom->sortfreq > 0) atom->sort(); + comm->borders(); - if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + + if(triclinic) domain->lamda2x(atom->nlocal + atom->nghost); + cuda->setSystemParams(); cuda->checkResize(); - if(cuda->shared_data.me==0) - printf("# CUDA: VerletCuda::setup: Upload data...\n"); + if(cuda->shared_data.me == 0) + printf("# CUDA: VerletCuda::setup: Upload data...\n"); + cuda->uploadAll(); neighbor->build(); neighbor->ncalls = 0; if(atom->mass) - cuda->cu_mass->upload(); + cuda->cu_mass->upload(); if(cuda->cu_map_array) - cuda->cu_map_array->upload(); + cuda->cu_map_array->upload(); // compute all forces ev_set(update->ntimestep); + if(elist_atom) cuda->shared_data.atom.need_eatom = 1; + if(vlist_atom) cuda->shared_data.atom.need_vatom = 1; - if(elist_atom||vlist_atom) cuda->checkResize(); + + if(elist_atom || vlist_atom) cuda->checkResize(); int test_BpA_vs_TpA = true; timespec starttime; timespec endtime; - #ifdef NO_PREC_TIMING - double startsec,endsec; - #endif - //if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = false; - if(test_BpA_vs_TpA && cuda->shared_data.pair.cudable_force && force->pair &&(cuda->shared_data.pair.override_block_per_atom<0)) - { - int StyleLoops=10; - if(cuda->shared_data.me==0) - printf("Test TpA\n"); - cuda->shared_data.pair.use_block_per_atom = 0; - neighbor->build(); - Cuda_Pair_GenerateXType(&cuda->shared_data); - if(cuda->cu_v_radius) - Cuda_Pair_GenerateVRadius(&cuda->shared_data); - if(cuda->cu_omega_rmass) - Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - force->pair->compute(eflag,vflag); - CudaWrapper_Sync(); - #ifdef NO_PREC_TIMING - startsec = 1.0*clock()/CLOCKS_PER_SEC; - #endif - clock_gettime(CLOCK_REALTIME,&starttime); - for(int i=0;i<StyleLoops;i++) - { - Cuda_Pair_GenerateXType(&cuda->shared_data); - if(cuda->cu_v_radius) - Cuda_Pair_GenerateVRadius(&cuda->shared_data); - if(cuda->cu_omega_rmass) - Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - force->pair->compute(eflag,vflag); - CudaWrapper_Sync(); - } - clock_gettime(CLOCK_REALTIME,&endtime); +#ifdef NO_PREC_TIMING + double startsec, endsec; +#endif - double TpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - #ifdef NO_PREC_TIMING - endsec = 1.0*clock()/CLOCKS_PER_SEC; - TpAtime = endsec - startsec; - #endif - if(cuda->shared_data.me==0) - printf("Test BpA\n"); - cuda->shared_data.pair.use_block_per_atom = 1; - neighbor->build(); - Cuda_Pair_GenerateXType(&cuda->shared_data); - if(cuda->cu_v_radius) - Cuda_Pair_GenerateVRadius(&cuda->shared_data); - if(cuda->cu_omega_rmass) - Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - force->pair->compute(eflag,vflag); - CudaWrapper_Sync(); + //if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) 
cuda->shared_data.pair.collect_forces_later = false; + if(test_BpA_vs_TpA && cuda->shared_data.pair.cudable_force && force->pair && (cuda->shared_data.pair.override_block_per_atom < 0)) { + int StyleLoops = 10; - clock_gettime(CLOCK_REALTIME,&starttime); - #ifdef NO_PREC_TIMING - startsec = 1.0*clock()/CLOCKS_PER_SEC; - #endif - for(int i=0;i<StyleLoops;i++) - { - Cuda_Pair_GenerateXType(&cuda->shared_data); - if(cuda->cu_v_radius) - Cuda_Pair_GenerateVRadius(&cuda->shared_data); - if(cuda->cu_omega_rmass) - Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - force->pair->compute(eflag,vflag); - CudaWrapper_Sync(); - } - clock_gettime(CLOCK_REALTIME,&endtime); - double BpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - #ifdef NO_PREC_TIMING - endsec = 1.0*clock()/CLOCKS_PER_SEC; - BpAtime = endsec - startsec; - #endif + if(cuda->shared_data.me == 0) + printf("Test TpA\n"); - if(cuda->shared_data.me==0) - printf("\n# CUDA: Timing of parallelisation layout with %i loops:\n",StyleLoops); - if(cuda->shared_data.me==0) - printf("# CUDA: BpA TpA\n %lf %lf\n",BpAtime,TpAtime); - if(BpAtime>TpAtime) cuda->shared_data.pair.use_block_per_atom = 0; - } - else - cuda->shared_data.pair.use_block_per_atom = cuda->shared_data.pair.override_block_per_atom; - //cuda->shared_data.pair.use_block_per_atom = 0; - if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true; - neighbor->build(); - neighbor->ncalls = 0; + cuda->shared_data.pair.use_block_per_atom = 0; + neighbor->build(); + Cuda_Pair_GenerateXType(&cuda->shared_data); - force_clear(); + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); - modify->setup_pre_force(vflag); + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - cuda->cu_f->download(); - if(cuda->cu_torque) - cuda->cu_torque->download(); + force->pair->compute(eflag, vflag); + CudaWrapper_Sync(); +#ifdef NO_PREC_TIMING + startsec = 1.0 * clock() / CLOCKS_PER_SEC; +#endif + clock_gettime(CLOCK_REALTIME, &starttime); + + for(int i = 0; i < StyleLoops; i++) { + Cuda_Pair_GenerateXType(&cuda->shared_data); + + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + + force->pair->compute(eflag, vflag); + CudaWrapper_Sync(); + } + + clock_gettime(CLOCK_REALTIME, &endtime); + + double TpAtime = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; +#ifdef NO_PREC_TIMING + endsec = 1.0 * clock() / CLOCKS_PER_SEC; + TpAtime = endsec - startsec; +#endif + + if(cuda->shared_data.me == 0) + printf("Test BpA\n"); + + cuda->shared_data.pair.use_block_per_atom = 1; + neighbor->build(); + Cuda_Pair_GenerateXType(&cuda->shared_data); + + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + + force->pair->compute(eflag, vflag); + CudaWrapper_Sync(); + + clock_gettime(CLOCK_REALTIME, &starttime); +#ifdef NO_PREC_TIMING + startsec = 1.0 * clock() / CLOCKS_PER_SEC; +#endif + + for(int i = 0; i < StyleLoops; i++) { + Cuda_Pair_GenerateXType(&cuda->shared_data); + + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + + force->pair->compute(eflag, vflag); + CudaWrapper_Sync(); + } + + clock_gettime(CLOCK_REALTIME, &endtime); + double BpAtime = endtime.tv_sec - 
starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; +#ifdef NO_PREC_TIMING + endsec = 1.0 * clock() / CLOCKS_PER_SEC; + BpAtime = endsec - startsec; +#endif + + if(cuda->shared_data.me == 0) + printf("\n# CUDA: Timing of parallelisation layout with %i loops:\n", StyleLoops); + + if(cuda->shared_data.me == 0) + printf("# CUDA: BpA TpA\n %lf %lf\n", BpAtime, TpAtime); + + if(BpAtime > TpAtime) cuda->shared_data.pair.use_block_per_atom = 0; + } else + cuda->shared_data.pair.use_block_per_atom = cuda->shared_data.pair.override_block_per_atom; + + //cuda->shared_data.pair.use_block_per_atom = 0; + if(atom->molecular || (force->kspace && (not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true; + + neighbor->build(); + neighbor->ncalls = 0; + + force_clear(); + + modify->setup_pre_force(vflag); + + cuda->cu_f->download(); + + if(cuda->cu_torque) + cuda->cu_torque->download(); //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); - MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); ) + MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute\n");) //test_atom(testatom,"pre pair force"); - if(cuda->shared_data.pair.cudable_force) - { - cuda->uploadAll(); - Cuda_Pair_GenerateXType(&cuda->shared_data); - if(cuda->cu_v_radius) - Cuda_Pair_GenerateVRadius(&cuda->shared_data); - if(cuda->cu_omega_rmass) - Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + if(cuda->shared_data.pair.cudable_force) { + cuda->uploadAll(); + Cuda_Pair_GenerateXType(&cuda->shared_data); + + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); } - if (force->pair) force->pair->compute(eflag,vflag); + if(force->pair) force->pair->compute(eflag, vflag); - if(cuda->shared_data.pair.cudable_force) - { - if(cuda->shared_data.pair.collect_forces_later) - { - if(eflag) cuda->cu_eng_vdwl->upload(); - if(eflag) cuda->cu_eng_coul->upload(); - if(vflag) cuda->cu_virial->upload(); - Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag); - if(eflag) cuda->cu_eng_vdwl->download(); - if(eflag) cuda->cu_eng_coul->download(); - if(vflag) cuda->cu_virial->download(); - } - cuda->downloadAll(); + if(cuda->shared_data.pair.cudable_force) { + if(cuda->shared_data.pair.collect_forces_later) { + if(eflag) cuda->cu_eng_vdwl->upload(); + + if(eflag) cuda->cu_eng_coul->upload(); + + if(vflag) cuda->cu_virial->upload(); + + Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag); + + if(eflag) cuda->cu_eng_vdwl->download(); + + if(eflag) cuda->cu_eng_coul->download(); + + if(vflag) cuda->cu_virial->download(); + } + + cuda->downloadAll(); } - test_atom(testatom,"post pair force"); + test_atom(testatom, "post pair force"); - MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); ) - //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); + MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute done\n");) + //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); - if (atom->molecular) { - if (force->bond) force->bond->compute(eflag,vflag); - if (force->angle) force->angle->compute(eflag,vflag); - if (force->dihedral) force->dihedral->compute(eflag,vflag); - if (force->improper) force->improper->compute(eflag,vflag); + if(atom->molecular) { + if(force->bond) force->bond->compute(eflag, vflag); + + 
if(force->angle) force->angle->compute(eflag, vflag); + + if(force->dihedral) force->dihedral->compute(eflag, vflag); + + if(force->improper) force->improper->compute(eflag, vflag); } - if(cuda->shared_data.pppm.cudable_force) - { - cuda->cu_tag ->upload(); - cuda->cu_type->upload(); - cuda->cu_x ->upload(); - cuda->cu_v ->upload(); - cuda->cu_f ->upload(); - if(cu_atom->q_flag) cuda->cu_q->upload(); + if(cuda->shared_data.pppm.cudable_force) { + cuda->cu_tag ->upload(); + cuda->cu_type->upload(); + cuda->cu_x ->upload(); + cuda->cu_v ->upload(); + cuda->cu_f ->upload(); + + if(cu_atom->q_flag) cuda->cu_q->upload(); } - if (force->kspace) { + + if(force->kspace) { force->kspace->setup(); - force->kspace->compute(eflag,vflag); - } - if(cuda->shared_data.pppm.cudable_force) - { - cuda->cu_f ->download(); + force->kspace->compute(eflag, vflag); } - test_atom(testatom,"post kspace"); + if(cuda->shared_data.pppm.cudable_force) { + cuda->cu_f ->download(); + } + + test_atom(testatom, "post kspace"); cuda->uploadAll(); - if (force->newton) comm->reverse_comm(); + + if(force->newton) comm->reverse_comm(); + cuda->downloadAll(); - test_atom(testatom,"post reverse comm"); + test_atom(testatom, "post reverse comm"); - if(cuda->shared_data.me==0) - printf("# CUDA: Total Device Memory usage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUsage()/1024/1024); + if(cuda->shared_data.me == 0) + printf("# CUDA: Total Device Memory usage post setup: %lf MB\n", 1.0 * CudaWrapper_CheckMemUsage() / 1024 / 1024); - MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); ) + MYDBG(printf("# CUDA: VerletCuda::setup: call modify setup\n");) modify->setup(vflag); - MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup done\n"); ) - output->setup(1); + MYDBG(printf("# CUDA: VerletCuda::setup: call modify setup done\n");) + output->setup(1); - test_atom(testatom,"post setup"); + test_atom(testatom, "post setup"); - MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); ) + MYDBG(printf("# CUDA: VerletCuda::setup: done\n");) cuda->finished_setup = true; cuda->oncpu = false; } @@ -341,35 +390,36 @@ void VerletCuda::setup_minimal(int flag) { - dotestatom=0; - int testatom=104; - cuda->oncpu = true; - cuda->begin_setup = true; - cuda->finished_run = false; - MYDBG(printf("# CUDA VerletCuda::setup start\n"); ) - time_pair=0; - time_kspace=0; - time_comm=0; - time_modify=0; - time_fulliterate=0; + dotestatom = 0; + int testatom = 104; + cuda->oncpu = true; + cuda->begin_setup = true; + cuda->finished_run = false; + MYDBG(printf("# CUDA VerletCuda::setup start\n");) + time_pair = 0; + time_kspace = 0; + time_comm = 0; + time_modify = 0; + time_fulliterate = 0; - //cuda->allocate(); + //cuda->allocate(); - cuda_shared_atom* cu_atom = & cuda->shared_data.atom; - cuda_shared_domain* cu_domain = & cuda->shared_data.domain; - cuda_shared_pair* cu_pair = & cuda->shared_data.pair; - cu_atom->update_nlocal=1; - cu_atom->update_nmax=1; + cuda_shared_atom* cu_atom = & cuda->shared_data.atom; + cuda_shared_domain* cu_domain = & cuda->shared_data.domain; + cuda_shared_pair* cu_pair = & cuda->shared_data.pair; + cu_atom->update_nlocal = 1; + cu_atom->update_nmax = 1; - if(atom->molecular) cuda->shared_data.pair.collect_forces_later = true; + if(atom->molecular) cuda->shared_data.pair.collect_forces_later = true; - cuda->setDomainParams(); + cuda->setDomainParams(); - if(cuda->shared_data.me==0) - printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax); - cuda->allocate(); + 
if(cuda->shared_data.me == 0) + printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax); + + cuda->allocate(); @@ -378,37 +428,47 @@ void VerletCuda::setup_minimal(int flag) // acquire ghosts // build neighbor lists - if (flag) { - if (triclinic) domain->x2lamda(atom->nlocal); + if(flag) { + if(triclinic) domain->x2lamda(atom->nlocal); + domain->pbc(); domain->reset_box(); comm->setup(); - if (neighbor->style) neighbor->setup_bins(); + + if(neighbor->style) neighbor->setup_bins(); + comm->exchange(); comm->borders(); - if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + + if(triclinic) domain->lamda2x(atom->nlocal + atom->nghost); + cuda->setSystemParams(); cuda->checkResize(); neighbor->build(); neighbor->ncalls = 0; } - if(cuda->shared_data.me==0) - printf("# CUDA: VerletCuda::setup: Upload data...\n"); - cuda->uploadAll(); - cuda->uploadAllNeighborLists(); - if(atom->mass) - cuda->cu_mass->upload(); + if(cuda->shared_data.me == 0) + printf("# CUDA: VerletCuda::setup: Upload data...\n"); - if(cuda->cu_map_array) - cuda->cu_map_array->upload(); + cuda->uploadAll(); + cuda->uploadAllNeighborLists(); + + if(atom->mass) + cuda->cu_mass->upload(); + + if(cuda->cu_map_array) + cuda->cu_map_array->upload(); // compute all forces ev_set(update->ntimestep); + if(elist_atom) cuda->shared_data.atom.need_eatom = 1; + if(vlist_atom) cuda->shared_data.atom.need_vatom = 1; - if(elist_atom||vlist_atom) cuda->checkResize(); + + if(elist_atom || vlist_atom) cuda->checkResize(); force_clear(); cuda->cu_f->download(); @@ -416,85 +476,97 @@ void VerletCuda::setup_minimal(int flag) //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); cuda->cu_mass->upload(); - MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); ) + MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute\n");) - test_atom(testatom,"pre pair force"); + test_atom(testatom, "pre pair force"); - if(cuda->shared_data.pair.cudable_force) - { - cuda->uploadAll(); - Cuda_Pair_GenerateXType(&cuda->shared_data); - if(cuda->cu_v_radius) - Cuda_Pair_GenerateVRadius(&cuda->shared_data); - if(cuda->cu_omega_rmass) - Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + if(cuda->shared_data.pair.cudable_force) { + cuda->uploadAll(); + Cuda_Pair_GenerateXType(&cuda->shared_data); + + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); } - if (force->pair) force->pair->compute(eflag,vflag); + if(force->pair) force->pair->compute(eflag, vflag); - if(cuda->shared_data.pair.cudable_force) - { - if(cuda->shared_data.pair.collect_forces_later) - { - if(eflag) cuda->cu_eng_vdwl->upload(); - if(eflag) cuda->cu_eng_coul->upload(); - if(vflag) cuda->cu_virial->upload(); - Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag); - if(eflag) cuda->cu_eng_vdwl->download(); - if(eflag) cuda->cu_eng_coul->download(); - if(vflag) cuda->cu_virial->download(); - } - cuda->downloadAll(); + if(cuda->shared_data.pair.cudable_force) { + if(cuda->shared_data.pair.collect_forces_later) { + if(eflag) cuda->cu_eng_vdwl->upload(); + + if(eflag) cuda->cu_eng_coul->upload(); + + if(vflag) cuda->cu_virial->upload(); + + Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag); + + if(eflag) cuda->cu_eng_vdwl->download(); + + if(eflag) cuda->cu_eng_coul->download(); + + if(vflag) cuda->cu_virial->download(); + } + + cuda->downloadAll(); } - test_atom(testatom,"post 
pair force"); + test_atom(testatom, "post pair force"); - MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); ) - //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); + MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute done\n");) + //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]); - if (atom->molecular) { - if (force->bond) force->bond->compute(eflag,vflag); - if (force->angle) force->angle->compute(eflag,vflag); - if (force->dihedral) force->dihedral->compute(eflag,vflag); - if (force->improper) force->improper->compute(eflag,vflag); + if(atom->molecular) { + if(force->bond) force->bond->compute(eflag, vflag); + + if(force->angle) force->angle->compute(eflag, vflag); + + if(force->dihedral) force->dihedral->compute(eflag, vflag); + + if(force->improper) force->improper->compute(eflag, vflag); } - if(cuda->shared_data.pppm.cudable_force) - { - cuda->cu_tag ->upload(); - cuda->cu_type->upload(); - cuda->cu_x ->upload(); - cuda->cu_v ->upload(); - cuda->cu_f ->upload(); - if(cu_atom->q_flag) cuda->cu_q->upload(); + if(cuda->shared_data.pppm.cudable_force) { + cuda->cu_tag ->upload(); + cuda->cu_type->upload(); + cuda->cu_x ->upload(); + cuda->cu_v ->upload(); + cuda->cu_f ->upload(); + + if(cu_atom->q_flag) cuda->cu_q->upload(); } - if (force->kspace) { + + if(force->kspace) { force->kspace->setup(); - force->kspace->compute(eflag,vflag); - } - if(cuda->shared_data.pppm.cudable_force) - { - cuda->cu_f ->download(); + force->kspace->compute(eflag, vflag); } - test_atom(testatom,"post kspace"); + if(cuda->shared_data.pppm.cudable_force) { + cuda->cu_f ->download(); + } + + test_atom(testatom, "post kspace"); cuda->uploadAll(); - if (force->newton) comm->reverse_comm(); + + if(force->newton) comm->reverse_comm(); + cuda->downloadAll(); - test_atom(testatom,"post reverse comm"); + test_atom(testatom, "post reverse comm"); - if(cuda->shared_data.me==0) - printf("# CUDA: Total Device Memory usage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUsage()/1024/1024); + if(cuda->shared_data.me == 0) + printf("# CUDA: Total Device Memory usage post setup: %lf MB\n", 1.0 * CudaWrapper_CheckMemUsage() / 1024 / 1024); - MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); ) + MYDBG(printf("# CUDA: VerletCuda::setup: call modify setup\n");) modify->setup(vflag); - MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); ) - cuda->finished_setup=true; - cuda->oncpu=false; + MYDBG(printf("# CUDA: VerletCuda::setup: done\n");) + cuda->finished_setup = true; + cuda->oncpu = false; } //#define TESTATOM @@ -504,8 +576,8 @@ void VerletCuda::setup_minimal(int flag) void VerletCuda::run(int n) { - dotestatom=cuda->dotestatom; - int testatom=cuda->testatom;//48267; + dotestatom = cuda->dotestatom; + int testatom = cuda->testatom; //48267; timespec starttime; @@ -515,16 +587,16 @@ void VerletCuda::run(int n) cuda->setTimingsZero(); - static double testtime=0.0; -// clock_gettime(CLOCK_REALTIME,&starttime); -// clock_gettime(CLOCK_REALTIME,&endtime); -// testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; -// printf("Time: %lf\n",testtime);*/ + static double testtime = 0.0; + // clock_gettime(CLOCK_REALTIME,&starttime); + // clock_gettime(CLOCK_REALTIME,&endtime); + // testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; + // printf("Time: %lf\n",testtime);*/ cuda_shared_domain* cu_domain = & 
cuda->shared_data.domain; - int nflag,ntimestep,sortflag; + int nflag, ntimestep, sortflag; int n_initial_integrate = modify_cuda->n_initial_integrate; int n_post_integrate = modify_cuda->n_post_integrate; @@ -535,463 +607,502 @@ void VerletCuda::run(int n) int n_post_force = modify_cuda->n_post_force; int n_end_of_step = modify_cuda->n_end_of_step; MYDBG(printf("# CUDA: Fixes: i_int: %i p_int: %i f_int: %i pr_exc: %i pr_neigh: %i pr_f: %i p_f: %i eos: %i\n", - n_initial_integrate,n_post_integrate,n_final_integrate,n_pre_exchange,n_pre_neighbor,n_pre_force,n_post_force,n_end_of_step);) + n_initial_integrate, n_post_integrate, n_final_integrate, n_pre_exchange, n_pre_neighbor, n_pre_force, n_post_force, n_end_of_step);) - if (atom->sortfreq > 0) sortflag = 1; + if(atom->sortfreq > 0) sortflag = 1; else sortflag = 0; - if(cuda->shared_data.me==0) - { - if((not cuda->shared_data.pair.cudable_force)&&(force->pair)) - error->warning(FLERR,"# CUDA: You asked for a Verlet integration using Cuda, " - "but selected a pair force which has not yet been ported to Cuda"); - if((not cuda->shared_data.pppm.cudable_force)&&(force->kspace)) - error->warning(FLERR,"# CUDA: You asked for a Verlet integration using Cuda, " - "but selected a kspace force which has not yet been ported to Cuda"); - if(modify_cuda->n_post_integrate_host+modify_cuda->n_pre_exchange_host+modify_cuda->n_pre_neighbor_host+modify_cuda->n_pre_force_host+modify_cuda->n_post_force_host+modify_cuda->n_end_of_step_host+modify_cuda->n_initial_integrate_host+modify_cuda->n_final_integrate_host) - error->warning(FLERR,"# CUDA: You asked for a Verlet integration using Cuda, " - "but several fixes have not yet been ported to Cuda.\n" - "This can cause a severe speed penalty due to frequent data synchronization between host and GPU."); - if(atom->firstgroupname) - error->warning(FLERR,"Warning: firstgroupname is used, this will cause additional data transfers."); - } - cuda->uploadAll(); + if(cuda->shared_data.me == 0) { + if((not cuda->shared_data.pair.cudable_force) && (force->pair)) + error->warning(FLERR, "# CUDA: You asked for a Verlet integration using Cuda, " + "but selected a pair force which has not yet been ported to Cuda"); - if(cuda->neighbor_decide_by_integrator && cuda->cu_xhold) - { - const int n=cuda->shared_data.atom.maxhold; - CudaWrapper_CopyData(cuda->cu_xhold->dev_data(),cuda->cu_x->dev_data(),n*sizeof(X_FLOAT)); - CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[atom->nmax],n*sizeof(X_FLOAT)); - CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[2*n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[2*atom->nmax],n*sizeof(X_FLOAT)); + if((not cuda->shared_data.pppm.cudable_force) && (force->kspace)) + error->warning(FLERR, "# CUDA: You asked for a Verlet integration using Cuda, " + "but selected a kspace force which has not yet been ported to Cuda"); + + if(modify_cuda->n_post_integrate_host + modify_cuda->n_pre_exchange_host + modify_cuda->n_pre_neighbor_host + modify_cuda->n_pre_force_host + modify_cuda->n_post_force_host + modify_cuda->n_end_of_step_host + modify_cuda->n_initial_integrate_host + modify_cuda->n_final_integrate_host) + error->warning(FLERR, "# CUDA: You asked for a Verlet integration using Cuda, " + "but several fixes have not yet been ported to Cuda.\n" + "This can cause a severe speed penalty due to frequent data synchronization between host and GPU."); + + if(atom->firstgroupname) + error->warning(FLERR, "Warning: 
firstgroupname is used; this will cause additional data transfers."); } - cuda->shared_data.atom.reneigh_flag=0; - cuda->shared_data.atom.update_nlocal=1; - cuda->shared_data.atom.update_nmax=1; - cuda->shared_data.atom.update_neigh=1; - cuda->shared_data.domain.update=1; - cuda->shared_data.buffer_new=1; - cuda->uploadtime=0; - cuda->downloadtime=0; - int firstreneigh=1; - - for(int i = 0; i < n; i++) - { - if(atom->nlocal==0) - error->warning(FLERR,"# CUDA: There are currently no atoms on one of the MPI processes. This is currently prone to encountering errors with USER-CUDA package. Please use the 'processors' keyword to use a more balanced processor layout."); - ntimestep = ++update->ntimestep; - ev_set(ntimestep); - - // initial time integration - - test_atom(testatom,"Pre initial"); - - MYDBG( printf("# CUDA VerletCuda::iterate: before initial_integrate\n"); ) - - modify->initial_integrate(vflag); - - MYDBG( printf("# CUDA VerletCuda::iterate: after initial_integrate\n"); ) - - if(n_post_integrate) modify->post_integrate(); - - - - // regular communication vs neighbor list rebuild - - test_atom(testatom,"Pre Exchange"); - - MYDBG( printf("# CUDA VerletCuda::iterate: before neighbor decide\n"); ) - nflag = neighbor->decide(); - if(nflag == 0) - { - MYDBG( printf("# CUDA VerletCuda::iterate: communicate\n"); ) - timer->stamp(); - - if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)) - { - //overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms) - //build communication buffers - // printf("Pre forward_comm(1)\n"); - clock_gettime(CLOCK_REALTIME,&starttotal); - cuda->shared_data.atom.reneigh_flag=0; - clock_gettime(CLOCK_REALTIME,&starttime); - timer->stamp(); - comm->forward_comm(1); - timer->stamp(Timer::COMM); - clock_gettime(CLOCK_REALTIME,&endtime); - cuda->shared_data.cuda_timings.comm_forward_total+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - - //prepare force calculation - // printf("Pre force_clear\n"); - force_clear(); - // printf("Pre Generate XType\n"); - Cuda_Pair_GenerateXType(&cuda->shared_data); - if(cuda->cu_v_radius) - Cuda_Pair_GenerateVRadius(&cuda->shared_data); - if(cuda->cu_omega_rmass) - Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - - //start force calculation asynchronus - cuda->shared_data.comm.comm_phase=1; - force->pair->compute(eflag, vflag); - timer->stamp(Timer::PAIR); - //CudaWrapper_Sync(); - - //download comm buffers from GPU, perform MPI communication and upload buffers again - clock_gettime(CLOCK_REALTIME,&starttime); - comm->forward_comm(2); - clock_gettime(CLOCK_REALTIME,&endtime); - cuda->shared_data.cuda_timings.comm_forward_total+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - timer->stamp(Timer::COMM); - - //wait for force calculation - CudaWrapper_Sync(); - timer->stamp(Timer::PAIR); - - //unpack communication buffers - clock_gettime(CLOCK_REALTIME,&starttime); - comm->forward_comm(3); - clock_gettime(CLOCK_REALTIME,&endtime); - cuda->shared_data.cuda_timings.comm_forward_total+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - - timer->stamp(Timer::COMM); - MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); ) - cuda->shared_data.cuda_timings.test1+= - endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000; - } - else - { - //perform standard forward communication - 
clock_gettime(CLOCK_REALTIME,&starttime); - comm->forward_comm(); - clock_gettime(CLOCK_REALTIME,&endtime); - cuda->shared_data.cuda_timings.comm_forward_total+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - timer->stamp(Timer::COMM); - MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); ) - } - } - else - { - int nlocalold=cuda->shared_data.atom.nlocal; - if(firstreneigh) - { - cuda->shared_data.atom.update_nlocal=1; - cuda->shared_data.atom.update_nmax=1; - firstreneigh=0; - } - cuda->shared_data.buffer_new=1; - MYDBG( printf("# CUDA VerletCuda::iterate: neighbor\n"); ) - cuda->setDomainParams(); - if(n_pre_exchange) modify->pre_exchange(); - if(atom->nlocal!=cuda->shared_data.atom.nlocal) //did someone add atoms during pre_exchange? - { - cuda->checkResize(); - cuda->uploadAll(); - } - - //check domain changes - if(domain->triclinic) domain->x2lamda(atom->nlocal); - MYDBG( printf("# CUDA VerletCuda::iterate: neighbor pbc\n"); ) - domain->pbc(); - if(domain->box_change) - { - domain->reset_box(); - comm->setup(); - if (neighbor->style) neighbor->setup_bins(); - - } - timer->stamp(); - MYDBG( printf("# CUDA VerletCuda::iterate: neighbor exchange\n"); ) - - //perform exchange of local atoms - clock_gettime(CLOCK_REALTIME,&starttime); - comm->exchange(); - clock_gettime(CLOCK_REALTIME,&endtime); - - //special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future - if(comm->nprocs>1) - { - clock_gettime(CLOCK_REALTIME,&starttime); - if(atom->special) - cuda->cu_special->upload(); - if(atom->nspecial) - cuda->cu_nspecial->upload(); - clock_gettime(CLOCK_REALTIME,&endtime); - cuda->shared_data.cuda_timings.test1+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - } - - cuda->shared_data.cuda_timings.comm_exchange_total+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - - if(nlocalold!=cuda->shared_data.atom.nlocal) cuda->shared_data.atom.update_nlocal=2; - - //sort atoms - if (sortflag && ntimestep >= atom->nextsort) atom->sort(); - MYDBG( printf("# CUDA VerletCuda::iterate: neighbor borders\n"); ) - - //generate ghost atom lists, and transfer ghost atom data - clock_gettime(CLOCK_REALTIME,&starttime); - comm->borders(); - clock_gettime(CLOCK_REALTIME,&endtime); - cuda->shared_data.cuda_timings.comm_border_total+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - - clock_gettime(CLOCK_REALTIME,&starttime); - //atom index maps are generated on CPU, and need to be transfered to GPU if they are used - if(cuda->cu_map_array) - cuda->cu_map_array->upload(); - - - if(domain->triclinic) domain->lamda2x(atom->nlocal+atom->nghost); - - if(n_pre_neighbor) modify->pre_neighbor(); - - cuda->shared_data.buffer_new=2; - if(atom->molecular) cuda->cu_molecule->download(); - MYDBG( printf("# CUDA VerletCuda::iterate: neighbor build\n"); ) - timer->stamp(Timer::COMM); - clock_gettime(CLOCK_REALTIME,&endtime); - cuda->shared_data.cuda_timings.test2+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - - //rebuild neighbor list - test_atom(testatom,"Pre Neighbor"); - neighbor->build(); - timer->stamp(Timer::NEIGHBOR); - MYDBG( printf("# CUDA VerletCuda::iterate: neighbor done\n"); ) - - //if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU - 
if(cuda->shared_data.pair.collect_forces_later) - { - if(cuda->cu_molecule) cuda->cu_molecule->download(); - cuda->cu_tag->download(); - cuda->cu_type->download(); - cuda->cu_mask->download(); - if(cuda->cu_q) cuda->cu_q->download(); - } - cuda->shared_data.comm.comm_phase=3; - } - - test_atom(testatom,"Post Exchange"); - - // force computations - - //only do force_clear if it has not been done during overlap of communication with local interactions - if(not((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3))) - force_clear(); - - if(n_pre_force) modify->pre_force(vflag); - - timer->stamp(); - - //if overlap of bonded interactions with nonbonded interactions takes place, download forces and positions - /* if(cuda->shared_data.pair.collect_forces_later) - { - cuda->cu_x->downloadAsync(2); - cuda->cu_f->downloadAsync(2); - }*/ - - if(force->pair) - { - if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)&&cuda->shared_data.pair.cudable_force) - { - //second part of force calculations in case of overlaping it with commuincation. Only interactions between local and ghost atoms are done now - //regenerate data layout for force computations, its actually only needed for the ghost atoms - cuda->shared_data.comm.comm_phase=2; - - timespec atime1,atime2; - clock_gettime(CLOCK_REALTIME,&atime1); - - Cuda_Pair_GenerateXType(&cuda->shared_data); - if(cuda->cu_v_radius) - Cuda_Pair_GenerateVRadius(&cuda->shared_data); - if(cuda->cu_omega_rmass) - Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - - clock_gettime(CLOCK_REALTIME,&atime2); - cuda->shared_data.cuda_timings.pair_xtype_conversion+= - atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; - force->pair->compute(eflag, vflag); - - } - else - { - //calculate complete pair interactions - if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll(); - else - { - //regenerate data layout for force computations, its actually only needed for the ghost atoms - timespec atime1,atime2; - clock_gettime(CLOCK_REALTIME,&atime1); - - Cuda_Pair_GenerateXType(&cuda->shared_data); - if(cuda->cu_v_radius) - Cuda_Pair_GenerateVRadius(&cuda->shared_data); - if(cuda->cu_omega_rmass) - Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - - clock_gettime(CLOCK_REALTIME,&atime2); - cuda->shared_data.cuda_timings.pair_xtype_conversion+= - atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000; - } - cuda->shared_data.comm.comm_phase=0; - force->pair->compute(eflag, vflag); - } - - if(not cuda->shared_data.pair.cudable_force) cuda->uploadAll(); - - //wait for force calculation in case of not using overlap with bonded interactions - if(not cuda->shared_data.pair.collect_forces_later) - CudaWrapper_Sync(); - - timer->stamp(Timer::PAIR); - } - - //calculate bonded interactions - if(atom->molecular) - { - cuda->cu_x->downloadAsync(2); - if(n_pre_force==0) Verlet::force_clear(); - else cuda->cu_f->downloadAsync(2); - - timer->stamp(Timer::PAIR); - - test_atom(testatom,"pre bond force"); - if(force->bond) force->bond->compute(eflag, vflag); - if(force->angle) force->angle->compute(eflag, vflag); - if(force->dihedral) force->dihedral->compute(eflag, vflag); - if(force->improper) force->improper->compute(eflag, vflag); - timer->stamp(Timer::BOND); - } - - //collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used - 
if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&(not (force->kspace&&(not cuda->shared_data.pppm.cudable_force)))) - { - clock_gettime(CLOCK_REALTIME,&starttime); - cuda->cu_f->uploadAsync(2); - - test_atom(testatom,"post molecular force"); - - - if(eflag) cuda->cu_eng_vdwl->upload(); - if(eflag) cuda->cu_eng_coul->upload(); - if(vflag) cuda->cu_virial->upload(); - Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag); - if(eflag) cuda->cu_eng_vdwl->download(); - if(eflag) cuda->cu_eng_coul->download(); - if(vflag) cuda->cu_virial->download(); - timer->stamp(Timer::PAIR); - - clock_gettime(CLOCK_REALTIME,&endtime); - cuda->shared_data.cuda_timings.pair_force_collection+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - } - - //compute kspace force - if(force->kspace) - { - if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later)) - cuda->downloadAll(); - if((not cuda->shared_data.pppm.cudable_force) && (cuda->shared_data.pair.collect_forces_later) && (not atom->molecular)) - { - cuda->cu_x->downloadAsync(2); - if(n_pre_force==0) Verlet::force_clear(); - else cuda->cu_f->downloadAsync(2); - - timer->stamp(Timer::PAIR); - } - - force->kspace->compute(eflag,vflag); - if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later)) - cuda->uploadAll(); - timer->stamp(Timer::KSPACE); - } - - //collect forces in case pair forces and kspace was overlaped - if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&((force->kspace&&(not cuda->shared_data.pppm.cudable_force)))) - { - cuda->cu_f->uploadAsync(2); - - clock_gettime(CLOCK_REALTIME,&starttime); - - if(eflag) cuda->cu_eng_vdwl->upload(); - if(eflag) cuda->cu_eng_coul->upload(); - if(vflag) cuda->cu_virial->upload(); - Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag); - if(eflag) cuda->cu_eng_vdwl->download(); - if(eflag) cuda->cu_eng_coul->download(); - if(vflag) cuda->cu_virial->download(); - timer->stamp(Timer::PAIR); - - clock_gettime(CLOCK_REALTIME,&endtime); - cuda->shared_data.cuda_timings.pair_force_collection+= - endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - } - - //send forces on ghost atoms back to other GPU: THIS SHOULD NEVER HAPPEN - if(force->newton) - { - comm->reverse_comm(); - timer->stamp(Timer::COMM); - } - test_atom(testatom,"post force"); - // force modifications, final time integration, diagnostics - - if(n_post_force) modify->post_force(vflag); - - test_atom(testatom,"pre final"); - - modify->final_integrate(); - - test_atom(testatom,"post final"); - - if(n_end_of_step) modify->end_of_step(); - - // all output - - test_atom(testatom,"pre output"); - - if(ntimestep == output->next) - { - if(not output->thermo->cudable) - cuda->downloadAll(); - timer->stamp(); - output->write(ntimestep); - timer->stamp(Timer::OUTPUT); - } - - - test_atom(testatom,"post output"); - - if(cuda->shared_data.atom.update_nlocal>0) - cuda->shared_data.atom.update_nlocal--; - if(cuda->shared_data.atom.update_nmax>0) - cuda->shared_data.atom.update_nmax--; - if(cuda->shared_data.atom.update_neigh>0) - cuda->shared_data.atom.update_neigh--; - if(cuda->shared_data.domain.update>0) - cuda->shared_data.domain.update--; - if(cuda->shared_data.buffer_new>0) - cuda->shared_data.buffer_new--; - cuda->shared_data.atom.reneigh_flag=0; - } - - - cuda->downloadAll(); - cuda->downloadAllNeighborLists(); - 
cuda->shared_data.atom.update_nlocal=1; - cuda->shared_data.atom.update_nmax=1; - cuda->shared_data.atom.update_neigh=1; - cuda->shared_data.buffer_new=1; - cuda->shared_data.domain.update=1; - cuda->oncpu = true; - cuda->finished_run = true; + cuda->uploadAll(); + + if(cuda->neighbor_decide_by_integrator && cuda->cu_xhold) { + const int n = cuda->shared_data.atom.maxhold; + CudaWrapper_CopyData(cuda->cu_xhold->dev_data(), cuda->cu_x->dev_data(), n * sizeof(X_FLOAT)); + CudaWrapper_CopyData((void*) & ((X_FLOAT*)cuda->cu_xhold->dev_data())[n], (void*) & ((X_FLOAT*)cuda->cu_x->dev_data())[atom->nmax], n * sizeof(X_FLOAT)); + CudaWrapper_CopyData((void*) & ((X_FLOAT*)cuda->cu_xhold->dev_data())[2 * n], (void*) & ((X_FLOAT*)cuda->cu_x->dev_data())[2 * atom->nmax], n * sizeof(X_FLOAT)); + } + + cuda->shared_data.atom.reneigh_flag = 0; + cuda->shared_data.atom.update_nlocal = 1; + cuda->shared_data.atom.update_nmax = 1; + cuda->shared_data.atom.update_neigh = 1; + cuda->shared_data.domain.update = 1; + cuda->shared_data.buffer_new = 1; + cuda->uploadtime = 0; + cuda->downloadtime = 0; + int firstreneigh = 1; + + for(int i = 0; i < n; i++) { + if(atom->nlocal == 0) + error->warning(FLERR, "# CUDA: There are currently no atoms on one of the MPI processes. This is currently prone to encountering errors with USER-CUDA package. Please use the 'processors' keyword to use a more balanced processor layout."); + + ntimestep = ++update->ntimestep; + ev_set(ntimestep); + + // initial time integration + + test_atom(testatom, "Pre initial"); + + MYDBG(printf("# CUDA VerletCuda::iterate: before initial_integrate\n");) + + modify->initial_integrate(vflag); + + MYDBG(printf("# CUDA VerletCuda::iterate: after initial_integrate\n");) + + if(n_post_integrate) modify->post_integrate(); + + + + // regular communication vs neighbor list rebuild + + test_atom(testatom, "Pre Exchange"); + + MYDBG(printf("# CUDA VerletCuda::iterate: before neighbor decide\n");) + nflag = neighbor->decide(); + + if(nflag == 0) { + MYDBG(printf("# CUDA VerletCuda::iterate: communicate\n");) + timer->stamp(); + + if((not(eflag || vflag)) && (cuda->shared_data.overlap_comm)) { + //overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms) + //build communication buffers + // printf("Pre forward_comm(1)\n"); + clock_gettime(CLOCK_REALTIME, &starttotal); + cuda->shared_data.atom.reneigh_flag = 0; + clock_gettime(CLOCK_REALTIME, &starttime); + timer->stamp(); + comm->forward_comm(1); + timer->stamp(TIME_COMM); + clock_gettime(CLOCK_REALTIME, &endtime); + cuda->shared_data.cuda_timings.comm_forward_total += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + + //prepare force calculation + // printf("Pre force_clear\n"); + force_clear(); + // printf("Pre Generate XType\n"); + Cuda_Pair_GenerateXType(&cuda->shared_data); + + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + + //start force calculation asynchronus + cuda->shared_data.comm.comm_phase = 1; + force->pair->compute(eflag, vflag); + timer->stamp(TIME_PAIR); + //CudaWrapper_Sync(); + + //download comm buffers from GPU, perform MPI communication and upload buffers again + clock_gettime(CLOCK_REALTIME, &starttime); + comm->forward_comm(2); + clock_gettime(CLOCK_REALTIME, &endtime); + cuda->shared_data.cuda_timings.comm_forward_total += + endtime.tv_sec - starttime.tv_sec + 1.0 * 
(endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + timer->stamp(TIME_COMM); + + //wait for force calculation + CudaWrapper_Sync(); + timer->stamp(TIME_PAIR); + + //unpack communication buffers + clock_gettime(CLOCK_REALTIME, &starttime); + comm->forward_comm(3); + clock_gettime(CLOCK_REALTIME, &endtime); + cuda->shared_data.cuda_timings.comm_forward_total += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + + timer->stamp(TIME_COMM); + MYDBG(printf("# CUDA VerletCuda::iterate: communicate done\n");) + cuda->shared_data.cuda_timings.test1 += + endtotal.tv_sec - starttotal.tv_sec + 1.0 * (endtotal.tv_nsec - starttotal.tv_nsec) / 1000000000; + } else { + //perform standard forward communication + clock_gettime(CLOCK_REALTIME, &starttime); + comm->forward_comm(); + clock_gettime(CLOCK_REALTIME, &endtime); + cuda->shared_data.cuda_timings.comm_forward_total += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + timer->stamp(TIME_COMM); + MYDBG(printf("# CUDA VerletCuda::iterate: communicate done\n");) + } + } else { + int nlocalold = cuda->shared_data.atom.nlocal; + + if(firstreneigh) { + cuda->shared_data.atom.update_nlocal = 1; + cuda->shared_data.atom.update_nmax = 1; + firstreneigh = 0; + } + + cuda->shared_data.buffer_new = 1; + MYDBG(printf("# CUDA VerletCuda::iterate: neighbor\n");) + cuda->setDomainParams(); + + if(n_pre_exchange) modify->pre_exchange(); + + if(atom->nlocal != cuda->shared_data.atom.nlocal) { //did someone add atoms during pre_exchange? + cuda->checkResize(); + cuda->uploadAll(); + } + + //check domain changes + if(domain->triclinic) domain->x2lamda(atom->nlocal); + + MYDBG(printf("# CUDA VerletCuda::iterate: neighbor pbc\n");) + domain->pbc(); + + if(domain->box_change) { + domain->reset_box(); + comm->setup(); + + if(neighbor->style) neighbor->setup_bins(); + + } + + timer->stamp(); + MYDBG(printf("# CUDA VerletCuda::iterate: neighbor exchange\n");) + + //perform exchange of local atoms + clock_gettime(CLOCK_REALTIME, &starttime); + comm->exchange(); + clock_gettime(CLOCK_REALTIME, &endtime); + + //special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future + if(comm->nprocs > 1) { + clock_gettime(CLOCK_REALTIME, &starttime); + + if(atom->special) + cuda->cu_special->upload(); + + if(atom->nspecial) + cuda->cu_nspecial->upload(); + + clock_gettime(CLOCK_REALTIME, &endtime); + cuda->shared_data.cuda_timings.test1 += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + } + + cuda->shared_data.cuda_timings.comm_exchange_total += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + + if(nlocalold != cuda->shared_data.atom.nlocal) cuda->shared_data.atom.update_nlocal = 2; + + //sort atoms + if(sortflag && ntimestep >= atom->nextsort) atom->sort(); + + MYDBG(printf("# CUDA VerletCuda::iterate: neighbor borders\n");) + + //generate ghost atom lists, and transfer ghost atom data + clock_gettime(CLOCK_REALTIME, &starttime); + comm->borders(); + clock_gettime(CLOCK_REALTIME, &endtime); + cuda->shared_data.cuda_timings.comm_border_total += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + + clock_gettime(CLOCK_REALTIME, &starttime); + + //atom index maps are generated on CPU, and need to be transfered to GPU if they are used + if(cuda->cu_map_array) + cuda->cu_map_array->upload(); + + 
+ if(domain->triclinic) domain->lamda2x(atom->nlocal + atom->nghost); + + if(n_pre_neighbor) modify->pre_neighbor(); + + cuda->shared_data.buffer_new = 2; + + if(atom->molecular) { + cuda->cu_molecule->download(); + cuda->cu_x->download(); + } + + MYDBG(printf("# CUDA VerletCuda::iterate: neighbor build\n");) + timer->stamp(TIME_COMM); + clock_gettime(CLOCK_REALTIME, &endtime); + cuda->shared_data.cuda_timings.test2 += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + + //rebuild neighbor list + test_atom(testatom, "Pre Neighbor"); + neighbor->build(); + timer->stamp(TIME_NEIGHBOR); + MYDBG(printf("# CUDA VerletCuda::iterate: neighbor done\n");) + + //if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU + if(cuda->shared_data.pair.collect_forces_later) { + if(cuda->cu_molecule) cuda->cu_molecule->download(); + + cuda->cu_tag->download(); + cuda->cu_type->download(); + cuda->cu_mask->download(); + + if(cuda->cu_q) cuda->cu_q->download(); + } + + cuda->shared_data.comm.comm_phase = 3; + } + + test_atom(testatom, "Post Exchange"); + + // force computations + + //only do force_clear if it has not been done during overlap of communication with local interactions + if(not((not(eflag || vflag)) && (cuda->shared_data.overlap_comm) && (cuda->shared_data.comm.comm_phase < 3))) + force_clear(); + + if(n_pre_force) modify->pre_force(vflag); + + timer->stamp(); + + //if overlap of bonded interactions with nonbonded interactions takes place, download forces and positions + /* if(cuda->shared_data.pair.collect_forces_later) + { + cuda->cu_x->downloadAsync(2); + cuda->cu_f->downloadAsync(2); + }*/ + + if(force->pair) { + if((not(eflag || vflag)) && (cuda->shared_data.overlap_comm) && (cuda->shared_data.comm.comm_phase < 3) && cuda->shared_data.pair.cudable_force) { + //second part of force calculations in case of overlaping it with commuincation. 
Only interactions between local and ghost atoms are done now + //regenerate data layout for force computations, its actually only needed for the ghost atoms + cuda->shared_data.comm.comm_phase = 2; + + timespec atime1, atime2; + clock_gettime(CLOCK_REALTIME, &atime1); + + Cuda_Pair_GenerateXType(&cuda->shared_data); + + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + + clock_gettime(CLOCK_REALTIME, &atime2); + cuda->shared_data.cuda_timings.pair_xtype_conversion += + atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; + force->pair->compute(eflag, vflag); + + } else { + //calculate complete pair interactions + if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll(); + else { + //regenerate data layout for force computations, its actually only needed for the ghost atoms + timespec atime1, atime2; + clock_gettime(CLOCK_REALTIME, &atime1); + + Cuda_Pair_GenerateXType(&cuda->shared_data); + + if(cuda->cu_v_radius) + Cuda_Pair_GenerateVRadius(&cuda->shared_data); + + if(cuda->cu_omega_rmass) + Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); + + clock_gettime(CLOCK_REALTIME, &atime2); + cuda->shared_data.cuda_timings.pair_xtype_conversion += + atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; + } + + cuda->shared_data.comm.comm_phase = 0; + force->pair->compute(eflag, vflag); + } + + if(not cuda->shared_data.pair.cudable_force) cuda->uploadAll(); + + //wait for force calculation in case of not using overlap with bonded interactions + if(not cuda->shared_data.pair.collect_forces_later) + CudaWrapper_Sync(); + + timer->stamp(TIME_PAIR); + } + + //calculate bonded interactions + if(atom->molecular) { + cuda->cu_x->downloadAsync(2); + + if(n_pre_force == 0) Verlet::force_clear(); + else cuda->cu_f->downloadAsync(2); + + timer->stamp(TIME_PAIR); + + test_atom(testatom, "pre bond force"); + + if(force->bond) force->bond->compute(eflag, vflag); + + if(force->angle) force->angle->compute(eflag, vflag); + + if(force->dihedral) force->dihedral->compute(eflag, vflag); + + if(force->improper) force->improper->compute(eflag, vflag); + + timer->stamp(TIME_BOND); + } + + //collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used + if(cuda->shared_data.pair.collect_forces_later && cuda->shared_data.pair.cudable_force && (not(force->kspace && (not cuda->shared_data.pppm.cudable_force)))) { + clock_gettime(CLOCK_REALTIME, &starttime); + cuda->cu_f->uploadAsync(2); + + test_atom(testatom, "post molecular force"); + + + if(eflag) cuda->cu_eng_vdwl->upload(); + + if(eflag) cuda->cu_eng_coul->upload(); + + if(vflag) cuda->cu_virial->upload(); + + Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag); + + if(eflag) cuda->cu_eng_vdwl->download(); + + if(eflag) cuda->cu_eng_coul->download(); + + if(vflag) cuda->cu_virial->download(); + + timer->stamp(TIME_PAIR); + + clock_gettime(CLOCK_REALTIME, &endtime); + cuda->shared_data.cuda_timings.pair_force_collection += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + } + + //compute kspace force + if(force->kspace) { + if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later)) + cuda->downloadAll(); + + if((not cuda->shared_data.pppm.cudable_force) && (cuda->shared_data.pair.collect_forces_later) && (not atom->molecular)) { + 
@@ -1003,56 +1114,64 @@ void VerletCuda::run(int n)
 
 void VerletCuda::force_clear()
 {
   cuda->cu_f->memset_device(0);
+
+  if(cuda->cu_torque) cuda->cu_torque->memset_device(0);
+
+  return; //The rest should not be necessary
   int i;
-  for(i=0;i<atom->nlocal;i++)
-  {
-    atom->f[i][0]=0.0;
-    atom->f[i][1]=0.0;
-    atom->f[i][2]=0.0;
+
+  for(i = 0; i < atom->nlocal; i++) {
+    atom->f[i][0] = 0.0;
+    atom->f[i][1] = 0.0;
+    atom->f[i][2] = 0.0;
   }
+
   // clear force on all particles
   // if either newton flag is set, also include ghosts
 
-  if (neighbor->includegroup == 0) {
+  if(neighbor->includegroup == 0) {
     int nall;
-    if (force->newton) nall = atom->nlocal + atom->nghost;
+
+    if(force->newton) nall = atom->nlocal + atom->nghost;
     else nall = atom->nlocal;
 
-    if (torqueflag) {
-      double **torque = atom->torque;
-      for (i = 0; i < nall; i++) {
+
+    if(torqueflag) {
+      double** torque = atom->torque;
+
+      for(i = 0; i < nall; i++) {
         torque[i][0] = 0.0;
         torque[i][1] = 0.0;
         torque[i][2] = 0.0;
       }
     }
 
-  // neighbor includegroup flag is set
-  // clear force only on initial nfirst particles
-  // if either newton flag is set, also include ghosts
+    // neighbor includegroup flag is set
+    // clear force only on initial nfirst particles
+    // if either newton flag is set, also include ghosts
 
   } else {
     int nall = atom->nfirst;
 
-    if (torqueflag) {
-      double **torque = atom->torque;
-      for (i = 0; i < nall; i++) {
+    if(torqueflag) {
+      double** torque = atom->torque;
+
+      for(i = 0; i < nall; i++) {
         torque[i][0] = 0.0;
         torque[i][1] = 0.0;
         torque[i][2] = 0.0;
       }
     }
 
-    if (force->newton) {
+    if(force->newton) {
       nall = atom->nlocal + atom->nghost;
 
-      if (torqueflag) {
-        double **torque = atom->torque;
-        for (i = atom->nlocal; i < nall; i++) {
+      if(torqueflag) {
+        double** torque = atom->torque;
+
+        for(i = atom->nlocal; i < nall; i++) {
           torque[i][0] = 0.0;
           torque[i][1] = 0.0;
           torque[i][2] = 0.0;
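force_clear() now zeroes the device-side force (and, if present, torque) arrays with a single memset and returns before the old per-atom loops. Zeroing doubles bytewise is safe because the all-bits-zero pattern is +0.0 in IEEE 754. A minimal host-side sketch of the same idea (array size is hypothetical, not from the patch):

    #include <cassert>
    #include <cstring>

    int main()
    {
      const int nall = 1000;          // local + ghost atoms, hypothetical
      static double f[1000][3];       // per-atom force accumulator

      f[42][1] = 3.14;                // pretend a force was accumulated
      std::memset(f, 0, sizeof(f));   // one call instead of a triple-store loop

      for (int i = 0; i < nall; i++)
        for (int j = 0; j < 3; j++)
          assert(f[i][j] == 0.0);     // all-bits-zero reads back as +0.0
      return 0;
    }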
@@ -1065,41 +1184,51 @@ void VerletCuda::force_clear()
 
 void VerletCuda::test_atom(int aatom, char* string) //printing properties of one atom for test purposes
 {
   if(not dotestatom) return;
 
-  bool check=false;
-  if(cuda->finished_setup) cuda->downloadAll();
-  for(int i=0;i<atom->nlocal+atom->nghost;i++)
-  {
-    if((atom->tag[i]==aatom)&&(i<atom->nlocal))
-    {
-      printf("%i # CUDA %s: " BIGINT_FORMAT " %i %e %e %e %i ",comm->me,string,update->ntimestep,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);
-      if(atom->molecular && (i<atom->nlocal))
-      {
-        printf(" // %i %i %i ",atom->num_bond[i],atom->num_angle[i],atom->num_dihedral[i]);
-        for(int k=0;k<atom->num_bond[i];k++)
-          printf("// %i %i ",atom->bond_type[i][k],atom->bond_atom[i][k]);
-      }
-      printf("\n");
-    }
-    if(i<atom->nlocal)
-    {
-      if((atom->v[i][0]<-100||atom->v[i][0]>100)||
-         (atom->v[i][1]<-100||atom->v[i][1]>100)||
-         (atom->v[i][2]<-100||atom->v[i][2]>100)||
-         (atom->v[i][0]!=atom->v[i][0])||
-         (atom->v[i][1]!=atom->v[i][1])||
-         (atom->v[i][2]!=atom->v[i][2]))
-      {printf("%i # CUDA %s velocity: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); check=true;}
-      if((atom->f[i][0]<-10000||atom->f[i][0]>10000)||
-         (atom->f[i][1]<-10000||atom->f[i][1]>10000)||
-         (atom->f[i][2]<-10000||atom->f[i][2]>10000)||
-         (atom->f[i][0]!=atom->f[i][0])||
-         (atom->f[i][1]!=atom->f[i][1])||
-         (atom->f[i][2]!=atom->f[i][2]))
-      {printf("%i # CUDA %s force: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); check=true;}
-      if(atom->tag[i]<=0)
-        printf("%i # CUDA %s tag: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);
-    }
+  bool check = false;
+
+  if(cuda->finished_setup) cuda->downloadAll();
+
+  for(int i = 0; i < atom->nlocal + atom->nghost; i++) {
+    if((atom->tag[i] == aatom) && (i < atom->nlocal)) {
+
+      printf("%i # CUDA %s: " BIGINT_FORMAT " %i %e %e %e %i ", comm->me, string, update->ntimestep, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
+
+      if(atom->molecular && (i < atom->nlocal)) {
+        printf(" // %i %i %i ", atom->num_bond[i], atom->num_angle[i], atom->num_dihedral[i]);
+
+        for(int k = 0; k < atom->num_bond[i]; k++)
+          printf("// %i %i ", atom->bond_type[i][k], atom->bond_atom[i][k]);
+      }
+
+      printf("\n");
+    }
+
+    if(i < atom->nlocal) {
+      if((atom->v[i][0] < -100 || atom->v[i][0] > 100) ||
+         (atom->v[i][1] < -100 || atom->v[i][1] > 100) ||
+         (atom->v[i][2] < -100 || atom->v[i][2] > 100) ||
+         (atom->v[i][0] != atom->v[i][0]) ||
+         (atom->v[i][1] != atom->v[i][1]) ||
+         (atom->v[i][2] != atom->v[i][2])) {
+        printf("%i # CUDA %s velocity: %i %e %e %e %i\n", comm->me, string, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
+        check = true;
+      }
+
+      if((atom->f[i][0] < -10000 || atom->f[i][0] > 10000) ||
+         (atom->f[i][1] < -10000 || atom->f[i][1] > 10000) ||
+         (atom->f[i][2] < -10000 || atom->f[i][2] > 10000) ||
+         (atom->f[i][0] != atom->f[i][0]) ||
+         (atom->f[i][1] != atom->f[i][1]) ||
+         (atom->f[i][2] != atom->f[i][2])) {
+        printf("%i # CUDA %s force: %i %e %e %e %i\n", comm->me, string, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
+        check = true;
+      }
+
+      if(atom->tag[i] <= 0)
+        printf("%i # CUDA %s tag: %i %e %e %e %i\n", comm->me, string, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
+    }
   }
+
+  if(check) exit(0);
 }
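test_atom() above flags non-finite velocities and forces with self-comparisons such as atom->v[i][0] != atom->v[i][0], which is true only for NaN. A standalone illustration of why that works (not part of the patch):

    #include <cassert>
    #include <cmath>
    #include <limits>

    int main()
    {
      double ok  = 1.0;
      double bad = std::numeric_limits<double>::quiet_NaN();

      assert(!(ok != ok));       // ordinary values compare equal to themselves
      assert(bad != bad);        // NaN is the only value that does not
      assert(std::isnan(bad));   // the equivalent <cmath> spelling
      return 0;
    }

Note the idiom breaks under -ffast-math, which lets the compiler assume x != x is always false.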
diff --git a/src/compute_property_local.cpp b/src/compute_property_local.cpp
index 32787313ef..c9dff6d999 100644
--- a/src/compute_property_local.cpp
+++ b/src/compute_property_local.cpp
@@ -62,6 +62,18 @@ ComputePropertyLocal::ComputePropertyLocal(LAMMPS *lmp, int narg, char **arg) :
         error->all(FLERR,
                    "Compute property/local cannot use these inputs together");
       kindflag = NEIGH;
+    } else if (strcmp(arg[iarg],"ntype1") == 0) {
+      pack_choice[i] = &ComputePropertyLocal::pack_ptype1;
+      if (kindflag != NONE && kindflag != NEIGH)
+        error->all(FLERR,
+                   "Compute property/local cannot use these inputs together");
+      kindflag = NEIGH;
+    } else if (strcmp(arg[iarg],"ntype2") == 0) {
+      pack_choice[i] = &ComputePropertyLocal::pack_ptype2;
+      if (kindflag != NONE && kindflag != NEIGH)
+        error->all(FLERR,
+                   "Compute property/local cannot use these inputs together");
+      kindflag = NEIGH;
 
     } else if (strcmp(arg[iarg],"patom1") == 0) {
       pack_choice[i] = &ComputePropertyLocal::pack_patom1;
@@ -75,6 +87,18 @@ ComputePropertyLocal::ComputePropertyLocal(LAMMPS *lmp, int narg, char **arg) :
         error->all(FLERR,
                    "Compute property/local cannot use these inputs together");
       kindflag = PAIR;
+    } else if (strcmp(arg[iarg],"ptype1") == 0) {
+      pack_choice[i] = &ComputePropertyLocal::pack_ptype1;
+      if (kindflag != NONE && kindflag != PAIR)
+        error->all(FLERR,
+                   "Compute property/local cannot use these inputs together");
+      kindflag = PAIR;
+    } else if (strcmp(arg[iarg],"ptype2") == 0) {
+      pack_choice[i] = &ComputePropertyLocal::pack_ptype2;
+      if (kindflag != NONE && kindflag != PAIR)
+        error->all(FLERR,
+                   "Compute property/local cannot use these inputs together");
+      kindflag = PAIR;
 
     } else if (strcmp(arg[iarg],"batom1") == 0) {
       pack_choice[i] = &ComputePropertyLocal::pack_batom1;
@@ -359,8 +383,8 @@ int ComputePropertyLocal::count_pairs(int allflag, int forceflag)
       if (forceflag && rsq >= cutsq[itype][jtype]) continue;
 
       if (allflag) {
-        indices[m][0] = tag[i];
-        indices[m][1] = tag[j];
+        indices[m][0] = i;
+        indices[m][1] = j;
       }
       m++;
     }
@@ -582,8 +606,12 @@ double ComputePropertyLocal::memory_usage()
 
 void ComputePropertyLocal::pack_patom1(int n)
 {
+  int i;
+  int *tag = atom->tag;
+
   for (int m = 0; m < ncount; m++) {
-    buf[n] = indices[m][0];
+    i = indices[m][0];
+    buf[n] = tag[i];
     n += nvalues;
   }
 }
@@ -592,8 +620,40 @@ void ComputePropertyLocal::pack_patom1(int n)
 
 void ComputePropertyLocal::pack_patom2(int n)
 {
+  int i;
+  int *tag = atom->tag;
+
   for (int m = 0; m < ncount; m++) {
-    buf[n] = indices[m][1];
+    i = indices[m][1];
+    buf[n] = tag[i];
+    n += nvalues;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputePropertyLocal::pack_ptype1(int n)
+{
+  int i;
+  int *type = atom->type;
+
+  for (int m = 0; m < ncount; m++) {
+    i = indices[m][0];
+    buf[n] = type[i];
+    n += nvalues;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputePropertyLocal::pack_ptype2(int n)
+{
+  int i;
+  int *type = atom->type;
+
+  for (int m = 0; m < ncount; m++) {
+    i = indices[m][1];
+    buf[n] = type[i];
     n += nvalues;
   }
 }
diff --git a/src/compute_property_local.h b/src/compute_property_local.h
index 70f24ac540..343e069984 100644
--- a/src/compute_property_local.h
+++ b/src/compute_property_local.h
@@ -58,6 +58,8 @@ class ComputePropertyLocal : public Compute {
   void pack_patom1(int);
   void pack_patom2(int);
+  void pack_ptype1(int);
+  void pack_ptype2(int);
   void pack_batom1(int);
   void pack_batom2(int);
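The new ntype1/ntype2 and ptype1/ptype2 keywords plug into the same pointer-to-member dispatch table (pack_choice) as the existing keywords: parsing registers one member-function pointer per output column, and count_pairs() now stores local atom indices that the pack_* routines translate to tags or types at output time. A reduced sketch of the dispatch mechanism with a toy class (hypothetical names, not LAMMPS code):

    #include <cstdio>

    class Packer {
     public:
      typedef void (Packer::*FnPtrPack)(int);   // same shape as pack_choice entries

      void pack_ptype1(int n) { printf("column %d: type of atom 1\n", n); }
      void pack_ptype2(int n) { printf("column %d: type of atom 2\n", n); }

      void pack_all()
      {
        // registration, as done while the compute parses its arguments
        FnPtrPack pack_choice[2] = { &Packer::pack_ptype1, &Packer::pack_ptype2 };

        // invocation, once per requested output column
        for (int i = 0; i < 2; i++)
          (this->*pack_choice[i])(i);
      }
    };

    int main()
    {
      Packer p;
      p.pack_all();
      return 0;
    }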
error->all(FLERR,"Compute used in variable thermo keyword between runs " @@ -1081,7 +1088,8 @@ int Thermo::evaluate_keyword(char *word, double *answer) } else if (strcmp(word,"enthalpy") == 0) { if (!pe) - error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe"); + error->all(FLERR, + "Thermo keyword in variable requires thermo to use/init pe"); if (update->whichflag == 0) { if (pe->invoked_scalar != update->ntimestep) error->all(FLERR,"Compute used in variable thermo keyword between runs " @@ -1118,7 +1126,8 @@ int Thermo::evaluate_keyword(char *word, double *answer) if (update->eflag_global != update->ntimestep) error->all(FLERR,"Energy was not tallied on needed timestep"); if (!pe) - error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe"); + error->all(FLERR, + "Thermo keyword in variable requires thermo to use/init pe"); pe->invoked_flag |= INVOKED_SCALAR; compute_evdwl(); @@ -1126,7 +1135,8 @@ int Thermo::evaluate_keyword(char *word, double *answer) if (update->eflag_global != update->ntimestep) error->all(FLERR,"Energy was not tallied on needed timestep"); if (!pe) - error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe"); + error->all(FLERR, + "Thermo keyword in variable requires thermo to use/init pe"); pe->invoked_flag |= INVOKED_SCALAR; compute_ecoul(); @@ -1134,7 +1144,8 @@ int Thermo::evaluate_keyword(char *word, double *answer) if (update->eflag_global != update->ntimestep) error->all(FLERR,"Energy was not tallied on needed timestep"); if (!pe) - error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe"); + error->all(FLERR, + "Thermo keyword in variable requires thermo to use/init pe"); pe->invoked_flag |= INVOKED_SCALAR; compute_epair(); @@ -1142,7 +1153,8 @@ int Thermo::evaluate_keyword(char *word, double *answer) if (update->eflag_global != update->ntimestep) error->all(FLERR,"Energy was not tallied on needed timestep"); if (!pe) - error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe"); + error->all(FLERR, + "Thermo keyword in variable requires thermo to use/init pe"); pe->invoked_flag |= INVOKED_SCALAR; compute_ebond(); @@ -1150,7 +1162,8 @@ int Thermo::evaluate_keyword(char *word, double *answer) if (update->eflag_global != update->ntimestep) error->all(FLERR,"Energy was not tallied on needed timestep"); if (!pe) - error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe"); + error->all(FLERR, + "Thermo keyword in variable requires thermo to use/init pe"); pe->invoked_flag |= INVOKED_SCALAR; compute_eangle(); @@ -1158,7 +1171,8 @@ int Thermo::evaluate_keyword(char *word, double *answer) if (update->eflag_global != update->ntimestep) error->all(FLERR,"Energy was not tallied on needed timestep"); if (!pe) - error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe"); + error->all(FLERR, + "Thermo keyword in variable requires thermo to use/init pe"); pe->invoked_flag |= INVOKED_SCALAR; compute_edihed(); @@ -1166,7 +1180,8 @@ int Thermo::evaluate_keyword(char *word, double *answer) if (update->eflag_global != update->ntimestep) error->all(FLERR,"Energy was not tallied on needed timestep"); if (!pe) - error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe"); + error->all(FLERR, + "Thermo keyword in variable requires thermo to use/init pe"); pe->invoked_flag |= INVOKED_SCALAR; compute_eimp(); @@ -1174,7 +1189,8 @@ int Thermo::evaluate_keyword(char *word, double *answer) if (update->eflag_global != 
diff --git a/src/thermo.cpp b/src/thermo.cpp
index e633c5624e..fde5ffcd41 100644
--- a/src/thermo.cpp
+++ b/src/thermo.cpp
@@ -968,13 +968,15 @@ int Thermo::evaluate_keyword(char *word, double *answer)
 
   } else if (strcmp(word,"elapsed") == 0) {
     if (update->whichflag == 0)
-      error->all(FLERR,"This variable thermo keyword cannot be used between runs");
+      error->all(FLERR,
+                 "This variable thermo keyword cannot be used between runs");
     compute_elapsed();
     dvalue = bivalue;
 
   } else if (strcmp(word,"elaplong") == 0) {
     if (update->whichflag == 0)
-      error->all(FLERR,"This variable thermo keyword cannot be used between runs");
+      error->all(FLERR,
+                 "This variable thermo keyword cannot be used between runs");
     compute_elapsed_long();
     dvalue = bivalue;
 
@@ -983,17 +985,20 @@ int Thermo::evaluate_keyword(char *word, double *answer)
 
   } else if (strcmp(word,"cpu") == 0) {
     if (update->whichflag == 0)
-      error->all(FLERR,"This variable thermo keyword cannot be used between runs");
+      error->all(FLERR,
+                 "This variable thermo keyword cannot be used between runs");
     compute_cpu();
 
   } else if (strcmp(word,"tpcpu") == 0) {
     if (update->whichflag == 0)
-      error->all(FLERR,"This variable thermo keyword cannot be used between runs");
+      error->all(FLERR,
+                 "This variable thermo keyword cannot be used between runs");
     compute_tpcpu();
 
   } else if (strcmp(word,"spcpu") == 0) {
     if (update->whichflag == 0)
-      error->all(FLERR,"This variable thermo keyword cannot be used between runs");
+      error->all(FLERR,
+                 "This variable thermo keyword cannot be used between runs");
     compute_spcpu();
 
   } else if (strcmp(word,"atoms") == 0) {
@@ -1030,7 +1035,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
 
   } else if (strcmp(word,"pe") == 0) {
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     if (update->whichflag == 0) {
       if (pe->invoked_scalar != update->ntimestep)
         error->all(FLERR,"Compute used in variable thermo keyword between runs "
@@ -1057,7 +1063,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
 
   } else if (strcmp(word,"etotal") == 0) {
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     if (update->whichflag == 0) {
       if (pe->invoked_scalar != update->ntimestep)
         error->all(FLERR,"Compute used in variable thermo keyword between runs "
@@ -1081,7 +1088,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
 
   } else if (strcmp(word,"enthalpy") == 0) {
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     if (update->whichflag == 0) {
       if (pe->invoked_scalar != update->ntimestep)
         error->all(FLERR,"Compute used in variable thermo keyword between runs "
@@ -1118,7 +1126,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
       error->all(FLERR,"Energy was not tallied on needed timestep");
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_evdwl();
 
@@ -1126,7 +1135,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
       error->all(FLERR,"Energy was not tallied on needed timestep");
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_ecoul();
 
@@ -1134,7 +1144,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
       error->all(FLERR,"Energy was not tallied on needed timestep");
    if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_epair();
 
@@ -1142,7 +1153,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
       error->all(FLERR,"Energy was not tallied on needed timestep");
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_ebond();
 
@@ -1150,7 +1162,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
       error->all(FLERR,"Energy was not tallied on needed timestep");
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_eangle();
 
@@ -1158,7 +1171,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
       error->all(FLERR,"Energy was not tallied on needed timestep");
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_edihed();
 
@@ -1166,7 +1180,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
       error->all(FLERR,"Energy was not tallied on needed timestep");
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_eimp();
 
@@ -1174,7 +1189,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
       error->all(FLERR,"Energy was not tallied on needed timestep");
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_emol();
 
@@ -1182,7 +1198,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
      error->all(FLERR,"Energy was not tallied on needed timestep");
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_elong();
 
@@ -1190,7 +1207,8 @@ int Thermo::evaluate_keyword(char *word, double *answer)
     if (update->eflag_global != update->ntimestep)
       error->all(FLERR,"Energy was not tallied on needed timestep");
     if (!pe)
-      error->all(FLERR,"Thermo keyword in variable requires thermo to use/init pe");
+      error->all(FLERR,
+                 "Thermo keyword in variable requires thermo to use/init pe");
     pe->invoked_flag |= INVOKED_SCALAR;
     compute_etail();
diff --git a/src/version.h b/src/version.h
index e6560d0f0c..704cb3282e 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1 +1 @@
-#define LAMMPS_VERSION "14 Aug 2012"
+#define LAMMPS_VERSION "16 Aug 2012"