diff --git a/doc/package.html b/doc/package.html
index f31f8db721..2263301ebe 100644
--- a/doc/package.html
+++ b/doc/package.html
@@ -25,22 +25,24 @@
     last = ID of last GPU to be used on each node
     split = fraction of particles assigned to the GPU
     zero or more keyword/value pairs may be appended
-    keywords = <I>threads_per_atom</I>
-    <I>threads_per_atom</I> value = Nthreads
-      Nthreads = # of GPU threads used per atom
+    keywords = <I>threads_per_atom</I> or <I>cellsize</I>
+      <I>threads_per_atom</I> value = Nthreads
+        Nthreads = # of GPU threads used per atom
+      <I>cellsize</I> value = dist
+        dist = length (distance units) in each dimension for neighbor bins
   <I>cuda</I> args = keyword value ...
     one or more keyword/value pairs may be appended
     keywords = <I>gpu/node</I> or <I>gpu/node/special</I> or <I>timing</I> or <I>test</I> or <I>override/bpa</I>
-    <I>gpu/node</I> value = N
-      N = number of GPUs to be used per node
-    <I>gpu/node/special</I> values = N gpu1 .. gpuN
-      N = number of GPUs to be used per node
-      gpu1 .. gpuN = N IDs of the GPUs to use
-    <I>timing</I> values = none
-    <I>test</I> values = id
-      id = atom-ID of a test particle
-    <I>override/bpa</I> values = flag
-      flag = 0 for TpA algorithm, 1 for BpA algorithm 
+      <I>gpu/node</I> value = N
+        N = number of GPUs to be used per node
+      <I>gpu/node/special</I> values = N gpu1 .. gpuN
+        N = number of GPUs to be used per node
+        gpu1 .. gpuN = N IDs of the GPUs to use
+      <I>timing</I> values = none
+      <I>test</I> values = id
+        id = atom-ID of a test particle
+      <I>override/bpa</I> values = flag
+        flag = 0 for TpA algorithm, 1 for BpA algorithm 
   <I>omp</I> args = Nthreads mode
     Nthreads = # of OpenMP threads to associate with each MPI process
     mode = force or force/neigh (optional) 
@@ -133,6 +135,18 @@ large cutoffs or with a small number of particles per GPU, increasing
 the value can improve performance. The number of threads per atom must
 be a power of 2 and currently cannot be greater than 32.
 </P>
+<P>The <I>cellsize</I> keyword can be used to control the size of the cells used
+for binning atoms in neighbor list calculations. Setting this value is 
+normally not needed; the optimal value is close to the default 
+(equal to the cutoff distance for the short-range interactions
+plus the neighbor skin). GPUs can perform efficiently with much larger cutoffs
+than CPUs, and this can be used to reduce the time required for long-range
+calculations or in some cases to eliminate them with models such as
+<A HREF = "pair_coul.html">coul/wolf</A> or <A HREF = "pair_coul.html">coul/dsf</A>. For very large cutoffs,
+it can be more efficient to use smaller values for <I>cellsize</I> in parallel
+simulations. For example, with a cutoff of 20*sigma and a neighbor skin of
+sigma, a <I>cellsize</I> of 5.25*sigma (one quarter of the cutoff plus the skin) can be efficient for parallel simulations.
+</P>
 <HR>
 
 <P>The <I>cuda</I> style invokes options associated with the use of the
diff --git a/doc/package.txt b/doc/package.txt
index c9b6d9681f..fb2431f8ad 100644
--- a/doc/package.txt
+++ b/doc/package.txt
@@ -20,22 +20,24 @@ args = arguments specific to the style :l
     last = ID of last GPU to be used on each node
     split = fraction of particles assigned to the GPU
     zero or more keyword/value pairs may be appended
-    keywords = {threads_per_atom}
-    {threads_per_atom} value = Nthreads
-      Nthreads = # of GPU threads used per atom
+    keywords = {threads_per_atom} or {cellsize}
+      {threads_per_atom} value = Nthreads
+        Nthreads = # of GPU threads used per atom
+      {cellsize} value = dist
+        dist = length (distance units) in each dimension for neighbor bins
   {cuda} args = keyword value ...
     one or more keyword/value pairs may be appended
     keywords = {gpu/node} or {gpu/node/special} or {timing} or {test} or {override/bpa}
-    {gpu/node} value = N
-      N = number of GPUs to be used per node
-    {gpu/node/special} values = N gpu1 .. gpuN
-      N = number of GPUs to be used per node
-      gpu1 .. gpuN = N IDs of the GPUs to use
-    {timing} values = none
-    {test} values = id
-      id = atom-ID of a test particle
-    {override/bpa} values = flag
-      flag = 0 for TpA algorithm, 1 for BpA algorithm 
+      {gpu/node} value = N
+        N = number of GPUs to be used per node
+      {gpu/node/special} values = N gpu1 .. gpuN
+        N = number of GPUs to be used per node
+        gpu1 .. gpuN = N IDs of the GPUs to use
+      {timing} values = none
+      {test} values = id
+        id = atom-ID of a test particle
+      {override/bpa} values = flag
+        flag = 0 for TpA algorithm, 1 for BpA algorithm 
   {omp} args = Nthreads mode
     Nthreads = # of OpenMP threads to associate with each MPI process
     mode = force or force/neigh (optional) :pre
@@ -127,6 +129,18 @@ large cutoffs or with a small number of particles per GPU, increasing
 the value can improve performance. The number of threads per atom must
 be a power of 2 and currently cannot be greater than 32.
 
+The {cellsize} keyword can be used to control the size of the cells used
+for binning atoms in neighbor list calculations. Setting this value is 
+normally not needed; the optimal value is close to the default 
+(equal to the cutoff distance for the short-range interactions
+plus the neighbor skin). GPUs can perform efficiently with much larger cutoffs
+than CPUs, and this can be used to reduce the time required for long-range
+calculations or in some cases to eliminate them with models such as
+"coul/wolf"_pair_coul.html or "coul/dsf"_pair_coul.html. For very large cutoffs,
+it can be more efficient to use smaller values for {cellsize} in parallel
+simulations. For example, with a cutoff of 20*sigma and a neighbor skin of
+sigma, a {cellsize} of 5.25*sigma (one quarter of the cutoff plus the skin) can be efficient for parallel simulations.
+
 :line
 
 The {cuda} style invokes options associated with the use of the
diff --git a/doc/pair_coul.html b/doc/pair_coul.html
index 62cab162ac..153cfcb150 100644
--- a/doc/pair_coul.html
+++ b/doc/pair_coul.html
@@ -17,6 +17,10 @@
 </H3>
 <H3>pair_style coul/debye/omp command 
 </H3>
+<H3>pair_style coul/dsf command 
+</H3>
+<H3>pair_style coul/dsf/gpu command 
+</H3>
 <H3>pair_style coul/long command 
 </H3>
 <H3>pair_style coul/long/omp command 
@@ -31,6 +35,7 @@
 </P>
 <PRE>pair_style coul/cut cutoff
 pair_style coul/debye kappa cutoff
+pair_style coul/dsf alpha cutoff
 pair_style coul/long cutoff
 pair_style coul/long/gpu cutoff 
 pair_style coul/wolf alpha cutoff 
@@ -49,6 +54,9 @@ pair_coeff 2 2 3.5
 pair_coeff * *
 pair_coeff 2 2 3.5 
 </PRE>
+<PRE>pair_style coul/dsf 0.05 10.0
+pair_coeff * * 
+</PRE>
 <PRE>pair_style coul/long 10.0
 pair_coeff * * 
 </PRE>
@@ -75,6 +83,17 @@ Coulombic term, given by
 <P>where kappa is the Debye length.  This potential is another way to
 mimic the screening effect of a polar solvent.
 </P>
+<P>Style <I>coul/dsf</I> computes Coulombic interactions via the damped 
+shifted force model described in <A HREF = "#Fennell">Fennell</A>, given by:
+</P>
+<CENTER><IMG SRC = "Eqs/pair_coul_dsf.jpg">
+</CENTER>
+<P>where <I>alpha</I> is the damping parameter and erfc() is the
+complementary error-function. The potential corrects issues in the
+Wolf model (described below) to provide consistent forces and energies
+(the Wolf potential is not differentiable at the cutoff) and smooth
+decay to zero.
+</P>
 <P>Style <I>coul/wolf</I> computes Coulombic interactions via the Wolf
 summation method, described in <A HREF = "#Wolf">Wolf</A>, given by:
 </P>
@@ -193,5 +212,11 @@ hybrid/overlay</A>
 <A NAME = "Wolf"></A>
 
 <P><B>(Wolf)</B> D. Wolf, P. Keblinski, S. R. Phillpot, J. Eggebrecht, J Chem
-Phys, 110, 8254 (1999).</P>
+Phys, 110, 8254 (1999).
+</P>
+<A NAME = "Fennell"></A>
+
+<P><B>(Fennell)</B> C. J. Fennell, J. D. Gezelter, J Chem Phys, 124, 
+234104 (2006).
+</P>
 </HTML>
diff --git a/doc/pair_coul.txt b/doc/pair_coul.txt
index bc6cf84db5..060c4a99ed 100644
--- a/doc/pair_coul.txt
+++ b/doc/pair_coul.txt
@@ -10,6 +10,8 @@ pair_style coul/cut command :h3
 pair_style coul/cut/omp command :h3
 pair_style coul/debye command :h3
 pair_style coul/debye/omp command :h3
+pair_style coul/dsf command :h3
+pair_style coul/dsf/gpu command :h3
 pair_style coul/long command :h3
 pair_style coul/long/omp command :h3
 pair_style coul/long/gpu command :h3
@@ -20,6 +22,7 @@ pair_style coul/wolf/omp command :h3
 
 pair_style coul/cut cutoff
 pair_style coul/debye kappa cutoff
+pair_style coul/dsf alpha cutoff
 pair_style coul/long cutoff
 pair_style coul/long/gpu cutoff 
 pair_style coul/wolf alpha cutoff :pre
@@ -38,6 +41,9 @@ pair_style coul/debye 1.4 3.0
 pair_coeff * *
 pair_coeff 2 2 3.5 :pre
 
+pair_style coul/dsf 0.05 10.0
+pair_coeff * * :pre
+
 pair_style coul/long 10.0
 pair_coeff * * :pre
 
@@ -64,6 +70,17 @@ Coulombic term, given by
 where kappa is the Debye length.  This potential is another way to
 mimic the screening effect of a polar solvent.
 
+Style {coul/dsf} computes Coulombic interactions via the damped 
+shifted force model described in "Fennell"_#Fennell, given by:
+
+:c,image(Eqs/pair_coul_dsf.jpg)
+
+where {alpha} is the damping parameter and erfc() is the
+complementary error-function. The potential corrects issues in the
+Wolf model (described below) to provide consistent forces and energies
+(the Wolf potential is not differentiable at the cutoff) and smooth
+decay to zero.
+
 Style {coul/wolf} computes Coulombic interactions via the Wolf
 summation method, described in "Wolf"_#Wolf, given by:
 
@@ -181,4 +198,8 @@ hybrid/overlay"_pair_hybrid.html
 
 :link(Wolf)
 [(Wolf)] D. Wolf, P. Keblinski, S. R. Phillpot, J. Eggebrecht, J Chem
-Phys, 110, 8254 (1999).
\ No newline at end of file
+Phys, 110, 8254 (1999).
+
+:link(Fennell)
+[(Fennell)] C. J. Fennell, J. D. Gezelter, J Chem Phys, 124, 
+234104 (2006).
diff --git a/doc/pair_lj.html b/doc/pair_lj.html
index 828624190a..355e924eab 100644
--- a/doc/pair_lj.html
+++ b/doc/pair_lj.html
@@ -37,6 +37,10 @@
 </H3>
 <H3>pair_style lj/cut/coul/debye/omp command 
 </H3>
+<H3>pair_style lj/cut/coul/dsf command 
+</H3>
+<H3>pair_style lj/cut/coul/dsf/gpu command 
+</H3>
 <H3>pair_style lj/cut/coul/long command 
 </H3>
 <H3>pair_style lj/cut/coul/long/cuda command 
@@ -57,7 +61,7 @@
 </P>
 <PRE>pair_style style args 
 </PRE>
-<UL><LI>style = <I>lj/cut</I> or <I>lj/cut/coul/cut</I> or <I>lj/cut/coul/debye</I> or <I>lj/cut/coul/long</I> or <I>lj/cut/coul/long/tip4p</I>
+<UL><LI>style = <I>lj/cut</I> or <I>lj/cut/coul/cut</I> or <I>lj/cut/coul/debye</I> or <I>lj/cut/coul/dsf</I> or <I>lj/cut/coul/long</I> or <I>lj/cut/coul/long/tip4p</I>
 <LI>args = list of arguments for a particular style 
 </UL>
 <PRE>  <I>lj/cut</I> args = cutoff
@@ -69,6 +73,10 @@
     kappa = inverse of the Debye length (inverse distance units)
     cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
     cutoff2 = global cutoff for Coulombic (optional) (distance units)
+  <I>lj/cut/coul/dsf</I> args = alpha cutoff (cutoff2)
+    alpha = damping parameter (inverse distance units)
+    cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
+    cutoff2 = global cutoff for Coulombic (optional) (distance units)
   <I>lj/cut/coul/long</I> args = cutoff (cutoff2)
     cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
     cutoff2 = global cutoff for Coulombic (optional) (distance units)
@@ -97,6 +105,10 @@ pair_coeff * * 1.0 1.0
 pair_coeff 1 1 1.0 1.5 2.5
 pair_coeff 1 1 1.0 1.5 2.5 5.0 
 </PRE>
+<PRE>pair_style lj/cut/coul/dsf 0.05 2.5 10.0
+pair_coeff * * 1.0 1.0
+pair_coeff 1 1 1.0 1.0 2.5 
+</PRE>
 <PRE>pair_style lj/cut/coul/long 10.0
 pair_style lj/cut/coul/long 10.0 8.0
 pair_coeff * * 100.0 3.0
@@ -135,6 +147,23 @@ to the Coulombic term, given by
 <P>where kappa is the inverse of the Debye length.  This potential is
 another way to mimic the screening effect of a polar solvent.
 </P>
+<P>Style <I>lj/cut/coul/dsf</I> computes the Coulombic term via the damped 
+shifted force model described in <A HREF = "#Fennell">Fennell</A>, given by:
+</P>
+<CENTER><IMG SRC = "Eqs/pair_coul_dsf.jpg">
+</CENTER>
+<P>where <I>alpha</I> is the damping parameter and erfc() is the complementary
+error-function. This potential is essentially a short-range,
+spherically-truncated, charge-neutralized, shifted, pairwise <I>1/r</I>
+summation.  The potential is based on Wolf summation, proposed as an
+alternative to Ewald summation for condensed phase systems where
+charge screening causes electrostatic interactions to become
+effectively short-ranged. In order for the electrostatic sum to be
+absolutely convergent, charge neutralization within the cutoff radius
+is enforced by shifting the potential through placement of image
+charges on the cutoff sphere. Convergence can often be improved by
+setting <I>alpha</I> to a small non-zero value.
+</P>
 <P>Style <I>lj/cut/coul/long</I> computes the same Coulombic interactions as
 style <I>lj/cut/coul/cut</I> except that an additional damping factor is
 applied to the Coulombic term so it can be used in conjunction with
@@ -283,4 +312,9 @@ default.
 <P><B>(Jorgensen)</B> Jorgensen, Chandrasekhar, Madura, Impey, Klein, J Chem
 Phys, 79, 926 (1983).
 </P>
+<A NAME = "Fennell"></A>
+
+<P><B>(Fennell)</B> C. J. Fennell, J. D. Gezelter, J Chem Phys, 124, 
+234104 (2006).
+</P>
 </HTML>
diff --git a/doc/pair_lj.txt b/doc/pair_lj.txt
index 4340b15251..e87ffbcb0d 100644
--- a/doc/pair_lj.txt
+++ b/doc/pair_lj.txt
@@ -20,6 +20,8 @@ pair_style lj/cut/coul/debye command :h3
 pair_style lj/cut/coul/debye/cuda command :h3
 pair_style lj/cut/coul/debye/gpu command :h3
 pair_style lj/cut/coul/debye/omp command :h3
+pair_style lj/cut/coul/dsf command :h3
+pair_style lj/cut/coul/dsf/gpu command :h3
 pair_style lj/cut/coul/long command :h3
 pair_style lj/cut/coul/long/cuda command :h3
 pair_style lj/cut/coul/long/gpu command :h3
@@ -33,7 +35,7 @@ pair_style lj/cut/coul/long/tip4p/opt command :h3
 
 pair_style style args :pre
 
-style = {lj/cut} or {lj/cut/coul/cut} or {lj/cut/coul/debye} or {lj/cut/coul/long} or {lj/cut/coul/long/tip4p}
+style = {lj/cut} or {lj/cut/coul/cut} or {lj/cut/coul/debye} or {lj/cut/coul/dsf} or {lj/cut/coul/long} or {lj/cut/coul/long/tip4p}
 args = list of arguments for a particular style :ul
   {lj/cut} args = cutoff
     cutoff = global cutoff for Lennard Jones interactions (distance units)
@@ -44,6 +46,10 @@ args = list of arguments for a particular style :ul
     kappa = inverse of the Debye length (inverse distance units)
     cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
     cutoff2 = global cutoff for Coulombic (optional) (distance units)
+  {lj/cut/coul/dsf} args = alpha cutoff (cutoff2)
+    alpha = damping parameter (inverse distance units)
+    cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
+    cutoff2 = global cutoff for Coulombic (optional) (distance units)
   {lj/cut/coul/long} args = cutoff (cutoff2)
     cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
     cutoff2 = global cutoff for Coulombic (optional) (distance units)
@@ -72,6 +78,10 @@ pair_coeff * * 1.0 1.0
 pair_coeff 1 1 1.0 1.5 2.5
 pair_coeff 1 1 1.0 1.5 2.5 5.0 :pre
 
+pair_style lj/cut/coul/dsf 0.05 2.5 10.0
+pair_coeff * * 1.0 1.0
+pair_coeff 1 1 1.0 1.0 2.5 :pre
+
 pair_style lj/cut/coul/long 10.0
 pair_style lj/cut/coul/long 10.0 8.0
 pair_coeff * * 100.0 3.0
@@ -110,6 +120,23 @@ to the Coulombic term, given by
 where kappa is the inverse of the Debye length.  This potential is
 another way to mimic the screening effect of a polar solvent.
 
+Style {lj/cut/coul/dsf} computes the Coulombic term via the damped 
+shifted force model described in "Fennell"_#Fennell, given by:
+
+:c,image(Eqs/pair_coul_dsf.jpg)
+
+where {alpha} is the damping parameter and erfc() is the complementary
+error-function. This potential is essentially a short-range,
+spherically-truncated, charge-neutralized, shifted, pairwise {1/r}
+summation.  The potential is based on Wolf summation, proposed as an
+alternative to Ewald summation for condensed phase systems where
+charge screening causes electrostatic interactions to become
+effectively short-ranged. In order for the electrostatic sum to be
+absolutely convergent, charge neutralization within the cutoff radius
+is enforced by shifting the potential through placement of image
+charges on the cutoff sphere. Convergence can often be improved by
+setting {alpha} to a small non-zero value.
+
 Style {lj/cut/coul/long} computes the same Coulombic interactions as
 style {lj/cut/coul/cut} except that an additional damping factor is
 applied to the Coulombic term so it can be used in conjunction with
@@ -256,3 +283,7 @@ default.
 :link(Jorgensen)
 [(Jorgensen)] Jorgensen, Chandrasekhar, Madura, Impey, Klein, J Chem
 Phys, 79, 926 (1983).
+
+:link(Fennell)
+[(Fennell)] C. J. Fennell, J. D. Gezelter, J Chem Phys, 124, 
+234104 (2006).