From 4ae4792b009942900b5636a2d5283bd2adf3bbd0 Mon Sep 17 00:00:00 2001 From: pscrozi Date: Thu, 4 Feb 2010 21:33:18 +0000 Subject: [PATCH] Updating lib/gpu to version 2. git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3785 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Makefile.cyg | 72 ++++++ lib/gpu/Makefile.nvidia | 43 ++-- lib/gpu/README | 51 ++-- lib/gpu/gb_gpu.cu | 78 +++--- lib/gpu/gb_gpu_extra.h | 32 ++- lib/gpu/gb_gpu_kernel.h | 33 ++- lib/gpu/gb_gpu_memory.cu | 32 ++- lib/gpu/gb_gpu_memory.h | 32 ++- lib/gpu/lj_gpu.cu | 469 ++++++++++++++++++++++++++----------- lib/gpu/lj_gpu_kernel.h | 347 +++++++++++++++++++++++++-- lib/gpu/lj_gpu_memory.cu | 32 ++- lib/gpu/lj_gpu_memory.h | 34 ++- lib/gpu/nvc_device.cu | 32 ++- lib/gpu/nvc_device.h | 32 ++- lib/gpu/nvc_get_devices.cu | 32 ++- lib/gpu/nvc_macros.h | 19 ++ lib/gpu/nvc_memory.h | 32 ++- lib/gpu/nvc_timer.h | 32 ++- lib/gpu/nvc_traits.h | 32 ++- lib/gpu/pair_gpu_atom.cu | 32 ++- lib/gpu/pair_gpu_atom.h | 32 ++- lib/gpu/pair_gpu_cell.cu | 454 +++++++++++++++++++++++++++++++++++ lib/gpu/pair_gpu_cell.h | 62 +++++ lib/gpu/pair_gpu_nbor.cu | 32 ++- lib/gpu/pair_gpu_nbor.h | 32 ++- lib/gpu/pair_gpu_texture.h | 32 ++- lib/gpu/pair_tex_tar.cu | 33 ++- 27 files changed, 1602 insertions(+), 573 deletions(-) create mode 100644 lib/gpu/Makefile.cyg create mode 100644 lib/gpu/pair_gpu_cell.cu create mode 100644 lib/gpu/pair_gpu_cell.h diff --git a/lib/gpu/Makefile.cyg b/lib/gpu/Makefile.cyg new file mode 100644 index 0000000000..3d2294b1a2 --- /dev/null +++ b/lib/gpu/Makefile.cyg @@ -0,0 +1,72 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +BIN_DIR = . +OBJ_DIR = . 
+AR = ar +CUDA_CPP = /cygdrive/c/CUDA/bin/nvcc -I/cygdrive/c/CUDA/include -O3 -DWINDLL -DUNIX -Xptxas -v --use_fast_math +CUDA_ARCH = -arch=sm_13 +CUDA_PREC = -D_SINGLE_SINGLE +CUDA_LINK = -L/cygdrive/c/CUDA/lib -lcudart $(CUDA_LIB) + +CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC) + +CUDA_LIB = $(OBJ_DIR)/gpu.dll + +# Headers for CUDA Stuff +NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h +# Headers for Pair Stuff +PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h +# Dependencies for the Texture Tar +TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \ + lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \ + gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu + +ALL_H = $(NVC_H) $(PAIR_H) + +EXECS = $(BIN_DIR)/nvc_get_devices +OBJS = $(OBJ_DIR)/nvc_device.obj $(OBJ_DIR)/pair_gpu_nbor.obj \ + $(OBJ_DIR)/pair_tex_tar.obj $(OBJ_DIR)/pair_gpu_cell.obj + +all: $(CUDA_LIB) $(EXECS) + +$(OBJ_DIR)/nvc_device.obj : nvc_device.cu $(NVC_H) + $(CUDA) -o $@ -c nvc_device.cu + +$(OBJ_DIR)/pair_gpu_nbor.obj: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H) + $(CUDA) -o $@ -c pair_gpu_nbor.cu + +$(OBJ_DIR)/pair_tex_tar.obj: $(TAR_H) + $(CUDA) -o $@ -c pair_tex_tar.cu + +$(OBJ_DIR)/pair_gpu_cell.obj: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h + $(CUDA) -o $@ -c pair_gpu_cell.cu + +$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.obj + $(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.obj + +$(CUDA_LIB): $(OBJS) $(TAR_H) + $(CUDA) -o $@ -shared $(OBJS) + +clean: + rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.exe *.exp *.lib *.dll *.linkinfo + +veryclean: clean + rm -rf *~ *.linkinfo + diff --git a/lib/gpu/Makefile.nvidia b/lib/gpu/Makefile.nvidia index a1defe2254..78489850b4 100644 --- a/lib/gpu/Makefile.nvidia +++ b/lib/gpu/Makefile.nvidia @@ -1,25 +1,29 @@ -#*************************************************************************** -# Makefile -# ------------------- -# W. Michael Brown -# -# _________________________________________________________________________ -# Build for the LAMMPS GPU Force Library -# -# _________________________________________________________________________ -# -# begin : Tue June 23 2009 -# copyright : (C) 2009 by W. Michael Brown -# email : wmbrown@sandia.gov -# ***************************************************************************/ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ BIN_DIR = . OBJ_DIR = . 
AR = ar CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math -CUDA_ARCH = -maxrregcount 128 #-arch=sm_13 +CUDA_ARCH = -arch=sm_13 CUDA_PREC = -D_SINGLE_SINGLE -CUDA_LINK = -L/usr/local/cuda/lib64 -lcudart $(CUDA_LIB) +CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB) CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC) @@ -28,7 +32,7 @@ CUDA_LIB = $(OBJ_DIR)/libgpu.a # Headers for CUDA Stuff NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h # Headers for Pair Stuff -PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h +PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h # Dependencies for the Texture Tar TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \ lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \ @@ -38,7 +42,7 @@ ALL_H = $(NVC_H) $(PAIR_H) EXECS = $(BIN_DIR)/nvc_get_devices OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/pair_gpu_nbor.cu_o \ - $(OBJ_DIR)/pair_tex_tar.cu_o + $(OBJ_DIR)/pair_tex_tar.cu_o $(OBJ_DIR)/pair_gpu_cell.cu_o all: $(CUDA_LIB) $(EXECS) @@ -51,6 +55,9 @@ $(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor $(OBJ_DIR)/pair_tex_tar.cu_o: $(TAR_H) $(CUDA) -o $@ -c pair_tex_tar.cu +$(OBJ_DIR)/pair_gpu_cell.cu_o: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h + $(CUDA) -o $@ -c pair_gpu_cell.cu + $(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o $(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o diff --git a/lib/gpu/README b/lib/gpu/README index 62af16e4cb..02859b17ad 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -1,29 +1,25 @@ -/*************************************************************************** - README - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - README for building LAMMPS GPU Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ GENERAL NOTES -This library, pair_gpu_lib.a, provides routines for GPGPU acceleration +This library, libgpu.a, provides routines for GPU acceleration of LAMMPS pair styles. 
Currently, only CUDA enabled GPUs are supported. Compilation of this library requires installing the CUDA GPU driver and CUDA toolkit for your operating system. In addition to @@ -33,14 +29,14 @@ devices on your system. NOTE: Installation of the CUDA SDK is not required. -Current pair styles supporting GPU Accelartion: +Current pair styles supporting GPU acceleration: 1. lj/cut/gpu 2. gayberne/gpu MULTIPLE LAMMPS PROCESSES -When using GPGPU acceleration, you are restricted to one physical GPU +When using GPU acceleration, you are restricted to one physical GPU per LAMMPS process. This can be multiple GPUs on a single node or across multiple nodes. Intructions on GPU assignment can be found in the LAMMPS documentation. @@ -66,6 +62,9 @@ the CUDA_PREC variable: CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double +NOTE: For the lj/cut pair style, only single precision will be used, even + if double precision is specified. + NOTE: Double precision is only supported on certain GPUS (with compute capability>=1.3). @@ -74,14 +73,14 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3, NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE package has been installed before installing the GPU package in LAMMPS. - + GPU MEMORY -Upon initialization of the pair style, the library will reserve memory -for 64K atoms per GPU or 70% of each cards GPU memory, whichever value +Upon initialization of the gayberne/gpu pair style, the library will reserve +memory for 64K atoms per GPU or 70% of each cards GPU memory, whichever value is limiting. The value of 70% can be changed by editing the -PERCENT_GPU_MEMORY definition in the source file. The value of 64K -cannot be increased and is the maximum number of atoms allowed per +PERCENT_GPU_MEMORY definition in the source file. For gayberne/gpu, the value +of 64K cannot be increased and is the maximum number of atoms allowed per GPU. Using the 'neigh_modify one' modifier in your LAMMPS input script can help to increase maximum number of atoms per GPU for cards with limited memory. diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu index 9bfb53a497..10c6557b31 100644 --- a/lib/gpu/gb_gpu.cu +++ b/lib/gpu/gb_gpu.cu @@ -1,27 +1,21 @@ -/*************************************************************************** - gb_gpu.cu - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Gay-Berne anisotropic potential GPU calcultation - - *** Force decomposition by Atom Version *** - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. 
- ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include #include @@ -190,30 +184,30 @@ inline string gb_gpu_toa(const t& in) { // --------------------------------------------------------------------------- // Return string with GPU info // --------------------------------------------------------------------------- -string gb_gpu_name(const int id, const int max_nbors) { - string name=GBMF[0].gpu.name(id)+", "+ +EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) { + string sname=GBMF[0].gpu.name(id)+", "+ gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+ gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+ gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+ gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id), max_nbors))+" Atoms"; - return name; + strcpy(name,sname.c_str()); } // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int max_nbors, const int thread, const int gpu_id) { +EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma, + const double upsilon, const double mu, double **shape, + double **well, double **cutsq, double **sigma, + double **epsilon, double *host_lshape, int **form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int max_nbors, const int thread, const int gpu_id) { assert(thread &atom, double **host_x, atom.time_atom.stop(); } -void gb_gpu_atom(double **host_x, double **host_quat, +EXTERN void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type, const bool rebuild, const int thread) { _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild, GBMF[thread].pair_stream); @@ -327,7 +321,7 @@ int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal, return ilist; } -int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, +EXTERN int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, int *ilist, const int *numj, const int *type, const int thread, bool &success) { return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type, @@ -340,7 +334,7 @@ int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, // --------------------------------------------------------------------------- template void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij, - const bool eflag) { + const bool eflag) { gbm.nbor.time_nbor.add_to_total(); // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed @@ -350,8 +344,8 @@ void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int 
num_ij, gbm.nbor.time_nbor.stop(); } -void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag, - const int thread) { +EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag, + const int thread) { _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag); } @@ -453,7 +447,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag, } } -void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, +EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, const int thread) { _gb_gpu_gayberne(GBMF[thread],eflag,vflag,rebuild); } @@ -490,7 +484,7 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist, return evdw; } -double gb_gpu_forces(double **f, double **tor, const int *ilist, +EXTERN double gb_gpu_forces(double **f, double **tor, const int *ilist, const bool eflag, const bool vflag, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial, const int thread) { @@ -499,7 +493,7 @@ double gb_gpu_forces(double **f, double **tor, const int *ilist, vflag_atom,eatom,vatom,virial); } -void gb_gpu_time(const int i) { +EXTERN void gb_gpu_time(const int i) { cout.precision(4); cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds() << " s.\n" @@ -515,10 +509,10 @@ void gb_gpu_time(const int i) { << " s.\n"; } -int gb_gpu_num_devices() { +EXTERN int gb_gpu_num_devices() { return GBMF[0].gpu.num_devices(); } -double gb_gpu_bytes() { +EXTERN double gb_gpu_bytes() { return GBMF[0].host_memory_usage(); } diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h index 87bcecb3ca..0e060a80a8 100644 --- a/lib/gpu/gb_gpu_extra.h +++ b/lib/gpu/gb_gpu_extra.h @@ -1,25 +1,21 @@ -/*************************************************************************** - gb_gpu_extra.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Inline GPU kernel routines ala math_extra for the CPU. - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef GB_GPU_EXTRA_H #define GB_GPU_EXTRA_H diff --git a/lib/gpu/gb_gpu_kernel.h b/lib/gpu/gb_gpu_kernel.h index 22c276e130..110a41fecf 100644 --- a/lib/gpu/gb_gpu_kernel.h +++ b/lib/gpu/gb_gpu_kernel.h @@ -1,26 +1,21 @@ -/*************************************************************************** - gb_gpu_kernel.cu - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Routines that actually perform the force/torque computation - - *** Force Decomposition by Atom Version *** - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef GB_GPU_KERNEL #define GB_GPU_KERNEL diff --git a/lib/gpu/gb_gpu_memory.cu b/lib/gpu/gb_gpu_memory.cu index de24284751..b092c5cd8b 100644 --- a/lib/gpu/gb_gpu_memory.cu +++ b/lib/gpu/gb_gpu_memory.cu @@ -1,25 +1,21 @@ -/*************************************************************************** - gb_gpu_memory.cu - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Global variables for GPU Gayberne Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. 
Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include "gb_gpu_memory.h" #define GB_GPU_MemoryT GB_GPU_Memory diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h index eb9fb92a1c..496ce2033d 100644 --- a/lib/gpu/gb_gpu_memory.h +++ b/lib/gpu/gb_gpu_memory.h @@ -1,25 +1,21 @@ -/*************************************************************************** - gb_gpu_memory.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Global variables for GPU Gayberne Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef GB_GPU_MEMORY_H #define GB_GPU_MEMORY_H diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu index ffc67eb068..e8791069d5 100644 --- a/lib/gpu/lj_gpu.cu +++ b/lib/gpu/lj_gpu.cu @@ -1,25 +1,21 @@ -/*************************************************************************** - lj_gpu.cu - ------------------- - W. 
Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Lennard-Jones potential GPU calcultation - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include #include @@ -27,18 +23,39 @@ #include "nvc_timer.h" #include "nvc_device.h" #include "pair_gpu_texture.h" +#include "pair_gpu_cell.h" #include "lj_gpu_memory.cu" #include "lj_gpu_kernel.h" +#ifdef WINDLL +#include +BOOL APIENTRY DllMain(HANDLE hModule, DWORD dwReason, LPVOID lpReserved) +{ + return TRUE; +} +#endif + +#ifdef WINDLL +#define EXTERN extern "C" __declspec(dllexport) +#else +#define EXTERN +#endif using namespace std; static LJ_GPU_Memory LJMF; #define LJMT LJ_GPU_Memory +static float kernelTime = 0.0; +static int ncell1D; +static float *energy, *d_energy; +static float3 *d_force, *f_temp, *v_temp, *d_virial; +static cell_list cell_list_gpu; + // --------------------------------------------------------------------------- // Convert something to a string // --------------------------------------------------------------------------- #include + template inline string lj_gpu_toa(const t& in) { ostringstream o; @@ -50,113 +67,54 @@ inline string lj_gpu_toa(const t& in) { // --------------------------------------------------------------------------- // Return string with GPU info // --------------------------------------------------------------------------- -string lj_gpu_name(const int id, const int max_nbors) { - string name=LJMF.gpu.name(id)+", "+ +EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name) { + string sname=LJMF.gpu.name(id)+", "+ lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+ lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+ - lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+ - lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id), - max_nbors))+" Atoms"; - return name; + lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ"; + strcpy(name,sname.c_str()); } // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool lj_gpu_init(int &ij_size, const int ntypes, 
double **cutsq,double **sigma, - double **epsilon, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, - double *special_lj, const int max_nbors, const int gpu_id) { - LJMF.gpu.init(); +EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma, + double **epsilon, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, double *boxlo, double *boxhi, + double cell_size, double skin, + const int max_nbors, const int gpu_id) { + LJMF.gpu.init(); if (LJMF.gpu.num_devices()==0) return false; ij_size=IJ_SIZE; - return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2, - host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id); + + bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2, + host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id); + + ncell1D = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size); + + init_cell_list_const(cell_size, skin, boxlo, boxhi); + + return ret; } // --------------------------------------------------------------------------- // Clear memory on host and device // --------------------------------------------------------------------------- -void lj_gpu_clear() { +EXTERN void lj_gpu_clear() { + free(energy); + free(v_temp); + cudaFreeHost(f_temp); + cudaFree(d_force); + cudaFree(d_energy); + cudaFree(d_virial); + clear_cell_list(cell_list_gpu); + LJMF.clear(); } -// --------------------------------------------------------------------------- -// copy atom positions and optionally types to device -// --------------------------------------------------------------------------- -template -inline void _lj_gpu_atom(PairGPUAtom &atom, double **host_x, - const int *host_type, const bool rebuild, - cudaStream_t &stream) { - atom.time_atom.start(); - atom.reset_write_buffer(); - - // First row of dev_x is x position, second is y, third is z - atom.add_atom_data(host_x[0],3); - atom.add_atom_data(host_x[0]+1,3); - atom.add_atom_data(host_x[0]+2,3); - - int csize=3; - - // If a rebuild occured, copy type data - if (rebuild) { - atom.add_atom_data(host_type); - csize++; - } - - atom.copy_atom_data(csize,stream); - atom.time_atom.stop(); -} - -void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) { - _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream); -} - -// --------------------------------------------------------------------------- -// Signal that we need to transfer a new neighbor list -// --------------------------------------------------------------------------- -template -bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum, - int *ilist, const int *numj) { - if (nall>ljm.max_atoms) - return false; - - ljm.nbor.time_nbor.start(); - - ljm.atom.nall(nall); - ljm.atom.inum(inum); - ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream); - - ljm.nbor.time_nbor.stop(); - return true; -} - -bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist, - const int *numj) { - return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj); -} - -// --------------------------------------------------------------------------- -// Copy a set of ij_size ij interactions to device and compute energies, -// forces, and torques for those interactions -// --------------------------------------------------------------------------- -template -void _lj_gpu_nbors(LJMTyp &ljm, const int *ij, const int num_ij) { - ljm.nbor.time_nbor.add_to_total(); - - // 
CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed - - memcpy(ljm.nbor.host_ij.begin(),ij,num_ij*sizeof(int)); - ljm.nbor.time_nbor.start(); - ljm.nbor.add(num_ij,ljm.pair_stream); - ljm.nbor.time_nbor.stop(); -} - -void lj_gpu_nbors(const int *ij, const int num_ij) { - _lj_gpu_nbors(LJMF,ij,num_ij); -} // --------------------------------------------------------------------------- // Calculate energies and forces for all ij interactions @@ -169,6 +127,7 @@ void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){ int GX=static_cast(ceil(static_cast(ljm.atom.inum())/BX)); ljm.time_pair.start(); + if (ljm.shared_types) kernel_lj_fast<<>> (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), @@ -181,47 +140,279 @@ void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){ ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, vflag, ljm.atom.inum(), ljm.atom.nall()); + ljm.time_pair.stop(); +} + +EXTERN void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) { + _lj_gpu(LJMF, eflag,vflag,rebuild); +} + +template +double _lj_gpu_cell(LJMT &ljm, double **force, double *virial, + double **host_x, int *host_type, const int inum, + const int nall, const int ago, const bool eflag, const bool vflag, + const double *boxlo, const double *boxhi) +{ + ljm.atom.nall(nall); + ljm.atom.inum(inum); + + ljm.nbor.time_nbor.start(); + ljm.nbor.time_nbor.stop(); + + double evdwl=0.0; + + static int buffer = CELL_SIZE; + static int ncell = (int)pow((float)ncell1D,3); + + static int first_call = 1; + + // allocate memory on CPU and GPU + if (first_call) { + energy = (float*) malloc(inum*sizeof(float)); + v_temp = (float3*)malloc(inum*2*sizeof(float3)); + cudaMallocHost((void**)&f_temp, inum*sizeof(float3)); + + cudaMalloc((void**)&d_force, inum*sizeof(float3)); + cudaMalloc((void**)&d_energy, inum*sizeof(float)); + cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)); + + init_cell_list(cell_list_gpu, nall, ncell, buffer); + + first_call = 0; + } + + if (!first_call && ago == 0) { + free(energy); + free(v_temp); + cudaFreeHost(f_temp); + cudaFree(d_force); + cudaFree(d_energy); + cudaFree(d_virial); + + energy = (float*) malloc(inum*sizeof(float)); + v_temp = (float3*)malloc(inum*2*sizeof(float3)); + cudaMallocHost((void**)&f_temp, inum*sizeof(float3)); + + cudaMalloc((void**)&d_force, inum*sizeof(float3)); + cudaMalloc((void**)&d_energy, inum*sizeof(float)); + cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)); + + clear_cell_list(cell_list_gpu); + init_cell_list(cell_list_gpu, nall, ncell, buffer); + } + + // build cell-list on GPU + ljm.atom.time_atom.start(); + build_cell_list(host_x[0], host_type, cell_list_gpu, + ncell, ncell1D, buffer, inum, nall, ago); + ljm.atom.time_atom.stop(); + + ljm.time_pair.start(); + +#ifdef TIMING + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start, 0); +#endif + + // call the cell-list force kernel + const int BX=BLOCK_1D; + dim3 GX(ncell1D, ncell1D*ncell1D); + if (eflag == 0 && vflag == 0) { + kernel_lj_cell<<>> + (d_force, d_energy, d_virial, + cell_list_gpu.pos, + cell_list_gpu.idx, + cell_list_gpu.type, + cell_list_gpu.natom, + inum, nall, ncell); + } else { + kernel_lj_cell<<>> + (d_force, d_energy, d_virial, + cell_list_gpu.pos, + cell_list_gpu.idx, + cell_list_gpu.type, + cell_list_gpu.natom, + inum, nall, ncell); + } + +#ifdef TIMING + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + 
float kTime; + cudaEventElapsedTime(&kTime, start, stop); + kernelTime += kTime; + printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + // copy results from GPU to CPU + cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost); + if (eflag) { + cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost); + for (int i = 0; i < inum; i++) { + evdwl += energy[i]; + } + evdwl *= 0.5f; + } + if (vflag) { + cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost); + for (int i = 0; i < inum; i++) { + virial[0] += v_temp[2*i].x; + virial[1] += v_temp[2*i].y; + virial[2] += v_temp[2*i].z; + virial[3] += v_temp[2*i+1].x; + virial[4] += v_temp[2*i+1].y; + virial[5] += v_temp[2*i+1].z; + } + for (int i = 0; i < 6; i++) + virial[i] *= 0.5f; + } + + for (int i = 0; i < inum; i++) { + force[i][0] += f_temp[i].x; + force[i][1] += f_temp[i].y; + force[i][2] += f_temp[i].z; + } + ljm.time_pair.stop(); -} - -void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) { - _lj_gpu(LJMF,eflag,vflag,rebuild); -} - -// --------------------------------------------------------------------------- -// Get energies and forces to host -// --------------------------------------------------------------------------- -template -double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial) { - double evdw; - - ljm.atom.time_answer.start(); - ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream); ljm.atom.time_atom.add_to_total(); ljm.nbor.time_nbor.add_to_total(); ljm.time_pair.add_to_total(); - CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); - evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); - ljm.atom.add_forces(ilist,f); - ljm.atom.time_answer.stop(); - ljm.atom.time_answer.add_to_total(); - return evdw; + + return evdwl; + } -double lj_gpu_forces(double **f, const int *ilist, const bool eflag, - const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial) { - return _lj_gpu_forces - (LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial); +EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall, + const int ago, const bool eflag, const bool vflag, + const double *boxlo, const double *boxhi) +{ + return _lj_gpu_cell(LJMF, force, virial, host_x, host_type, inum, nall, + ago, eflag, vflag, boxlo, boxhi); } -void lj_gpu_time() { +template +double _lj_gpu_n2(LJMT &ljm, double **force, double *virial, + double **host_x, int *host_type, const int inum, const int nall, const bool eflag, const bool vflag, + const double *boxlo, const double *boxhi) +{ + ljm.atom.nall(nall); + ljm.atom.inum(inum); + + + ljm.nbor.time_nbor.start(); + ljm.nbor.time_nbor.stop(); + + + double evdwl=0.0; + +#ifdef NOUSE + static int first_call = 1; + + if (first_call) { + energy = (float*) malloc(inum*sizeof(float)); + v_temp = (float3*) malloc(inum*2*sizeof(float3)); + cudaMallocHost((void**)&f_temp, inum*sizeof(float3)); + cudaMallocHost((void**)&pos_temp, nall*sizeof(float3)); + cudaMalloc((void**)&d_force, inum*sizeof(float3)); + cudaMalloc((void**)&d_energy, inum*sizeof(float)); + cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)); + cudaMalloc((void**)&d_pos, nall*sizeof(float3)); + cudaMalloc((void**)&d_type, 
nall*sizeof(int)); + first_call = 0; + } + + + ljm.atom.time_atom.start(); + double *atom_pos = host_x[0]; + for (int i = 0; i < 3*nall; i+=3) { + pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]); + } + cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice); + cudaMemcpy(d_type, host_type, nall*sizeof(int), cudaMemcpyHostToDevice); + + ljm.atom.time_atom.stop(); + + ljm.time_pair.start(); + + // Compute the block size and grid size to keep all cores busy + const int BX=BLOCK_1D; + dim3 GX(static_cast(ceil(static_cast(inum)/BX))); + +#ifdef TIMING + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start, 0); +#endif + + // N^2 force kernel + kernel_lj_n2<<>>(d_force, d_energy, d_virial, + d_pos, d_type, eflag, vflag, inum, nall); + +#ifdef TIMING + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + float kTime; + cudaEventElapsedTime(&kTime, start, stop); + kernelTime += kTime; + printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + // copy results from GPU to CPU + cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost); + if (eflag) { + cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost); + for (int i = 0; i < inum; i++) { + evdwl += energy[i]; + } + evdwl *= 0.5f; + } + if (vflag) { + cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost); + for (int i = 0; i < inum; i++) { + virial[0] += v_temp[2*i].x; + virial[1] += v_temp[2*i].y; + virial[2] += v_temp[2*i].z; + virial[3] += v_temp[2*i+1].x; + virial[4] += v_temp[2*i+1].y; + virial[5] += v_temp[2*i+1].z; + } + for (int i = 0; i < 6; i++) + virial[i] *= 0.5f; + } + + for (int i = 0; i < inum; i++) { + force[i][0] += f_temp[i].x; + force[i][1] += f_temp[i].y; + force[i][2] += f_temp[i].z; + } +#endif + ljm.time_pair.stop(); + + ljm.atom.time_atom.add_to_total(); + ljm.nbor.time_nbor.add_to_total(); + ljm.time_pair.add_to_total(); + + return evdwl; +} + +EXTERN double lj_gpu_n2(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall, + const bool eflag, const bool vflag, + const double *boxlo, const double *boxhi) +{ + return _lj_gpu_n2(LJMF, force, virial, host_x, host_type, inum, nall, + eflag, vflag, boxlo, boxhi); +} + +EXTERN void lj_gpu_time() { cout.precision(4); cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n"; cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n"; @@ -229,10 +420,10 @@ void lj_gpu_time() { cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n"; } -int lj_gpu_num_devices() { +EXTERN int lj_gpu_num_devices() { return LJMF.gpu.num_devices(); } -double lj_gpu_bytes() { +EXTERN double lj_gpu_bytes() { return LJMF.host_memory_usage(); } diff --git a/lib/gpu/lj_gpu_kernel.h b/lib/gpu/lj_gpu_kernel.h index b2febe0f7a..7af33ce6de 100644 --- a/lib/gpu/lj_gpu_kernel.h +++ b/lib/gpu/lj_gpu_kernel.h @@ -1,29 +1,220 @@ -/*************************************************************************** - lj_gpu_kernel.cu - ------------------- - W. 
Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Routines that actually perform the force computation - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef LJ_GPU_KERNEL #define LJ_GPU_KERNEL +/* Cell list version of LJ kernel */ +template +__global__ void kernel_lj_cell(float3 *force3, + float *energy, float3 *virial, + float3 *cell_list, unsigned int *cell_idx, + int *cell_type, int *cell_atom, + const int inum, const int nall, const int ncell) +{ + // calculate 3D block idx from 2d block + int bx = blockIdx.x; + int by = blockIdx.y % gridDim.x; + int bz = blockIdx.y / gridDim.x; + + int tid = threadIdx.x; + + // compute cell idx from 3D block idx + int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x); + + __shared__ int typeSh[CELL_SIZE]; + __shared__ float posSh[CELL_SIZE*3]; + __shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + + extern __shared__ float smem[]; + + __shared__ float *lj3Sh; + __shared__ float *lj4Sh; + __shared__ float *offsetSh; + + // load force parameters into shared memory + for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) { + int itype = i/MAX_SHARED_TYPES; + int jtype = i%MAX_SHARED_TYPES; + cutsqSh[i] = _cutsq_(itype,jtype); + lj1Sh[i] = _lj1_(itype,jtype).x; + lj2Sh[i] = _lj1_(itype,jtype).y; + } + + // Only allocate shared memory when needed, + // this reduces shared memory limitation on occupancy + if (eflag || vflag) { + lj3Sh = smem; + lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES; + offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES; + for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) { + int itype = i/MAX_SHARED_TYPES; + int jtype = i%MAX_SHARED_TYPES; + lj3Sh[i] = _lj3_(itype,jtype).x+0.01; + lj4Sh[i] = _lj3_(itype,jtype).y; + offsetSh[i]= _offset_(itype,jtype); + } + } + + __syncthreads(); + + int nborz0 = max(bz-1,0), nborz1 = min(bz+1, gridDim.x-1), + nbory0 = max(by-1,0), nbory1 = min(by+1, gridDim.x-1), + nborx0 = max(bx-1,0), nborx1 = 
min(bx+1, gridDim.x-1); + + for (int ii = 0; ii < ceil((float)(cell_atom[cid])/BLOCK_1D); ii++) { + float3 f = {0.0f, 0.0f, 0.0f}; + float ener = 0.0f; + float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f}; + int itype; + float ix, iy, iz; + int i = tid + ii*BLOCK_1D; + unsigned int answer_pos = cell_idx[cid*CELL_SIZE+i]; + + // load current cell atom position and type into sMem + for (int j = tid; j < cell_atom[cid]; j += BLOCK_1D) { + int pid = cid*CELL_SIZE + j; + float3 pos = cell_list[pid]; + posSh[j ] = pos.x; + posSh[j+ CELL_SIZE] = pos.y; + posSh[j+2*CELL_SIZE] = pos.z; + typeSh[j] = cell_type[pid]; + } + __syncthreads(); + if (answer_pos < inum) { + itype = typeSh[i]; + ix = posSh[i ]; + iy = posSh[i+ CELL_SIZE]; + iz = posSh[i+2*CELL_SIZE]; + + // compute force from current cell + for (int j = 0; j < cell_atom[cid]; j++) { + if (j == i) continue; + float delx = ix - posSh[j ]; + float dely = iy - posSh[j+ CELL_SIZE]; + float delz = iz - posSh[j+2*CELL_SIZE]; + int jtype = typeSh[j]; + int mtype = itype + jtype*MAX_SHARED_TYPES; + float r2inv = delx*delx + dely*dely + delz*delz; + + if (r2inv < cutsqSh[mtype]) { + r2inv = 1.0f/r2inv; + float r6inv = r2inv * r2inv * r2inv; + float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]); + f.x += delx * force; + f.y += dely * force; + f.z += delz * force; + + if (eflag) { + float e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]); + ener += (e - offsetSh[mtype]); + } + + if (vflag) { + v0.x += delx*delx*force; + v0.y += dely*dely*force; + v0.z += delz*delz*force; + v1.x += delx*dely*force; + v1.y += delx*delz*force; + v1.z += dely*delz*force; + } + + } + } + } + __syncthreads(); + + // compute force from neigboring cells + for (int nborz = nborz0; nborz <= nborz1; nborz++) { + for (int nbory = nbory0; nbory <= nbory1; nbory++) { + for (int nborx = nborx0; nborx <= nborx1; nborx++) { + if (nborz == bz && nbory == by && nborx == bx) continue; + + // compute cell id + int cid_nbor = nborx + INT_MUL(nbory,gridDim.x) + + INT_MUL(nborz,gridDim.x*gridDim.x); + + // load neighbor cell position and type into smem + for (int j = tid; j < cell_atom[cid_nbor]; j += BLOCK_1D) { + int pid = INT_MUL(cid_nbor,CELL_SIZE) + j; + float3 pos = cell_list[pid]; + posSh[j ] = pos.x; + posSh[j+ CELL_SIZE] = pos.y; + posSh[j+2*CELL_SIZE] = pos.z; + typeSh[j] = cell_type[pid]; + } + __syncthreads(); + // compute force + if (answer_pos < inum) { + for (int j = 0; j < cell_atom[cid_nbor]; j++) { + float delx = ix - posSh[j ]; + float dely = iy - posSh[j+ CELL_SIZE]; + float delz = iz - posSh[j+2*CELL_SIZE]; + int jtype = typeSh[j]; + int mtype = itype + jtype*MAX_SHARED_TYPES; + float r2inv = delx*delx + dely*dely + delz*delz; + + if (r2inv < cutsqSh[mtype]) { + r2inv = 1.0f/r2inv; + float r6inv = r2inv * r2inv * r2inv; + float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]); + f.x += delx * force; + f.y += dely * force; + f.z += delz * force; + + if (eflag) { + float e=r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]); + ener += (e-offsetSh[mtype]); + } + if (vflag) { + v0.x += delx*delx*force; + v0.y += dely*dely*force; + v0.z += delz*delz*force; + v1.x += delx*dely*force; + v1.y += delx*delz*force; + v1.z += dely*delz*force; + } + } + } + } + __syncthreads(); + } + } + } + + if (answer_pos < inum) { + force3[answer_pos] = f; + if (eflag) + energy[answer_pos] = ener; + if (vflag) { + virial[2*answer_pos] = v0; + virial[2*answer_pos+1] = v1; + } + } + } + +} + + +/* Neigbhor list version of LJ kernel */ template __global__ void kernel_lj(const numtyp 
*special_lj, const int *dev_nbor, const int *dev_ij, const int nbor_pitch, acctyp *ans, @@ -36,7 +227,6 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x); - __syncthreads(); if (ii(i,3)); numtyp factor_lj; + for ( ; list + __global__ void kernel_lj_n2(float3 *force3, + float *energy, float3 *virial, + float3 *pos, int *type, + const bool eflag, const bool vflag, const int inum, const int nall) +{ + int gid = threadIdx.x + INT_MUL(blockIdx.x, blockDim.x); + int tid = threadIdx.x; + __shared__ float posSh[BLOCK_1D*3]; + __shared__ int typeSh[BLOCK_1D]; + __shared__ numtyp cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj3Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp lj4Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __shared__ numtyp offsetSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + + if (tid(itype,jtype); + lj1Sh[tid]=_lj1_(itype,jtype).x; + lj2Sh[tid]=_lj1_(itype,jtype).y; + lj3Sh[tid]=_lj3_(itype,jtype).x; + lj4Sh[tid]=_lj3_(itype,jtype).y; + offsetSh[tid]=_offset_(itype,jtype); + } + __syncthreads(); + + float3 f = {0.0f, 0.0f, 0.0f}; + float ener = 0.0f; + float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f}; + + int itype, jtype; + int mtype; + + numtyp ix, iy, iz; + + if (gid < inum) { + ix = pos[gid].x; + iy = pos[gid].y; + iz = pos[gid].z; + itype = type[gid]; + } + + int pid = tid; + int nIter = ceil((float)nall/BLOCK_1D); + for (int jj = 0; jj < nIter; jj++, pid += BLOCK_1D) { + + if (pid < nall) { + posSh[tid ] = pos[pid].x; + posSh[tid+ BLOCK_1D] = pos[pid].y; + posSh[tid+2*BLOCK_1D] = pos[pid].z; + typeSh[tid] = type[pid]; + } + __syncthreads(); + + if (gid < inum) { + int pid_j = jj*BLOCK_1D; + + for (int j = 0; j < BLOCK_1D; j++, pid_j++) { + if (jj == blockIdx.x && tid == j) continue; + if (pid_j < nall) { + numtyp delx = ix - posSh[j ]; + numtyp dely = iy - posSh[j+ BLOCK_1D]; + numtyp delz = iz - posSh[j+2*BLOCK_1D]; + jtype = typeSh[j]; + mtype = itype + jtype*MAX_SHARED_TYPES; + numtyp r2inv = delx * delx + dely * dely + delz * delz; + + if (r2inv < cutsqSh[mtype]) { + r2inv = (numtyp)1.0/r2inv; + numtyp r6inv = r2inv * r2inv * r2inv; + numtyp force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]); + f.x += delx * force; + f.y += dely * force; + f.z += delz * force; + + if (eflag) { + numtyp e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]); + ener +=(e-offsetSh[mtype]); + } + if (vflag) { + v0.x += delx*delx*force; + v0.y += dely*dely*force; + v0.z += delz*delz*force; + v1.x += delx*dely*force; + v1.y += delx*delz*force; + v1.z += dely*delz*force; + } + } + } + } + } + + __syncthreads(); + } + + if (gid < inum) { + if (eflag) + energy[gid] = ener; + if (vflag) { + virial[2*gid ] = v0; + virial[2*gid+1] = v1; + } + force3[gid] = f; + } + +} + #endif diff --git a/lib/gpu/lj_gpu_memory.cu b/lib/gpu/lj_gpu_memory.cu index a5000a7eaf..20ac9e1440 100644 --- a/lib/gpu/lj_gpu_memory.cu +++ b/lib/gpu/lj_gpu_memory.cu @@ -1,25 +1,21 @@ -/*************************************************************************** - lj_gpu_memory.cu - ------------------- - W. 
Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Global variables for GPU Lennard-Jones Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include "lj_gpu_memory.h" #define LJ_GPU_MemoryT LJ_GPU_Memory diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h index 9052ad4956..35057bcfad 100644 --- a/lib/gpu/lj_gpu_memory.h +++ b/lib/gpu/lj_gpu_memory.h @@ -1,25 +1,21 @@ -/*************************************************************************** - lj_gpu_memory.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Global variables for GPU Lennard-Jones Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef LJ_GPU_MEMORY_H #define LJ_GPU_MEMORY_H @@ -30,8 +26,10 @@ #include "pair_gpu_nbor.h" #define BLOCK_1D 64 +#define CELL_SIZE 64 #define MAX_SHARED_TYPES 8 #define PERCENT_GPU_MEMORY 0.7 +#define BIG_NUMBER 100000000 template class LJ_GPU_Memory { diff --git a/lib/gpu/nvc_device.cu b/lib/gpu/nvc_device.cu index 2f25d61af2..3f2d81228a 100644 --- a/lib/gpu/nvc_device.cu +++ b/lib/gpu/nvc_device.cu @@ -1,25 +1,21 @@ -/*************************************************************************** - nvc_device.cu - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Utilities for dealing with cuda devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Wed Jan 28 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include #include diff --git a/lib/gpu/nvc_device.h b/lib/gpu/nvc_device.h index 4286c6022f..61a4bb003a 100644 --- a/lib/gpu/nvc_device.h +++ b/lib/gpu/nvc_device.h @@ -1,25 +1,21 @@ -/*************************************************************************** - nvc_device.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Utilities for dealing with cuda devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Wed Jan 28 2009 - copyright : (C) 2009 by W. 
Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef NVC_DEVICE #define NVC_DEVICE diff --git a/lib/gpu/nvc_get_devices.cu b/lib/gpu/nvc_get_devices.cu index b85626f3f7..652c4fc22a 100644 --- a/lib/gpu/nvc_get_devices.cu +++ b/lib/gpu/nvc_get_devices.cu @@ -1,25 +1,21 @@ -/*************************************************************************** - nvc_get_devices.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - List properties of cuda devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Wed Jan 28 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include "nvc_device.h" diff --git a/lib/gpu/nvc_macros.h b/lib/gpu/nvc_macros.h index 4aaa167af5..2273b96895 100644 --- a/lib/gpu/nvc_macros.h +++ b/lib/gpu/nvc_macros.h @@ -1,3 +1,22 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ + #ifndef NVC_MACROS_H #define NVC_MACROS_H diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h index 5aaeffd2f0..5b2c164a7f 100644 --- a/lib/gpu/nvc_memory.h +++ b/lib/gpu/nvc_memory.h @@ -1,25 +1,21 @@ -/*************************************************************************** - nvc_memory.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Routines for memory management on CUDA devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef NVC_MEMORY_H #define NVC_MEMORY_H diff --git a/lib/gpu/nvc_timer.h b/lib/gpu/nvc_timer.h index 12346b5ea2..1df9156fb8 100644 --- a/lib/gpu/nvc_timer.h +++ b/lib/gpu/nvc_timer.h @@ -1,25 +1,21 @@ -/*************************************************************************** - nvc_timer.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Class for timing CUDA routines - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Tue Feb 3 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. 
Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef NVC_TIMER_H #define NVC_TIMER_H diff --git a/lib/gpu/nvc_traits.h b/lib/gpu/nvc_traits.h index eee92b7028..41cb9487ec 100644 --- a/lib/gpu/nvc_traits.h +++ b/lib/gpu/nvc_traits.h @@ -1,25 +1,21 @@ -/*************************************************************************** - nvc_texture_traits.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Tricks for templating textures - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef NVC_TEXTURE_TRAITS_H #define NVC_TEXTURE_TRAITS_H diff --git a/lib/gpu/pair_gpu_atom.cu b/lib/gpu/pair_gpu_atom.cu index 4d27bb4692..a5573101d0 100644 --- a/lib/gpu/pair_gpu_atom.cu +++ b/lib/gpu/pair_gpu_atom.cu @@ -1,25 +1,21 @@ -/*************************************************************************** - pair_gpu_atom.cu - ------------------- - W. 
Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Memory routines for moving atom and force data between host and gpu - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include "pair_gpu_texture.h" #include "pair_gpu_atom.h" diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index 01f450ffcc..dcbcaa83b2 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -1,25 +1,21 @@ -/*************************************************************************** - pair_gpu_atom.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Memory routines for moving atom and force data between host and gpu - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef PAIR_GPU_ATOM_H #define PAIR_GPU_ATOM_H diff --git a/lib/gpu/pair_gpu_cell.cu b/lib/gpu/pair_gpu_cell.cu new file mode 100644 index 0000000000..11c78e87d6 --- /dev/null +++ b/lib/gpu/pair_gpu_cell.cu @@ -0,0 +1,454 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ + +#include "lj_gpu_memory.h" +#include "pair_gpu_cell.h" + +static __constant__ float d_boxlo[3]; +static __constant__ float d_boxhi[3]; +static __constant__ float d_cell_size[1]; +static __constant__ float d_skin[1]; + +void init_cell_list_const(double cell_size, double skin, + double *boxlo, double *boxhi) +{ + float cell_size1 = cell_size; + float skin1 = skin; + float boxlo1[3], boxhi1[3]; + for (int i = 0; i < 3; i++) { + boxlo1[i] = boxlo[i]; + boxhi1[i] = boxhi[i]; + } + + cudaMemcpyToSymbol(d_cell_size, &cell_size1, sizeof(float), + 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_boxlo, boxlo1, 3*sizeof(float), + 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_boxhi, boxhi1, 3*sizeof(float), + 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_skin, &skin1, sizeof(float), + 0, cudaMemcpyHostToDevice); +} + +__global__ void kernel_set_cell_list(unsigned int *cell_idx) +{ + unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x; + cell_idx[gid] = BIG_NUMBER; +} + +// build the cell list +__global__ void kernel_build_cell_list(float3 *cell_list, + unsigned int *cell_idx, + int *cell_type, + int *cell_atom, + float3 *pos, + int *type, + const int inum, + const int nall) +{ + unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x; + float cSize = d_cell_size[0]; + int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize); + + if (gid < nall) { + float3 p = pos[gid]; + p.x = fmaxf(p.x, d_boxlo[0]-cSize); + p.x = fminf(p.x, d_boxhi[0]+cSize); + p.y = fmaxf(p.y, d_boxlo[1]-cSize); + p.y = fminf(p.y, d_boxhi[1]+cSize); + p.z = fmaxf(p.z, d_boxlo[2]-cSize); + p.z = fminf(p.z, d_boxhi[2]+cSize); + + int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncell1D + + (int)(p.z/cSize + 1.0) * ncell1D * ncell1D; + + int atom_pos = atomicAdd(&cell_atom[cell_id], 1); + int pid = cell_id*CELL_SIZE + atom_pos; + + cell_list[pid] = pos[gid]; + cell_type[pid] = type[gid]; + cell_idx [pid] = gid; + + } +} + +__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int 
*rebuild) +{ + + float cSize = d_cell_size[0]; + int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize); + + // calculate 3D block idx from 2d block + int bx = blockIdx.x; + int by = blockIdx.y % gridDim.x; + int bz = blockIdx.y / gridDim.x; + + int tid = threadIdx.x; + + // compute cell idx from 3D block idx + int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x); + int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list + + float skin = d_skin[0]; + float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin; + float hix = lowx + cSize + skin; + float lowy = d_boxlo[1] + (by-1)*cSize - 0.5*skin; + float hiy = lowy + cSize + skin; + float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin; + float hiz = lowz + cSize + skin; + + for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) { + int pid = pbase + i; + float3 p = cell_list[pid]; + p.x = fmaxf(p.x, d_boxlo[0]-cSize); + p.x = fminf(p.x, d_boxhi[0]+cSize); + p.y = fmaxf(p.y, d_boxlo[1]-cSize); + p.y = fminf(p.y, d_boxhi[1]+cSize); + p.z = fmaxf(p.z, d_boxlo[2]-cSize); + p.z = fminf(p.z, d_boxhi[2]+cSize); + + if (p.x < lowx || p.x > hix || p.y < lowy || p.y > hiy || p.z < lowz || p.z > hiz) { + *rebuild = 1; + } + } + +} + + +__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell) +{ + unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x; + + if (gid < ncell) { + if (cell_atom[gid] > CELL_SIZE) + *overflow = 1; + } +} + +__global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos) +{ + // calculate 3D block idx from 2d block + int bx = blockIdx.x; + int by = blockIdx.y % gridDim.x; + int bz = blockIdx.y / gridDim.x; + + int tid = threadIdx.x; + + // compute cell idx from 3D block idx + int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x); + int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list + + for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) { + int pid = pbase + i; + cell_list[pid] = pos[cell_idx[pid]]; + } + +} + + +__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit); + +void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size) +{ + int i = 0; + const unsigned int bitSize = sizeof(unsigned int)*8; + const unsigned int bitStep = 4; + const int gSize = size/BLOCK_1D; + while (bitSize > i*bitStep) { + radixSortBlocks<<>>(keys, values1, values2, bitStep, i*bitStep); + i++; + } +} + +#ifdef __DEVICE_EMULATION__ +#define __SYNC __syncthreads(); +#else +#define __SYNC +#endif + + +#define WARP_SIZE 32 + +template +__device__ T scanwarp(T val, T* sData) +{ + // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp = + // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1)) + int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1)); + sData[idx] = 0; + idx += WARP_SIZE; + sData[idx] = val; __SYNC + +#ifdef __DEVICE_EMULATION__ + T t = sData[idx - 1]; __SYNC + sData[idx] += t; __SYNC + t = sData[idx - 2]; __SYNC + sData[idx] += t; __SYNC + t = sData[idx - 4]; __SYNC + sData[idx] += t; __SYNC + t = sData[idx - 8]; __SYNC + sData[idx] += t; __SYNC + t = sData[idx - 16]; __SYNC + sData[idx] += t; __SYNC +#else + if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } __SYNC + if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } __SYNC + if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } __SYNC + if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } __SYNC + if (4 <= 
maxlevel) { sData[idx] += sData[idx -16]; } __SYNC +#endif + + return sData[idx] - val; // convert inclusive -> exclusive +} + +__device__ unsigned int scan(unsigned int idata) +{ + extern __shared__ unsigned int ptr[]; + + unsigned int idx = threadIdx.x; + + unsigned int val = idata; + + val = scanwarp(val, ptr); + __syncthreads(); + + if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) + { + ptr[idx >> 5] = val + idata; + } + __syncthreads(); + +#ifndef __DEVICE_EMULATION__ + if (idx < WARP_SIZE) +#endif + { + ptr[idx] = scanwarp(ptr[idx], ptr); + } + __syncthreads(); + + val += ptr[idx >> 5]; + + return val; +} + + +__device__ unsigned int rank(unsigned int preds) +{ + unsigned int address = scan(preds); + + __shared__ unsigned int numtrue; + if (threadIdx.x == BLOCK_1D - 1) + { + numtrue = address + preds; + } + __syncthreads(); + + unsigned int rank; + unsigned int idx = threadIdx.x; + rank = (preds) ? address : numtrue + idx - address; + + return rank; +} + +__device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit) +{ + extern __shared__ unsigned int sMem1[]; + __shared__ float sMem2[BLOCK_1D]; + __shared__ int sMem3[BLOCK_1D]; + + int tid = threadIdx.x; + + for(unsigned int shift = startbit; shift < (startbit + nbits); ++shift) { + unsigned int lsb; + lsb = !(((*key) >> shift) & 0x1); + + unsigned int r; + + r = rank(lsb); + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sMem1[r] = *key; + __syncthreads(); + + // The above allows us to read without 4-way bank conflicts: + *key = sMem1[tid]; + __syncthreads(); + + sMem2[r] = (*value1).x; + __syncthreads(); + (*value1).x = sMem2[tid]; + __syncthreads(); + + sMem2[r] = (*value1).y; + __syncthreads(); + (*value1).y = sMem2[tid]; + __syncthreads(); + + sMem2[r] = (*value1).z; + __syncthreads(); + (*value1).z = sMem2[tid]; + __syncthreads(); + + sMem3[r] = *value2; + __syncthreads(); + *value2 = sMem3[tid]; + __syncthreads(); + + } + +} + +__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit) +{ + + extern __shared__ unsigned int sMem[]; + + int gid = threadIdx.x + blockIdx.x * blockDim.x; + unsigned int key; + float3 value1; + int value2; + key = keys[gid]; + value1 = values1[gid]; + value2 = values2[gid]; + __syncthreads(); + + radixSortBlock(&key, &value1, &value2, nbits, startbit); + + keys[gid] = key; + values1[gid] = value1; + values2[gid] = value2; +} + +static float3 *d_pos, *pos_temp; +static int *d_type; +static int *d_overflow, *d_rebuild; + +void init_cell_list(cell_list &cell_list_gpu, + const int nall, + const int ncell, + const int buffer) +{ + cudaMalloc((void**)&(cell_list_gpu.pos), ncell*buffer*sizeof(float3)); + cudaMalloc((void**)&(cell_list_gpu.idx), ncell*buffer*sizeof(unsigned int)); + cudaMalloc((void**)&(cell_list_gpu.type), ncell*buffer*sizeof(int)); + cudaMalloc((void**)&(cell_list_gpu.natom), ncell*sizeof(int)); + + cudaMallocHost((void**)&pos_temp, nall*sizeof(float3)); + cudaMalloc((void**)&d_pos, nall*sizeof(float3)); + cudaMalloc((void**)&d_type, nall*sizeof(int)); + cudaMalloc((void**)&d_overflow, sizeof(int)); + cudaMalloc((void**)&d_rebuild, sizeof(int)); + + cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int)); + cudaMemset(cell_list_gpu.pos, 0, ncell*buffer*sizeof(float3)); +} + +void clear_cell_list(cell_list &cell_list_gpu) +{ + cudaFree(cell_list_gpu.pos); + cudaFree(cell_list_gpu.idx); + cudaFree(cell_list_gpu.natom); + cudaFree(cell_list_gpu.type); 
+ + cudaFreeHost(pos_temp); + cudaFree(d_pos); + cudaFree(d_type); + cudaFree(d_overflow); + cudaFree(d_rebuild); +} + + +void build_cell_list(double *atom_pos, int *atom_type, + cell_list &cell_list_gpu, + const int ncell, const int ncell1D, const int buffer, + const int inum, const int nall, const int ago) +{ + cudaMemset(d_overflow, 0, sizeof(int)); + cudaMemset(d_rebuild, 0, sizeof(int)); + + // copy position and type to GPU + for (int i = 0; i < 3*nall; i+=3) { + pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]); + } + cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice); + cudaMemcpy(d_type, atom_type, nall*sizeof(int), cudaMemcpyHostToDevice); + + static int first_build = 1; + int rebuild = 0; + + // copy the last built cell-list and test whether it needs to be rebuilt + if (!first_build) { + dim3 block(BLOCK_1D); + dim3 grid(ncell1D, ncell1D*ncell1D); + kernel_copy_list<<>>(cell_list_gpu.pos, + cell_list_gpu.idx, + cell_list_gpu.natom, d_pos); + cudaMemset(d_rebuild, 0, sizeof(int)); + int *temp = (int*)malloc(sizeof(int)*ncell); + kernel_test_rebuild<<>>(cell_list_gpu.pos, + cell_list_gpu.natom, + d_rebuild); + cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost); + } + + /*if (!first_build) { + dim3 block(BLOCK_1D); + dim3 grid(ncell1D, ncell1D*ncell1D); + kernel_copy_list<<>>(cell_list_gpu.pos, + cell_list_gpu.idx, + cell_list_gpu.natom, d_pos); + }*/ + if (ago == 0) rebuild = 1; + + // build cell-list for the first time + if (first_build || rebuild) { + first_build = 0; + // cout << "Building cell list..." << endl; + cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int)); + // initialize d_cell_idx for the sorting routine + kernel_set_cell_list<<>>(cell_list_gpu.idx); + + // build cell list + dim3 blockDim(128); + dim3 gridDim(static_cast(ceil(static_cast(nall)/blockDim.x))); + kernel_build_cell_list<<>>(cell_list_gpu.pos, + cell_list_gpu.idx, + cell_list_gpu.type, + cell_list_gpu.natom, + d_pos, d_type, inum, nall); + + // check cell list overflow + int overflow; + int gDimCell = static_cast(ceil(static_cast(ncell)/BLOCK_1D)); + kernel_test_overflow<<>>(cell_list_gpu.natom, + d_overflow, ncell); + cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost); + if (overflow > 0) { + printf("\n\nBLOCK_1D too small for cell list, please increase it!\n\n"); + exit(0); + } + + // sort atoms in every cell by atom index to avoid floating point associativity problem. + sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos, + cell_list_gpu.type, ncell*buffer); + + cudaThreadSynchronize(); + } + +} diff --git a/lib/gpu/pair_gpu_cell.h b/lib/gpu/pair_gpu_cell.h new file mode 100644 index 0000000000..072f5ffbd0 --- /dev/null +++ b/lib/gpu/pair_gpu_cell.h @@ -0,0 +1,62 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ + +#ifndef PAIR_GPU_CELL_H +#define PAIR_GPU_CELL_H + + +typedef struct { + float3 *pos; + unsigned int *idx; + int *type; + int *natom; +} cell_list; + +__global__ void kernel_set_cell_list(unsigned int *cell_idx); +__global__ void kernel_build_cell_list(float3 *cell_list, + unsigned int *cell_idx, + int *cell_type, + int *cell_atom, + float3 *pos, + int *type, + const int inum, + const int nall); +__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebuild); +__global__ void kernel_copy_list(float3 *cell_list, + unsigned int *cell_idx, + int *cell_atom, + float3 *pos); +__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell); +void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size); + +void init_cell_list_const(double cell_size, double skin, + double *boxlo, double *boxhi); +void init_cell_list(cell_list &cell_list_gpu, + const int nall, + const int ncell, + const int buffer); + +void build_cell_list(double *atom_pos, int *atom_type, + cell_list &cell_list_gpu, + const int ncell, const int ncell1D, const int buffer, + const int inum, const int nall, const int ago); + +void clear_cell_list(cell_list &cell_list_gpu); + +#endif diff --git a/lib/gpu/pair_gpu_nbor.cu b/lib/gpu/pair_gpu_nbor.cu index e3fc2b28d8..88fdef8dfe 100644 --- a/lib/gpu/pair_gpu_nbor.cu +++ b/lib/gpu/pair_gpu_nbor.cu @@ -1,25 +1,21 @@ -/*************************************************************************** - pair_gpu_nbor.cu - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Neighbor memory operations for LAMMPS GPU Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include "pair_gpu_nbor.h" diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h index 2e833b4c3d..f7eb376ea6 100644 --- a/lib/gpu/pair_gpu_nbor.h +++ b/lib/gpu/pair_gpu_nbor.h @@ -1,25 +1,21 @@ -/*************************************************************************** - pair_gpu_nbor.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Neighbor memory operations for LAMMPS GPU Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #ifndef PAIR_GPU_NBOR_H #define PAIR_GPU_NBOR_H diff --git a/lib/gpu/pair_gpu_texture.h b/lib/gpu/pair_gpu_texture.h index e647dda8b2..accc4e6b29 100644 --- a/lib/gpu/pair_gpu_texture.h +++ b/lib/gpu/pair_gpu_texture.h @@ -1,25 +1,21 @@ -/*************************************************************************** - pair_gpu_texture.h - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - Tricks for templating textures - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. 
Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include "nvc_traits.h" #include "nvc_memory.h" diff --git a/lib/gpu/pair_tex_tar.cu b/lib/gpu/pair_tex_tar.cu index d0b07177e7..b9a275ee22 100644 --- a/lib/gpu/pair_tex_tar.cu +++ b/lib/gpu/pair_tex_tar.cu @@ -1,26 +1,21 @@ -/*************************************************************************** - pair_tex_tar.cu - ------------------- - W. Michael Brown +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov - "Tar" of header and source files that need texture reference definitions - within file scope. - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract + Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - ----------------------------------------------------------------------- */ + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov + Peng Wang (Nvidia), penwang@nvidia.com + Paul Crozier (SNL), pscrozi@sandia.gov +------------------------------------------------------------------------- */ #include "pair_gpu_atom.cu" #include "lj_gpu.cu"
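A note on the cell-list binning introduced above in pair_gpu_cell.cu: kernel_build_cell_list clamps each coordinate into the simulation box padded by one cell of ghost space, shifts by one ghost layer, and flattens the 3-D cell coordinate into a 1-D index with x fastest and z slowest. The sketch below restates that rule as a plain host-side helper so it can be cross-checked against the kernel; cell_id_host and its arguments are illustrative names only and are not part of this patch.

/* Host-side restatement of the binning rule from kernel_build_cell_list.
   Illustrative sketch only: cell_id_host is not a function added by this patch. */
#include <algorithm>
#include <cmath>

static int cell_id_host(float x, float y, float z,
                        const float boxlo[3], const float boxhi[3],
                        float cell_size)
{
  /* One ghost layer pads each side of the box, matching
     ncell1D = ceil((boxhi[0] - boxlo[0] + 2*cell_size) / cell_size) in the kernel. */
  const int ncell1D =
    (int) std::ceil((boxhi[0] - boxlo[0] + 2.0f*cell_size) / cell_size);

  /* Clamp into the padded box, as the kernel does with fmaxf/fminf. */
  x = std::min(std::max(x, boxlo[0] - cell_size), boxhi[0] + cell_size);
  y = std::min(std::max(y, boxlo[1] - cell_size), boxhi[1] + cell_size);
  z = std::min(std::max(z, boxlo[2] - cell_size), boxhi[2] + cell_size);

  /* The kernel divides the raw coordinate (it assumes the lower box bound
     sits near the origin) and the +1 shift accounts for the ghost layer. */
  const int cx = (int)(x / cell_size + 1.0f);
  const int cy = (int)(y / cell_size + 1.0f);
  const int cz = (int)(z / cell_size + 1.0f);

  return cx + cy * ncell1D + cz * ncell1D * ncell1D;
}

In the kernel, the resulting cell_id is combined with an atomicAdd on cell_atom[cell_id] to claim slot cell_id*CELL_SIZE + atom_pos in the cell list, and kernel_test_overflow later flags any cell whose count exceeds CELL_SIZE so the host exits with an error message rather than silently dropping atoms.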