Updating lib/gpu to version 2.
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3785 f3b2605a-c512-4ea7-a41b-209d697bcdaa
lib/gpu/Makefile.cyg (new file, 72 lines)
@@ -0,0 +1,72 @@
+# /* ----------------------------------------------------------------------
+#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+#    http://lammps.sandia.gov, Sandia National Laboratories
+#    Steve Plimpton, sjplimp@sandia.gov
+#
+#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
+#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+#    certain rights in this software.  This software is distributed under
+#    the GNU General Public License.
+#
+#    See the README file in the top-level LAMMPS directory.
+# ------------------------------------------------------------------------- */
+#
+# /* ----------------------------------------------------------------------
+#    Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+#                          Peng Wang (Nvidia), penwang@nvidia.com
+#                          Paul Crozier (SNL), pscrozi@sandia.gov
+# ------------------------------------------------------------------------- */
+
+BIN_DIR = .
+OBJ_DIR = .
+AR = ar
+CUDA_CPP = /cygdrive/c/CUDA/bin/nvcc -I/cygdrive/c/CUDA/include -O3 -DWINDLL -DUNIX -Xptxas -v --use_fast_math
+CUDA_ARCH = -arch=sm_13
+CUDA_PREC = -D_SINGLE_SINGLE
+CUDA_LINK = -L/cygdrive/c/CUDA/lib -lcudart $(CUDA_LIB)
+
+CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)
+
+CUDA_LIB = $(OBJ_DIR)/gpu.dll
+
+# Headers for CUDA Stuff
+NVC_H  = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
+# Headers for Pair Stuff
+PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
+# Dependencies for the Texture Tar
+TAR_H  = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
+         lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
+         gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu
+
+ALL_H = $(NVC_H) $(PAIR_H)
+
+EXECS = $(BIN_DIR)/nvc_get_devices
+OBJS  = $(OBJ_DIR)/nvc_device.obj $(OBJ_DIR)/pair_gpu_nbor.obj \
+        $(OBJ_DIR)/pair_tex_tar.obj $(OBJ_DIR)/pair_gpu_cell.obj
+
+all: $(CUDA_LIB) $(EXECS)
+
+$(OBJ_DIR)/nvc_device.obj : nvc_device.cu $(NVC_H)
+	$(CUDA) -o $@ -c nvc_device.cu
+
+$(OBJ_DIR)/pair_gpu_nbor.obj: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
+	$(CUDA) -o $@ -c pair_gpu_nbor.cu
+
+$(OBJ_DIR)/pair_tex_tar.obj: $(TAR_H)
+	$(CUDA) -o $@ -c pair_tex_tar.cu
+
+$(OBJ_DIR)/pair_gpu_cell.obj: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
+	$(CUDA) -o $@ -c pair_gpu_cell.cu
+
+$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.obj
+	$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.obj
+
+$(CUDA_LIB): $(OBJS) $(TAR_H)
+	$(CUDA) -o $@ -shared $(OBJS)
+
+clean:
+	rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.exe *.exp *.lib *.dll *.linkinfo
+
+veryclean: clean
+	rm -rf *~ *.linkinfo
Makefile
@@ -1,25 +1,29 @@
-#***************************************************************************
-#                                  Makefile
-#                             -------------------
-#                               W. Michael Brown
-#
-# _________________________________________________________________________
-#                Build for the LAMMPS GPU Force Library
-#
-# _________________________________________________________________________
-#
-#    begin                : Tue June 23 2009
-#    copyright            : (C) 2009 by W. Michael Brown
-#    email                : wmbrown@sandia.gov
-# ***************************************************************************/
+# /* ----------------------------------------------------------------------
+#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+#    http://lammps.sandia.gov, Sandia National Laboratories
+#    Steve Plimpton, sjplimp@sandia.gov
+#
+#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
+#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+#    certain rights in this software.  This software is distributed under
+#    the GNU General Public License.
+#
+#    See the README file in the top-level LAMMPS directory.
+# ------------------------------------------------------------------------- */
+#
+# /* ----------------------------------------------------------------------
+#    Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+#                          Peng Wang (Nvidia), penwang@nvidia.com
+#                          Paul Crozier (SNL), pscrozi@sandia.gov
+# ------------------------------------------------------------------------- */

 BIN_DIR = .
 OBJ_DIR = .
 AR = ar
 CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
-CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
+CUDA_ARCH = -arch=sm_13
 CUDA_PREC = -D_SINGLE_SINGLE
-CUDA_LINK = -L/usr/local/cuda/lib64 -lcudart $(CUDA_LIB)
+CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB)

 CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)

@@ -28,7 +32,7 @@ CUDA_LIB = $(OBJ_DIR)/libgpu.a
 # Headers for CUDA Stuff
 NVC_H  = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
 # Headers for Pair Stuff
-PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h
+PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
 # Dependencies for the Texture Tar
 TAR_H  = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
          lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \

@@ -38,7 +42,7 @@ ALL_H = $(NVC_H) $(PAIR_H)

 EXECS = $(BIN_DIR)/nvc_get_devices
 OBJS  = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/pair_gpu_nbor.cu_o \
-        $(OBJ_DIR)/pair_tex_tar.cu_o
+        $(OBJ_DIR)/pair_tex_tar.cu_o $(OBJ_DIR)/pair_gpu_cell.cu_o

 all: $(CUDA_LIB) $(EXECS)

@@ -51,6 +55,9 @@ $(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor
 $(OBJ_DIR)/pair_tex_tar.cu_o: $(TAR_H)
 	$(CUDA) -o $@ -c pair_tex_tar.cu

+$(OBJ_DIR)/pair_gpu_cell.cu_o: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
+	$(CUDA) -o $@ -c pair_gpu_cell.cu
+
 $(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o
 	$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o
README
@@ -1,29 +1,25 @@
-/***************************************************************************
-                                   README
-                             -------------------
-                               W. Michael Brown
-
-                 README for building LAMMPS GPU Library
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Thu Jun 25 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-  ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */

 GENERAL NOTES

-This library, pair_gpu_lib.a, provides routines for GPGPU acceleration
+This library, libgpu.a, provides routines for GPU acceleration
 of LAMMPS pair styles. Currently, only CUDA enabled GPUs are
 supported. Compilation of this library requires installing the CUDA
 GPU driver and CUDA toolkit for your operating system. In addition to

@@ -33,14 +29,14 @@ devices on your system.

 NOTE: Installation of the CUDA SDK is not required.

-Current pair styles supporting GPU Accelartion:
+Current pair styles supporting GPU acceleration:

 1. lj/cut/gpu
 2. gayberne/gpu

 MULTIPLE LAMMPS PROCESSES

-When using GPGPU acceleration, you are restricted to one physical GPU
+When using GPU acceleration, you are restricted to one physical GPU
 per LAMMPS process. This can be multiple GPUs on a single node or
 across multiple nodes. Intructions on GPU assignment can be found in
 the LAMMPS documentation.
@@ -66,6 +62,9 @@ the CUDA_PREC variable:

 CUDA_PREC = -D_DOUBLE_DOUBLE  # Double precision for all calculations
 CUDA_PREC = -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double

+NOTE: For the lj/cut pair style, only single precision will be used, even
+      if double precision is specified.
+
 NOTE: Double precision is only supported on certain GPUS (with
       compute capability>=1.3).
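These flags choose the floating-point types the whole library is compiled
with. As a rough sketch (an assumed mapping, not quoted from this commit),
the -D_* precision defines would translate into the PRECISION/ACC_PRECISION
template arguments that appear in calls such as
_gb_gpu_gayberne<PRECISION,ACC_PRECISION> later in this diff:

    /* Illustrative sketch only: one plausible macro-to-typedef mapping for
       the precision flags; the library's real headers may differ. */
    #ifdef _SINGLE_SINGLE
    typedef float  PRECISION;      /* store and compute in single          */
    typedef float  ACC_PRECISION;  /* accumulate forces/energies in single */
    #endif
    #ifdef _SINGLE_DOUBLE
    typedef float  PRECISION;      /* compute in single ...                */
    typedef double ACC_PRECISION;  /* ... but accumulate in double         */
    #endif
    #ifdef _DOUBLE_DOUBLE
    typedef double PRECISION;      /* double precision everywhere          */
    typedef double ACC_PRECISION;
    #endif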
@@ -74,14 +73,14 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,

 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
 package has been installed before installing the GPU package in LAMMPS.

 GPU MEMORY

-Upon initialization of the pair style, the library will reserve memory
-for 64K atoms per GPU or 70% of each cards GPU memory, whichever value
+Upon initialization of the gayberne/gpu pair style, the library will reserve
+memory for 64K atoms per GPU or 70% of each cards GPU memory, whichever value
 is limiting. The value of 70% can be changed by editing the
-PERCENT_GPU_MEMORY definition in the source file. The value of 64K
-cannot be increased and is the maximum number of atoms allowed per
+PERCENT_GPU_MEMORY definition in the source file. For gayberne/gpu, the value
+of 64K cannot be increased and is the maximum number of atoms allowed per
 GPU. Using the 'neigh_modify one' modifier in your LAMMPS input script
 can help to increase maximum number of atoms per GPU for cards with
 limited memory.
gb_gpu.cu
@@ -1,27 +1,21 @@
-/***************************************************************************
-                                 gb_gpu.cu
-                             -------------------
-                               W. Michael Brown
-
-  Gay-Berne anisotropic potential GPU calcultation
-
-   *** Force decomposition by Atom Version ***
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Tue Jun 23 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-  ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */

 #include <iostream>
 #include <cassert>
@@ -190,30 +184,30 @@ inline string gb_gpu_toa(const t& in) {
 // ---------------------------------------------------------------------------
 // Return string with GPU info
 // ---------------------------------------------------------------------------
-string gb_gpu_name(const int id, const int max_nbors) {
-  string name=GBMF[0].gpu.name(id)+", "+
+EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
+  string sname=GBMF[0].gpu.name(id)+", "+
               gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
               gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
               gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+
               gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
                                                max_nbors))+" Atoms";
-  return name;
+  strcpy(name,sname.c_str());
 }

 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
+EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
                  const double upsilon, const double mu, double **shape,
                  double **well, double **cutsq, double **sigma,
                  double **epsilon, double *host_lshape, int **form,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int max_nbors, const int thread, const int gpu_id) {
   assert(thread<MAX_GPU_THREADS);

   GBMF[thread].gpu.init();

   if (GBMF[thread].gpu.num_devices()==0)
     return false;
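The gb_gpu_name change in the hunk above swaps a returned std::string for a
caller-supplied char buffer, so the routine can be exported through a plain
C ABI (the EXTERN macro defined in lj_gpu.cu below). A minimal caller-side
sketch, with a hypothetical helper that is not part of this commit:

    #include <cstdio>

    void gb_gpu_name(const int id, const int max_nbors, char *name);

    void print_gpu_info(int id) {
      char name[256];              // buffer is owned by the caller now
      gb_gpu_name(id, 128, name);  // 128 = assumed max-neighbor value
      std::printf("%s\n", name);
    }

Note that strcpy places the length contract on the caller: the buffer must
be large enough for the formatted device summary.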
@@ -227,7 +221,7 @@ bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
 // ---------------------------------------------------------------------------
 // Clear memory on host and device
 // ---------------------------------------------------------------------------
-void gb_gpu_clear(const int thread) {
+EXTERN void gb_gpu_clear(const int thread) {
   GBMF[thread].clear();
 }

@@ -262,7 +256,7 @@ inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
   atom.time_atom.stop();
 }

-void gb_gpu_atom(double **host_x, double **host_quat,
+EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
                  const int *host_type, const bool rebuild, const int thread) {
   _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
                GBMF[thread].pair_stream);

@@ -327,7 +321,7 @@ int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
   return ilist;
 }

-int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
+EXTERN int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
                          int *ilist, const int *numj, const int *type,
                          const int thread, bool &success) {
   return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,

@@ -340,7 +334,7 @@ int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
 // ---------------------------------------------------------------------------
 template <class gbmtyp>
 void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
                    const bool eflag) {
   gbm.nbor.time_nbor.add_to_total();
   // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed

@@ -350,8 +344,8 @@ void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
   gbm.nbor.time_nbor.stop();
 }

-void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
+EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
                   const int thread) {
   _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
 }

@@ -453,7 +447,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
   }
 }

-void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
+EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
                      const int thread) {
   _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
 }

@@ -490,7 +484,7 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
   return evdw;
 }

-double gb_gpu_forces(double **f, double **tor, const int *ilist,
+EXTERN double gb_gpu_forces(double **f, double **tor, const int *ilist,
                      const bool eflag, const bool vflag, const bool eflag_atom,
                      const bool vflag_atom, double *eatom, double **vatom,
                      double *virial, const int thread) {

@@ -499,7 +493,7 @@ double gb_gpu_forces(double **f, double **tor, const int *ilist,
                      vflag_atom,eatom,vatom,virial);
 }

-void gb_gpu_time(const int i) {
+EXTERN void gb_gpu_time(const int i) {
   cout.precision(4);
   cout << "Atom copy:     " << GBMF[i].atom.time_atom.total_seconds()
        << " s.\n"

@@ -515,10 +509,10 @@ void gb_gpu_time(const int i) {
        << " s.\n";
 }

-int gb_gpu_num_devices() {
+EXTERN int gb_gpu_num_devices() {
   return GBMF[0].gpu.num_devices();
 }

-double gb_gpu_bytes() {
+EXTERN double gb_gpu_bytes() {
   return GBMF[0].host_memory_usage();
 }
gb_gpu_extra.h
@@ -1,25 +1,21 @@
-/***************************************************************************
-                               gb_gpu_extra.h
-                             -------------------
-                               W. Michael Brown
-
-  Inline GPU kernel routines ala math_extra for the CPU.
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Tue Jun 23 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-  ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */

 #ifndef GB_GPU_EXTRA_H
 #define GB_GPU_EXTRA_H
gb_gpu_kernel.cu
@@ -1,26 +1,21 @@
-/***************************************************************************
-                               gb_gpu_kernel.cu
-                             -------------------
-                               W. Michael Brown
-
-  Routines that actually perform the force/torque computation
-
-   *** Force Decomposition by Atom Version ***
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Tue Jun 23 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-  ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */

 #ifndef GB_GPU_KERNEL
 #define GB_GPU_KERNEL
gb_gpu_memory.cu
@@ -1,25 +1,21 @@
-/***************************************************************************
-                              gb_gpu_memory.cu
-                             -------------------
-                               W. Michael Brown
-
-  Global variables for GPU Gayberne Library
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Thu Jun 25 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-  ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */

 #include "gb_gpu_memory.h"
 #define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
gb_gpu_memory.h
@@ -1,25 +1,21 @@
-/***************************************************************************
-                               gb_gpu_memory.h
-                             -------------------
-                               W. Michael Brown
-
-  Global variables for GPU Gayberne Library
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Thu Jun 25 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-  ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */

 #ifndef GB_GPU_MEMORY_H
 #define GB_GPU_MEMORY_H
lj_gpu.cu
@@ -1,25 +1,21 @@
-/***************************************************************************
-                                 lj_gpu.cu
-                             -------------------
-                               W. Michael Brown
-
-  Lennard-Jones potential GPU calcultation
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Tue Aug 4 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-  ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */

 #include <iostream>
 #include <cassert>
@@ -27,18 +23,39 @@
 #include "nvc_timer.h"
 #include "nvc_device.h"
 #include "pair_gpu_texture.h"
+#include "pair_gpu_cell.h"
 #include "lj_gpu_memory.cu"
 #include "lj_gpu_kernel.h"

+#ifdef WINDLL
+#include <windows.h>
+BOOL APIENTRY DllMain(HANDLE hModule, DWORD dwReason, LPVOID lpReserved)
+{
+  return TRUE;
+}
+#endif
+
+#ifdef WINDLL
+#define EXTERN extern "C" __declspec(dllexport)
+#else
+#define EXTERN
+#endif
+
 using namespace std;

 static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
 #define LJMT LJ_GPU_Memory<numtyp,acctyp>

+static float kernelTime = 0.0;
+static int ncell1D;
+static float *energy, *d_energy;
+static float3 *d_force, *f_temp, *v_temp, *d_virial;
+static cell_list cell_list_gpu;
+
 // ---------------------------------------------------------------------------
 // Convert something to a string
 // ---------------------------------------------------------------------------
 #include <sstream>

 template <class t>
 inline string lj_gpu_toa(const t& in) {
   ostringstream o;
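The WINDLL block added above is what lets the same source build either the
static libgpu.a or the gpu.dll target from Makefile.cyg: DllMain gives
Windows a no-op entry point, and EXTERN marks each public function as a
C-linkage DLL export. The extern "C" part matters because it suppresses C++
name mangling, so the exported symbols keep predictable names. A sketch of
the effect on one declaration (illustrative only):

    // With -DWINDLL:   extern "C" __declspec(dllexport) int lj_gpu_num_devices();
    // Without WINDLL:  int lj_gpu_num_devices();
    EXTERN int lj_gpu_num_devices();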
@@ -50,113 +67,54 @@ inline string lj_gpu_toa(const t& in) {
 // ---------------------------------------------------------------------------
 // Return string with GPU info
 // ---------------------------------------------------------------------------
-string lj_gpu_name(const int id, const int max_nbors) {
-  string name=LJMF.gpu.name(id)+", "+
+EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name) {
+  string sname=LJMF.gpu.name(id)+", "+
               lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
               lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
-              lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+
-              lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id),
-                                            max_nbors))+" Atoms";
-  return name;
+              lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ";
+  strcpy(name,sname.c_str());
 }

 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
-                 double **epsilon, double **host_lj1, double **host_lj2,
-                 double **host_lj3, double **host_lj4, double **offset,
-                 double *special_lj, const int max_nbors, const int gpu_id) {
+EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
+                        double **epsilon, double **host_lj1, double **host_lj2,
+                        double **host_lj3, double **host_lj4, double **offset,
+                        double *special_lj, double *boxlo, double *boxhi,
+                        double cell_size, double skin,
+                        const int max_nbors, const int gpu_id) {
   LJMF.gpu.init();

   if (LJMF.gpu.num_devices()==0)
     return false;

   ij_size=IJ_SIZE;
-  return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
-                   host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
+
+  bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
+                       host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
+
+  ncell1D = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
+
+  init_cell_list_const(cell_size, skin, boxlo, boxhi);
+
+  return ret;
 }

 // ---------------------------------------------------------------------------
 // Clear memory on host and device
 // ---------------------------------------------------------------------------
-void lj_gpu_clear() {
+EXTERN void lj_gpu_clear() {
+  free(energy);
+  free(v_temp);
+  cudaFreeHost(f_temp);
+  cudaFree(d_force);
+  cudaFree(d_energy);
+  cudaFree(d_virial);
+  clear_cell_list(cell_list_gpu);
+
   LJMF.clear();
 }

-// ---------------------------------------------------------------------------
-// copy atom positions and optionally types to device
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-inline void _lj_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
-                         const int *host_type, const bool rebuild,
-                         cudaStream_t &stream) {
-  atom.time_atom.start();
-  atom.reset_write_buffer();
-
-  // First row of dev_x is x position, second is y, third is z
-  atom.add_atom_data(host_x[0],3);
-  atom.add_atom_data(host_x[0]+1,3);
-  atom.add_atom_data(host_x[0]+2,3);
-
-  int csize=3;
-
-  // If a rebuild occured, copy type data
-  if (rebuild) {
-    atom.add_atom_data(host_type);
-    csize++;
-  }
-
-  atom.copy_atom_data(csize,stream);
-  atom.time_atom.stop();
-}
-
-void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) {
-  _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream);
-}
-
-// ---------------------------------------------------------------------------
-// Signal that we need to transfer a new neighbor list
-// ---------------------------------------------------------------------------
-template <class LJMTyp>
-bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum,
-                         int *ilist, const int *numj) {
-  if (nall>ljm.max_atoms)
-    return false;
-
-  ljm.nbor.time_nbor.start();
-
-  ljm.atom.nall(nall);
-  ljm.atom.inum(inum);
-  ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream);
-
-  ljm.nbor.time_nbor.stop();
-  return true;
-}
-
-bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
-                        const int *numj) {
-  return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj);
-}
-
-// ---------------------------------------------------------------------------
-// Copy a set of ij_size ij interactions to device and compute energies,
-// forces, and torques for those interactions
-// ---------------------------------------------------------------------------
-template <class LJMTyp>
-void _lj_gpu_nbors(LJMTyp &ljm, const int *ij, const int num_ij) {
-  ljm.nbor.time_nbor.add_to_total();
-
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
-
-  memcpy(ljm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
-  ljm.nbor.time_nbor.start();
-  ljm.nbor.add(num_ij,ljm.pair_stream);
-  ljm.nbor.time_nbor.stop();
-}
-
-void lj_gpu_nbors(const int *ij, const int num_ij) {
-  _lj_gpu_nbors(LJMF,ij,num_ij);
-}
-
 // ---------------------------------------------------------------------------
 // Calculate energies and forces for all ij interactions
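To make the cell-grid sizing in lj_gpu_init concrete, here is a worked
example with made-up numbers (not taken from this commit):

    // boxlo[0] = 0.0, boxhi[0] = 30.0, cell_size = 5.0
    // ncell1D = ceil((30.0 - 0.0 + 2.0*5.0) / 5.0) = 8
    // i.e. 6 interior cells plus one padding cell on each side;
    // _lj_gpu_cell below then works with ncell = ncell1D^3 = 512 cells.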
@@ -169,6 +127,7 @@ void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
   int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));

   ljm.time_pair.start();
   if (ljm.shared_types)
     kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
       (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
@@ -181,47 +140,279 @@ void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
        ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
        ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
        vflag, ljm.atom.inum(), ljm.atom.nall());
   ljm.time_pair.stop();
 }

-void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
-  _lj_gpu<PRECISION,ACC_PRECISION>(LJMF,eflag,vflag,rebuild);
-}
-
-// ---------------------------------------------------------------------------
-// Get energies and forces to host
-// ---------------------------------------------------------------------------
-template<class numtyp, class acctyp>
-double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
-                      const bool eflag, const bool vflag, const bool eflag_atom,
-                      const bool vflag_atom, double *eatom, double **vatom,
-                      double *virial) {
-  double evdw;
-
-  ljm.atom.time_answer.start();
-  ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream);
-
-  ljm.atom.time_atom.add_to_total();
-  ljm.nbor.time_nbor.add_to_total();
-  ljm.time_pair.add_to_total();
-  CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream));
-
-  evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
-  ljm.atom.add_forces(ilist,f);
-  ljm.atom.time_answer.stop();
-  ljm.atom.time_answer.add_to_total();
-  return evdw;
-}
-
-double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
-                     const bool vflag, const bool eflag_atom,
-                     const bool vflag_atom, double *eatom, double **vatom,
-                     double *virial) {
-  return _lj_gpu_forces<PRECISION,ACC_PRECISION>
-    (LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial);
-}
+EXTERN void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
+  _lj_gpu<PRECISION,ACC_PRECISION>(LJMF, eflag,vflag,rebuild);
+}
+
+template <class numtyp, class acctyp>
+double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
+                    double **host_x, int *host_type, const int inum,
+                    const int nall, const int ago, const bool eflag, const bool vflag,
+                    const double *boxlo, const double *boxhi)
+{
+  ljm.atom.nall(nall);
+  ljm.atom.inum(inum);
+
+  ljm.nbor.time_nbor.start();
+  ljm.nbor.time_nbor.stop();
+
+  double evdwl=0.0;
+
+  static int buffer = CELL_SIZE;
+  static int ncell = (int)pow((float)ncell1D,3);
+
+  static int first_call = 1;
+
+  // allocate memory on CPU and GPU
+  if (first_call) {
+    energy = (float*) malloc(inum*sizeof(float));
+    v_temp = (float3*)malloc(inum*2*sizeof(float3));
+    cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
+
+    cudaMalloc((void**)&d_force, inum*sizeof(float3));
+    cudaMalloc((void**)&d_energy, inum*sizeof(float));
+    cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
+
+    init_cell_list(cell_list_gpu, nall, ncell, buffer);
+
+    first_call = 0;
+  }
+
+  if (!first_call && ago == 0) {
+    free(energy);
+    free(v_temp);
+    cudaFreeHost(f_temp);
+    cudaFree(d_force);
+    cudaFree(d_energy);
+    cudaFree(d_virial);
+
+    energy = (float*) malloc(inum*sizeof(float));
+    v_temp = (float3*)malloc(inum*2*sizeof(float3));
+    cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
+
+    cudaMalloc((void**)&d_force, inum*sizeof(float3));
+    cudaMalloc((void**)&d_energy, inum*sizeof(float));
+    cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
+
+    clear_cell_list(cell_list_gpu);
+    init_cell_list(cell_list_gpu, nall, ncell, buffer);
+  }
+
+  // build cell-list on GPU
+  ljm.atom.time_atom.start();
+  build_cell_list(host_x[0], host_type, cell_list_gpu,
+                  ncell, ncell1D, buffer, inum, nall, ago);
+  ljm.atom.time_atom.stop();
+
+  ljm.time_pair.start();
+
+#ifdef TIMING
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+#endif
+
+  // call the cell-list force kernel
+  const int BX=BLOCK_1D;
+  dim3 GX(ncell1D, ncell1D*ncell1D);
+  if (eflag == 0 && vflag == 0) {
+    kernel_lj_cell<false,false><<<GX, BX, 0>>>
+      (d_force, d_energy, d_virial,
+       cell_list_gpu.pos,
+       cell_list_gpu.idx,
+       cell_list_gpu.type,
+       cell_list_gpu.natom,
+       inum, nall, ncell);
+  } else {
+    kernel_lj_cell<true,true><<<GX, BX, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES>>>
+      (d_force, d_energy, d_virial,
+       cell_list_gpu.pos,
+       cell_list_gpu.idx,
+       cell_list_gpu.type,
+       cell_list_gpu.natom,
+       inum, nall, ncell);
+  }
+
+#ifdef TIMING
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float kTime;
+  cudaEventElapsedTime(&kTime, start, stop);
+  kernelTime += kTime;
+  printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+#endif
+
+  // copy results from GPU to CPU
+  cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
+  if (eflag) {
+    cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
+    for (int i = 0; i < inum; i++) {
+      evdwl += energy[i];
+    }
+    evdwl *= 0.5f;
+  }
+  if (vflag) {
+    cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
+    for (int i = 0; i < inum; i++) {
+      virial[0] += v_temp[2*i].x;
+      virial[1] += v_temp[2*i].y;
+      virial[2] += v_temp[2*i].z;
+      virial[3] += v_temp[2*i+1].x;
+      virial[4] += v_temp[2*i+1].y;
+      virial[5] += v_temp[2*i+1].z;
+    }
+    for (int i = 0; i < 6; i++)
+      virial[i] *= 0.5f;
+  }
+
+  for (int i = 0; i < inum; i++) {
+    force[i][0] += f_temp[i].x;
+    force[i][1] += f_temp[i].y;
+    force[i][2] += f_temp[i].z;
+  }
+
+  ljm.time_pair.stop();
+
+  ljm.atom.time_atom.add_to_total();
+  ljm.nbor.time_nbor.add_to_total();
+  ljm.time_pair.add_to_total();
+
+  return evdwl;
+}
+
+EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
+                          const int ago, const bool eflag, const bool vflag,
+                          const double *boxlo, const double *boxhi)
+{
+  return _lj_gpu_cell<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
+                                               ago, eflag, vflag, boxlo, boxhi);
+}
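The copy-back section of _lj_gpu_cell above reads the six independent
components of the symmetric per-atom virial out of two float3 slots per
atom. The packing it implies (component names are illustrative, not quoted
from the kernel source):

    // v_temp[2*i]   -> virial[0], virial[1], virial[2]   (e.g. Wxx, Wyy, Wzz)
    // v_temp[2*i+1] -> virial[3], virial[4], virial[5]   (e.g. Wxy, Wxz, Wyz)
    // The trailing *= 0.5 removes pair double counting, since each
    // interaction is accumulated on both partners (same reason as evdwl *= 0.5f).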
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
double _lj_gpu_n2(LJMT &ljm, double **force, double *virial,
                  double **host_x, int *host_type, const int inum, const int nall, const bool eflag, const bool vflag,
                  const double *boxlo, const double *boxhi)
{
  ljm.atom.nall(nall);
  ljm.atom.inum(inum);

  ljm.nbor.time_nbor.start();
  ljm.nbor.time_nbor.stop();

  double evdwl=0.0;

#ifdef NOUSE
  static int first_call = 1;

  if (first_call) {
    energy = (float*) malloc(inum*sizeof(float));
    v_temp = (float3*) malloc(inum*2*sizeof(float3));
    cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
    cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
    cudaMalloc((void**)&d_force, inum*sizeof(float3));
    cudaMalloc((void**)&d_energy, inum*sizeof(float));
    cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
    cudaMalloc((void**)&d_pos, nall*sizeof(float3));
    cudaMalloc((void**)&d_type, nall*sizeof(int));
    first_call = 0;
  }

  ljm.atom.time_atom.start();
  double *atom_pos = host_x[0];
  for (int i = 0; i < 3*nall; i+=3) {
    pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
  }
  cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
  cudaMemcpy(d_type, host_type, nall*sizeof(int), cudaMemcpyHostToDevice);

  ljm.atom.time_atom.stop();

  ljm.time_pair.start();

  // Compute the block size and grid size to keep all cores busy
  const int BX=BLOCK_1D;
  dim3 GX(static_cast<int>(ceil(static_cast<double>(inum)/BX)));

#ifdef TIMING
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);
#endif

  // N^2 force kernel
  kernel_lj_n2<numtyp, acctyp><<<GX, BX>>>(d_force, d_energy, d_virial,
                                           d_pos, d_type, eflag, vflag, inum, nall);

#ifdef TIMING
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  float kTime;
  cudaEventElapsedTime(&kTime, start, stop);
  kernelTime += kTime;
  printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
#endif

  // copy results from GPU to CPU
  cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
  if (eflag) {
    cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < inum; i++) {
      evdwl += energy[i];
    }
    evdwl *= 0.5f;
  }
  if (vflag) {
    cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
    for (int i = 0; i < inum; i++) {
      virial[0] += v_temp[2*i].x;
      virial[1] += v_temp[2*i].y;
      virial[2] += v_temp[2*i].z;
      virial[3] += v_temp[2*i+1].x;
      virial[4] += v_temp[2*i+1].y;
      virial[5] += v_temp[2*i+1].z;
    }
    for (int i = 0; i < 6; i++)
      virial[i] *= 0.5f;
  }

  for (int i = 0; i < inum; i++) {
    force[i][0] += f_temp[i].x;
    force[i][1] += f_temp[i].y;
    force[i][2] += f_temp[i].z;
  }
#endif
  ljm.time_pair.stop();

  ljm.atom.time_atom.add_to_total();
  ljm.nbor.time_nbor.add_to_total();
  ljm.time_pair.add_to_total();

  return evdwl;
}

EXTERN double lj_gpu_n2(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
                        const bool eflag, const bool vflag,
                        const double *boxlo, const double *boxhi)
{
  return _lj_gpu_n2<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
                                             eflag, vflag, boxlo, boxhi);
}

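The library now exposes two force paths: lj_gpu_cell, the cell-list path, and lj_gpu_n2, a brute-force O(N^2) reference useful for validation on small systems. Note that in this revision the working body of _lj_gpu_n2 is fenced behind #ifdef NOUSE, so unless that macro is defined the function reduces to timer bookkeeping and returns 0.0 (and time_pair.start() sits inside the fenced block while time_pair.stop() does not). A caller could select between the two paths like this (sketch; the use_n2 switch is hypothetical):

    double evdwl;
    if (use_n2)   // hypothetical debug switch for the brute-force reference
      evdwl = lj_gpu_n2(f, virial, x, type, inum, nall,
                        eflag, vflag, boxlo, boxhi);
    else
      evdwl = lj_gpu_cell(f, virial, x, type, inum, nall,
                          ago, eflag, vflag, boxlo, boxhi);
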
EXTERN void lj_gpu_time() {
  cout.precision(4);
  cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n";
  cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
@ -229,10 +420,10 @@ void lj_gpu_time() {
  cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n";
}

EXTERN int lj_gpu_num_devices() {
  return LJMF.gpu.num_devices();
}

EXTERN double lj_gpu_bytes() {
  return LJMF.host_memory_usage();
}

lib/gpu/lj_gpu_kernel.cu
@ -1,29 +1,220 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL

/* Cell list version of LJ kernel */
template<bool eflag, bool vflag>
__global__ void kernel_lj_cell(float3 *force3,
                               float *energy, float3 *virial,
                               float3 *cell_list, unsigned int *cell_idx,
                               int *cell_type, int *cell_atom,
                               const int inum, const int nall, const int ncell)
{
  // calculate 3D block idx from 2d block
  int bx = blockIdx.x;
  int by = blockIdx.y % gridDim.x;
  int bz = blockIdx.y / gridDim.x;

  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);

  __shared__ int typeSh[CELL_SIZE];
  __shared__ float posSh[CELL_SIZE*3];
  __shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];

  extern __shared__ float smem[];

  __shared__ float *lj3Sh;
  __shared__ float *lj4Sh;
  __shared__ float *offsetSh;

  // load force parameters into shared memory
  for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
    int itype = i/MAX_SHARED_TYPES;
    int jtype = i%MAX_SHARED_TYPES;
    cutsqSh[i] = _cutsq_<float>(itype,jtype);
    lj1Sh[i]   = _lj1_<float>(itype,jtype).x;
    lj2Sh[i]   = _lj1_<float>(itype,jtype).y;
  }

  // Only allocate shared memory when needed,
  // this reduces shared memory limitation on occupancy
  if (eflag || vflag) {
    lj3Sh = smem;
    lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
    offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
    for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
      int itype = i/MAX_SHARED_TYPES;
      int jtype = i%MAX_SHARED_TYPES;
      lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
      lj4Sh[i] = _lj3_<float>(itype,jtype).y;
      offsetSh[i]= _offset_<float>(itype,jtype);
    }
  }

  __syncthreads();

  int nborz0 = max(bz-1,0), nborz1 = min(bz+1, gridDim.x-1),
      nbory0 = max(by-1,0), nbory1 = min(by+1, gridDim.x-1),
      nborx0 = max(bx-1,0), nborx1 = min(bx+1, gridDim.x-1);

  for (int ii = 0; ii < ceil((float)(cell_atom[cid])/BLOCK_1D); ii++) {
    float3 f = {0.0f, 0.0f, 0.0f};
    float ener = 0.0f;
    float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
    int itype;
    float ix, iy, iz;
    int i = tid + ii*BLOCK_1D;
    unsigned int answer_pos = cell_idx[cid*CELL_SIZE+i];

    // load current cell atom position and type into sMem
    for (int j = tid; j < cell_atom[cid]; j += BLOCK_1D) {
      int pid = cid*CELL_SIZE + j;
      float3 pos = cell_list[pid];
      posSh[j            ] = pos.x;
      posSh[j+  CELL_SIZE] = pos.y;
      posSh[j+2*CELL_SIZE] = pos.z;
      typeSh[j]            = cell_type[pid];
    }
    __syncthreads();
    if (answer_pos < inum) {
      itype = typeSh[i];
      ix = posSh[i            ];
      iy = posSh[i+  CELL_SIZE];
      iz = posSh[i+2*CELL_SIZE];

      // compute force from current cell
      for (int j = 0; j < cell_atom[cid]; j++) {
        if (j == i) continue;
        float delx = ix - posSh[j            ];
        float dely = iy - posSh[j+  CELL_SIZE];
        float delz = iz - posSh[j+2*CELL_SIZE];
        int jtype = typeSh[j];
        int mtype = itype + jtype*MAX_SHARED_TYPES;
        float r2inv = delx*delx + dely*dely + delz*delz;

        if (r2inv < cutsqSh[mtype]) {
          r2inv = 1.0f/r2inv;
          float r6inv = r2inv * r2inv * r2inv;
          float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
          f.x += delx * force;
          f.y += dely * force;
          f.z += delz * force;

          if (eflag) {
            float e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
            ener += (e - offsetSh[mtype]);
          }

          if (vflag) {
            v0.x += delx*delx*force;
            v0.y += dely*dely*force;
            v0.z += delz*delz*force;
            v1.x += delx*dely*force;
            v1.y += delx*delz*force;
            v1.z += dely*delz*force;
          }
        }
      }
    }
    __syncthreads();

    // compute force from neighboring cells
    for (int nborz = nborz0; nborz <= nborz1; nborz++) {
      for (int nbory = nbory0; nbory <= nbory1; nbory++) {
        for (int nborx = nborx0; nborx <= nborx1; nborx++) {
          if (nborz == bz && nbory == by && nborx == bx) continue;

          // compute cell id
          int cid_nbor = nborx + INT_MUL(nbory,gridDim.x) +
                         INT_MUL(nborz,gridDim.x*gridDim.x);

          // load neighbor cell position and type into smem
          for (int j = tid; j < cell_atom[cid_nbor]; j += BLOCK_1D) {
            int pid = INT_MUL(cid_nbor,CELL_SIZE) + j;
            float3 pos = cell_list[pid];
            posSh[j            ] = pos.x;
            posSh[j+  CELL_SIZE] = pos.y;
            posSh[j+2*CELL_SIZE] = pos.z;
            typeSh[j]            = cell_type[pid];
          }
          __syncthreads();
          // compute force
          if (answer_pos < inum) {
            for (int j = 0; j < cell_atom[cid_nbor]; j++) {
              float delx = ix - posSh[j            ];
              float dely = iy - posSh[j+  CELL_SIZE];
              float delz = iz - posSh[j+2*CELL_SIZE];
              int jtype = typeSh[j];
              int mtype = itype + jtype*MAX_SHARED_TYPES;
              float r2inv = delx*delx + dely*dely + delz*delz;

              if (r2inv < cutsqSh[mtype]) {
                r2inv = 1.0f/r2inv;
                float r6inv = r2inv * r2inv * r2inv;
                float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
                f.x += delx * force;
                f.y += dely * force;
                f.z += delz * force;

                if (eflag) {
                  float e=r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
                  ener += (e-offsetSh[mtype]);
                }
                if (vflag) {
                  v0.x += delx*delx*force;
                  v0.y += dely*dely*force;
                  v0.z += delz*delz*force;
                  v1.x += delx*dely*force;
                  v1.y += delx*delz*force;
                  v1.z += dely*delz*force;
                }
              }
            }
          }
          __syncthreads();
        }
      }
    }

    if (answer_pos < inum) {
      force3[answer_pos] = f;
      if (eflag)
        energy[answer_pos] = ener;
      if (vflag) {
        virial[2*answer_pos] = v0;
        virial[2*answer_pos+1] = v1;
      }
    }
  }
}

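The kernel above runs one thread block per cell; the y dimension of the 2D launch grid packs the remaining two axes of the 3D cell grid, which the kernel unpacks via by = blockIdx.y % gridDim.x and bz = blockIdx.y / gridDim.x. The actual launch site lives in _lj_gpu_cell (not shown in this hunk); a sketch consistent with the index math and the eflag||vflag shared-memory branch above, with d_force/d_energy/d_virial as assumed device buffers:

    dim3 block(BLOCK_1D);
    dim3 grid(ncell1D, ncell1D*ncell1D);   // x -> bx; y packs (by, bz)
    // energy+virial instantiation: the lj3/lj4/offset tables then live in
    // dynamic shared memory, per the eflag||vflag branch in the kernel
    size_t smem = 3*MAX_SHARED_TYPES*MAX_SHARED_TYPES*sizeof(float);
    kernel_lj_cell<true,true><<<grid, block, smem>>>(d_force, d_energy, d_virial,
        cell_list_gpu.pos, cell_list_gpu.idx, cell_list_gpu.type,
        cell_list_gpu.natom, inum, nall, ncell);
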
/* Neighbor list version of LJ kernel */
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
                          const int *dev_ij, const int nbor_pitch, acctyp *ans,
@ -36,7 +227,6 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  ii+=INT_MUL(blockIdx.x,blockDim.x);

  if (ii<inum) {

@ -158,7 +348,6 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
    }
  }
  ii+=INT_MUL(blockIdx.x,blockDim.x);

  if (ii<inum) {

@ -184,9 +373,11 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));

    numtyp factor_lj;

    for ( ; list<list_end; list++) {

      int j = *list;

      if (j < nall)
        factor_lj = 1.0;
      else {
@ -247,4 +438,116 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
  } // if ii
}

/* Brute force O(N^2) version of LJ kernel */
template<class numtyp, class acctyp>
__global__ void kernel_lj_n2(float3 *force3,
                             float *energy, float3 *virial,
                             float3 *pos, int *type,
                             const bool eflag, const bool vflag, const int inum, const int nall)
{
  int gid = threadIdx.x + INT_MUL(blockIdx.x, blockDim.x);
  int tid = threadIdx.x;
  __shared__ float posSh[BLOCK_1D*3];
  __shared__ int typeSh[BLOCK_1D];
  __shared__ numtyp cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj3Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj4Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp offsetSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];

  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=tid/MAX_SHARED_TYPES;
    int jtype=tid%MAX_SHARED_TYPES;
    cutsqSh[tid]=_cutsq_<numtyp>(itype,jtype);
    lj1Sh[tid]=_lj1_<numtyp>(itype,jtype).x;
    lj2Sh[tid]=_lj1_<numtyp>(itype,jtype).y;
    lj3Sh[tid]=_lj3_<numtyp>(itype,jtype).x;
    lj4Sh[tid]=_lj3_<numtyp>(itype,jtype).y;
    offsetSh[tid]=_offset_<numtyp>(itype,jtype);
  }
  __syncthreads();

  float3 f = {0.0f, 0.0f, 0.0f};
  float ener = 0.0f;
  float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};

  int itype, jtype;
  int mtype;

  numtyp ix, iy, iz;

  if (gid < inum) {
    ix = pos[gid].x;
    iy = pos[gid].y;
    iz = pos[gid].z;
    itype = type[gid];
  }

  int pid = tid;
  int nIter = ceil((float)nall/BLOCK_1D);
  for (int jj = 0; jj < nIter; jj++, pid += BLOCK_1D) {

    if (pid < nall) {
      posSh[tid           ] = pos[pid].x;
      posSh[tid+  BLOCK_1D] = pos[pid].y;
      posSh[tid+2*BLOCK_1D] = pos[pid].z;
      typeSh[tid]           = type[pid];
    }
    __syncthreads();

    if (gid < inum) {
      int pid_j = jj*BLOCK_1D;

      for (int j = 0; j < BLOCK_1D; j++, pid_j++) {
        if (jj == blockIdx.x && tid == j) continue;
        if (pid_j < nall) {
          numtyp delx = ix - posSh[j           ];
          numtyp dely = iy - posSh[j+  BLOCK_1D];
          numtyp delz = iz - posSh[j+2*BLOCK_1D];
          jtype = typeSh[j];
          mtype = itype + jtype*MAX_SHARED_TYPES;
          numtyp r2inv = delx * delx + dely * dely + delz * delz;

          if (r2inv < cutsqSh[mtype]) {
            r2inv = (numtyp)1.0/r2inv;
            numtyp r6inv = r2inv * r2inv * r2inv;
            numtyp force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
            f.x += delx * force;
            f.y += dely * force;
            f.z += delz * force;

            if (eflag) {
              numtyp e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
              ener += (e-offsetSh[mtype]);
            }
            if (vflag) {
              v0.x += delx*delx*force;
              v0.y += dely*dely*force;
              v0.z += delz*delz*force;
              v1.x += delx*dely*force;
              v1.y += delx*delz*force;
              v1.z += dely*delz*force;
            }
          }
        }
      }
    }

    __syncthreads();
  }

  if (gid < inum) {
    if (eflag)
      energy[gid] = ener;
    if (vflag) {
      virial[2*gid  ] = v0;
      virial[2*gid+1] = v1;
    }
    force3[gid] = f;
  }

}

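The launch configuration for this kernel appears in _lj_gpu_n2: one thread per owned atom, BLOCK_1D threads per block. That is also what makes the self-interaction skip correct, as a worked check shows:

    // Why the skip above is safe: the launch uses blockDim.x == BLOCK_1D, so
    //   gid   = blockIdx.x*BLOCK_1D + tid   (atom owned by this thread)
    //   pid_j = jj*BLOCK_1D + j             (atom read from the current tile)
    // and pid_j == gid exactly when jj == blockIdx.x && j == tid,
    // which is the branch tested in the inner loop.
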
#endif

lib/gpu/lj_gpu_memory.cu
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>

lib/gpu/lj_gpu_memory.h
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef LJ_GPU_MEMORY_H
#define LJ_GPU_MEMORY_H
@ -30,8 +26,10 @@
#include "pair_gpu_nbor.h"

#define BLOCK_1D 64
#define CELL_SIZE 64
#define MAX_SHARED_TYPES 8
#define PERCENT_GPU_MEMORY 0.7
#define BIG_NUMBER 100000000

template <class numtyp, class acctyp>
class LJ_GPU_Memory {
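CELL_SIZE (atom slots per cell) and BLOCK_1D (threads per block) are both 64, and the cell kernels stride shared arrays sized by one with loops stepped by the other, so the two constants are effectively coupled. If one wanted to pin that assumption down at compile time, a classic pre-static_assert guard would do (illustrative sketch, not in the original):

    // Fails to compile if the two constants drift apart.
    typedef char cell_size_must_match_block_1d[(CELL_SIZE == BLOCK_1D) ? 1 : -1];
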
lib/gpu/nvc_device.cu
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#include <stdlib.h>
#include <stdio.h>

lib/gpu/nvc_device.h
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef NVC_DEVICE
#define NVC_DEVICE

lib/gpu/nvc_get_devices.h
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#include "nvc_device.h"

lib/gpu/nvc_macros.h
@ -1,3 +1,22 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef NVC_MACROS_H
#define NVC_MACROS_H

lib/gpu/nvc_memory.h
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef NVC_MEMORY_H
#define NVC_MEMORY_H

lib/gpu/nvc_timer.h
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef NVC_TIMER_H
#define NVC_TIMER_H

lib/gpu/nvc_texture_traits.h
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef NVC_TEXTURE_TRAITS_H
#define NVC_TEXTURE_TRAITS_H

lib/gpu/pair_gpu_atom.cu
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#include "pair_gpu_texture.h"
#include "pair_gpu_atom.h"

lib/gpu/pair_gpu_atom.h
@ -1,25 +1,21 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H

454
lib/gpu/pair_gpu_cell.cu
Normal file
@ -0,0 +1,454 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#include "lj_gpu_memory.h"
#include "pair_gpu_cell.h"

static __constant__ float d_boxlo[3];
static __constant__ float d_boxhi[3];
static __constant__ float d_cell_size[1];
static __constant__ float d_skin[1];

void init_cell_list_const(double cell_size, double skin,
                          double *boxlo, double *boxhi)
{
  float cell_size1 = cell_size;
  float skin1 = skin;
  float boxlo1[3], boxhi1[3];
  for (int i = 0; i < 3; i++) {
    boxlo1[i] = boxlo[i];
    boxhi1[i] = boxhi[i];
  }

  cudaMemcpyToSymbol(d_cell_size, &cell_size1, sizeof(float),
                     0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_boxlo, boxlo1, 3*sizeof(float),
                     0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_boxhi, boxhi1, 3*sizeof(float),
                     0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_skin, &skin1, sizeof(float),
                     0, cudaMemcpyHostToDevice);
}
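A sketch of how this constant-memory setup might be called from the host once the simulation box is known; the numeric values are made up for illustration:

    // Illustrative setup call (box bounds and sizes are example values).
    double boxlo[3] = {0.0, 0.0, 0.0}, boxhi[3] = {20.0, 20.0, 20.0};
    double skin = 0.3;               // neighbor skin
    double cell_size = 2.5 + skin;   // e.g. force cutoff plus skin
    init_cell_list_const(cell_size, skin, boxlo, boxhi);
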
__global__ void kernel_set_cell_list(unsigned int *cell_idx)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  cell_idx[gid] = BIG_NUMBER;
}

// build the cell list
__global__ void kernel_build_cell_list(float3 *cell_list,
                                       unsigned int *cell_idx,
                                       int *cell_type,
                                       int *cell_atom,
                                       float3 *pos,
                                       int *type,
                                       const int inum,
                                       const int nall)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  float cSize = d_cell_size[0];
  int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);

  if (gid < nall) {
    float3 p = pos[gid];
    p.x = fmaxf(p.x, d_boxlo[0]-cSize);
    p.x = fminf(p.x, d_boxhi[0]+cSize);
    p.y = fmaxf(p.y, d_boxlo[1]-cSize);
    p.y = fminf(p.y, d_boxhi[1]+cSize);
    p.z = fmaxf(p.z, d_boxlo[2]-cSize);
    p.z = fminf(p.z, d_boxhi[2]+cSize);

    int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncell1D
      + (int)(p.z/cSize + 1.0) * ncell1D * ncell1D;

    int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
    int pid = cell_id*CELL_SIZE + atom_pos;

    cell_list[pid] = pos[gid];
    cell_type[pid] = type[gid];
    cell_idx [pid] = gid;

  }
}
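The grid is padded with one layer of ghost cells on each side, which is why positions are clamped to [boxlo - cell_size, boxhi + cell_size] and why the +1.0 offsets appear in the cell-id computation. The same arithmetic is useful on the host for sizing allocations; a sketch mirroring the kernel:

    // Host-side mirror of the kernel's cell geometry. Note the kernel derives
    // ncell1D from the x extent only, i.e. it assumes a cubic box.
    int ncell1D = (int) ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
    int ncell   = ncell1D*ncell1D*ncell1D;   // total cells incl. ghost layers
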
__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebuild)
{

  float cSize = d_cell_size[0];
  int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);

  // calculate 3D block idx from 2d block
  int bx = blockIdx.x;
  int by = blockIdx.y % gridDim.x;
  int bz = blockIdx.y / gridDim.x;

  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
  int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list

  float skin = d_skin[0];
  float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin;
  float hix  = lowx + cSize + skin;
  float lowy = d_boxlo[1] + (by-1)*cSize - 0.5*skin;
  float hiy  = lowy + cSize + skin;
  float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin;
  float hiz  = lowz + cSize + skin;

  for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
    int pid = pbase + i;
    float3 p = cell_list[pid];
    p.x = fmaxf(p.x, d_boxlo[0]-cSize);
    p.x = fminf(p.x, d_boxhi[0]+cSize);
    p.y = fmaxf(p.y, d_boxlo[1]-cSize);
    p.y = fminf(p.y, d_boxhi[1]+cSize);
    p.z = fmaxf(p.z, d_boxlo[2]-cSize);
    p.z = fminf(p.z, d_boxhi[2]+cSize);

    if (p.x < lowx || p.x > hix || p.y < lowy || p.y > hiy || p.z < lowz || p.z > hiz) {
      *rebuild = 1;
    }
  }

}

__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;

  if (gid < ncell) {
    if (cell_atom[gid] > CELL_SIZE)
      *overflow = 1;
  }
}

__global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos)
{
  // calculate 3D block idx from 2d block
  int bx = blockIdx.x;
  int by = blockIdx.y % gridDim.x;
  int bz = blockIdx.y / gridDim.x;

  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
  int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list

  for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
    int pid = pbase + i;
    cell_list[pid] = pos[cell_idx[pid]];
  }

}

__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit);

void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size)
{
  int i = 0;
  const unsigned int bitSize = sizeof(unsigned int)*8;
  const unsigned int bitStep = 4;
  const int gSize = size/BLOCK_1D;
  while (bitSize > i*bitStep) {
    radixSortBlocks<<<gSize, BLOCK_1D, 2*BLOCK_1D*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
    i++;
  }
}
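This is a least-significant-digit radix sort in 4-bit digits over 32-bit keys, so the loop issues 32/4 = 8 kernel launches. Each launch sorts independently within a BLOCK_1D-sized thread block; assuming the buffer argument equals CELL_SIZE (== BLOCK_1D), as the rest of this file does, that is exactly one sorting block per cell:

    // Pass count: 32 key bits / 4 bits per pass = 8 radixSortBlocks launches.
    // With size == ncell*buffer and buffer == BLOCK_1D, gSize == ncell,
    // i.e. one block per cell, matching the per-cell ordering the force
    // kernel relies on.
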
#ifdef __DEVICE_EMULATION__
#define __SYNC __syncthreads();
#else
#define __SYNC
#endif

#define WARP_SIZE 32

template<class T, int maxlevel>
__device__ T scanwarp(T val, T* sData)
{
  // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp =
  // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1))
  int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
  sData[idx] = 0;
  idx += WARP_SIZE;
  sData[idx] = val; __SYNC

#ifdef __DEVICE_EMULATION__
  T t = sData[idx - 1];  __SYNC
  sData[idx] += t;       __SYNC
  t = sData[idx - 2];    __SYNC
  sData[idx] += t;       __SYNC
  t = sData[idx - 4];    __SYNC
  sData[idx] += t;       __SYNC
  t = sData[idx - 8];    __SYNC
  sData[idx] += t;       __SYNC
  t = sData[idx - 16];   __SYNC
  sData[idx] += t;       __SYNC
#else
  if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } __SYNC
  if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } __SYNC
  if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } __SYNC
  if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } __SYNC
  if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } __SYNC
#endif

  return sData[idx] - val;  // convert inclusive -> exclusive
}

__device__ unsigned int scan(unsigned int idata)
{
  extern __shared__ unsigned int ptr[];

  unsigned int idx = threadIdx.x;

  unsigned int val = idata;

  val = scanwarp<unsigned int, 4>(val, ptr);
  __syncthreads();

  if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
  {
    ptr[idx >> 5] = val + idata;
  }
  __syncthreads();

#ifndef __DEVICE_EMULATION__
  if (idx < WARP_SIZE)
#endif
  {
    ptr[idx] = scanwarp<unsigned int, 2>(ptr[idx], ptr);
  }
  __syncthreads();

  val += ptr[idx >> 5];

  return val;
}

__device__ unsigned int rank(unsigned int preds)
{
  unsigned int address = scan(preds);

  __shared__ unsigned int numtrue;
  if (threadIdx.x == BLOCK_1D - 1)
  {
    numtrue = address + preds;
  }
  __syncthreads();

  unsigned int rank;
  unsigned int idx = threadIdx.x;
  rank = (preds) ? address : numtrue + idx - address;

  return rank;
}

__device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit)
{
  extern __shared__ unsigned int sMem1[];
  __shared__ float sMem2[BLOCK_1D];
  __shared__ int sMem3[BLOCK_1D];

  int tid = threadIdx.x;

  for(unsigned int shift = startbit; shift < (startbit + nbits); ++shift) {
    unsigned int lsb;
    lsb = !(((*key) >> shift) & 0x1);

    unsigned int r;

    r = rank(lsb);

    // This arithmetic strides the ranks across 4 CTA_SIZE regions
    sMem1[r] = *key;
    __syncthreads();

    // The above allows us to read without 4-way bank conflicts:
    *key = sMem1[tid];
    __syncthreads();

    sMem2[r] = (*value1).x;
    __syncthreads();
    (*value1).x = sMem2[tid];
    __syncthreads();

    sMem2[r] = (*value1).y;
    __syncthreads();
    (*value1).y = sMem2[tid];
    __syncthreads();

    sMem2[r] = (*value1).z;
    __syncthreads();
    (*value1).z = sMem2[tid];
    __syncthreads();

    sMem3[r] = *value2;
    __syncthreads();
    *value2 = sMem3[tid];
    __syncthreads();

  }

}

__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit)
{

  extern __shared__ unsigned int sMem[];

  int gid = threadIdx.x + blockIdx.x * blockDim.x;
  unsigned int key;
  float3 value1;
  int value2;
  key = keys[gid];
  value1 = values1[gid];
  value2 = values2[gid];
  __syncthreads();

  radixSortBlock(&key, &value1, &value2, nbits, startbit);

  keys[gid] = key;
  values1[gid] = value1;
  values2[gid] = value2;
}
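rank() implements the stable split at the heart of each radix pass: threads whose predicate is true are packed to the front in order, and the rest follow, also in order. With address the exclusive prefix sum of the predicate bits and numtrue their total (captured by the last thread), a worked 4-thread example:

    // preds   = {1, 0, 1, 0}   (in radixSortBlock, preds is the COMPLEMENT of
    //                           the key's digit bit, so 0-digits come first)
    // address = {0, 1, 1, 2}   (exclusive prefix sum of preds)
    // numtrue = address[3] + preds[3] = 2
    // rank    = preds ? address : numtrue + idx - address
    //         = {0, 2, 1, 3}   -> true-predicate threads land in slots 0..1,
    //                             the others in 2..3, both groups stable,
    //                             giving an ascending sort over all passes.
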
static float3 *d_pos, *pos_temp;
static int *d_type;
static int *d_overflow, *d_rebuild;

void init_cell_list(cell_list &cell_list_gpu,
                    const int nall,
                    const int ncell,
                    const int buffer)
{
  cudaMalloc((void**)&(cell_list_gpu.pos),   ncell*buffer*sizeof(float3));
  cudaMalloc((void**)&(cell_list_gpu.idx),   ncell*buffer*sizeof(unsigned int));
  cudaMalloc((void**)&(cell_list_gpu.type),  ncell*buffer*sizeof(int));
  cudaMalloc((void**)&(cell_list_gpu.natom), ncell*sizeof(int));

  cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
  cudaMalloc((void**)&d_pos, nall*sizeof(float3));
  cudaMalloc((void**)&d_type, nall*sizeof(int));
  cudaMalloc((void**)&d_overflow, sizeof(int));
  cudaMalloc((void**)&d_rebuild, sizeof(int));

  cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));
  cudaMemset(cell_list_gpu.pos, 0, ncell*buffer*sizeof(float3));
}

void clear_cell_list(cell_list &cell_list_gpu)
{
  cudaFree(cell_list_gpu.pos);
  cudaFree(cell_list_gpu.idx);
  cudaFree(cell_list_gpu.natom);
  cudaFree(cell_list_gpu.type);

  cudaFreeHost(pos_temp);
  cudaFree(d_pos);
  cudaFree(d_type);
  cudaFree(d_overflow);
  cudaFree(d_rebuild);
}
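A host-side sketch of allocating and releasing the list with the geometry computed as in kernel_build_cell_list; passing CELL_SIZE as the buffer argument matches the fixed per-cell slot stride the kernels use:

    cell_list cell_list_gpu;   // struct declared in pair_gpu_cell.h
    int ncell1D = (int) ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
    int ncell   = ncell1D*ncell1D*ncell1D;
    init_cell_list(cell_list_gpu, nall, ncell, CELL_SIZE);
    // ... timestep loop using build_cell_list ...
    clear_cell_list(cell_list_gpu);
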
void build_cell_list(double *atom_pos, int *atom_type,
|
||||||
|
cell_list &cell_list_gpu,
|
||||||
|
const int ncell, const int ncell1D, const int buffer,
|
||||||
|
const int inum, const int nall, const int ago)
|
||||||
|
{
|
||||||
|
cudaMemset(d_overflow, 0, sizeof(int));
|
||||||
|
cudaMemset(d_rebuild, 0, sizeof(int));
|
||||||
|
|
||||||
|
// copy position and type to GPU
|
||||||
|
for (int i = 0; i < 3*nall; i+=3) {
|
||||||
|
pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
|
||||||
|
}
|
||||||
|
cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpy(d_type, atom_type, nall*sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
static int first_build = 1;
|
||||||
|
int rebuild = 0;
|
||||||
|
|
||||||
|
// copy the last built cell-list and test whether it needs to be rebuilt
|
||||||
|
if (!first_build) {
|
||||||
|
dim3 block(BLOCK_1D);
|
||||||
|
dim3 grid(ncell1D, ncell1D*ncell1D);
|
||||||
|
kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
|
||||||
|
cell_list_gpu.idx,
|
||||||
|
cell_list_gpu.natom, d_pos);
|
||||||
|
cudaMemset(d_rebuild, 0, sizeof(int));
|
||||||
|
int *temp = (int*)malloc(sizeof(int)*ncell);
|
||||||
|
kernel_test_rebuild<<<grid, block>>>(cell_list_gpu.pos,
|
||||||
|
cell_list_gpu.natom,
|
||||||
|
d_rebuild);
|
||||||
|
cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
}
  if (ago == 0) rebuild = 1;

  // build the cell list, either for the first time or from scratch after
  // a rebuild was triggered
  if (first_build || rebuild) {
    first_build = 0;
    cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));
    // initialize d_cell_idx for the sorting routine
    kernel_set_cell_list<<<ncell, buffer>>>(cell_list_gpu.idx);

    // bin every atom into its cell
    dim3 blockDim(128);
    dim3 gridDim(static_cast<int>(ceil(static_cast<double>(nall)/blockDim.x)));
    kernel_build_cell_list<<<gridDim, blockDim>>>(cell_list_gpu.pos,
                                                  cell_list_gpu.idx,
                                                  cell_list_gpu.type,
                                                  cell_list_gpu.natom,
                                                  d_pos, d_type, inum, nall);

    // check cell list overflow: some cell received more atoms than fit
    int overflow;
    int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/BLOCK_1D));
    kernel_test_overflow<<<gDimCell, BLOCK_1D>>>(cell_list_gpu.natom,
                                                 d_overflow, ncell);
    cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
    if (overflow > 0) {
      printf("\n\nBLOCK_1D too small for cell list, please increase it!\n\n");
      exit(1);
    }

    // sort atoms in every cell by atom index so per-cell sums are always
    // accumulated in the same order, avoiding floating-point associativity
    // problems between rebuilds
    sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos,
               cell_list_gpu.type, ncell*buffer);

    cudaThreadSynchronize();
  }
}
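
// The index sort above matters because float addition is not associative:
// a different atom ordering after each rebuild would make accumulated
// forces drift run to run. A two-line host-code illustration of the
// underlying effect (not from this file):
//
//   #include <cstdio>
//   int main() {
//     float a = 1.0e8f, b = -1.0e8f, c = 1.0f;
//     printf("%g vs %g\n", (a + b) + c, a + (b + c));  // prints 1 vs 0
//     return 0;
//   }
//
// (a + b) + c cancels first and keeps c exactly; a + (b + c) rounds c away
// against the large magnitude of b, so the two groupings differ by 1.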
62
lib/gpu/pair_gpu_cell.h
Normal file
@ -0,0 +1,62 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef PAIR_GPU_CELL_H
#define PAIR_GPU_CELL_H

typedef struct {
  float3 *pos;        // per-cell packed atom positions
  unsigned int *idx;  // per-cell atom indices
  int *type;          // per-cell atom types
  int *natom;         // number of atoms in each cell
} cell_list;

__global__ void kernel_set_cell_list(unsigned int *cell_idx);
__global__ void kernel_build_cell_list(float3 *cell_list,
                                       unsigned int *cell_idx,
                                       int *cell_type,
                                       int *cell_atom,
                                       float3 *pos,
                                       int *type,
                                       const int inum,
                                       const int nall);
__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebuild);
__global__ void kernel_copy_list(float3 *cell_list,
                                 unsigned int *cell_idx,
                                 int *cell_atom,
                                 float3 *pos);
__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell);
void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size);

void init_cell_list_const(double cell_size, double skin,
                          double *boxlo, double *boxhi);
void init_cell_list(cell_list &cell_list_gpu,
                    const int nall,
                    const int ncell,
                    const int buffer);

void build_cell_list(double *atom_pos, int *atom_type,
                     cell_list &cell_list_gpu,
                     const int ncell, const int ncell1D, const int buffer,
                     const int inum, const int nall, const int ago);

void clear_cell_list(cell_list &cell_list_gpu);

#endif
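
// Taken together, the header exposes a small lifecycle: allocate once,
// rebuild as the neighbor "ago" counter dictates, free at teardown. A
// hedged sketch of a caller follows; the names match the declarations
// above, but the concrete values (cells per edge, atoms per cell) are
// illustrative only, not defaults from this library.
//
//   // hypothetical driver, assuming positions in a double[3*nall] array
//   void cell_list_demo(double *x, int *type, int inum, int nall, int ago) {
//     const int ncell1D = 10;                  // cells per box edge (example)
//     const int ncell = ncell1D*ncell1D*ncell1D;
//     const int buffer = 64;                   // max atoms per cell (example)
//     cell_list cl;
//     init_cell_list(cl, nall, ncell, buffer);
//     build_cell_list(x, type, cl, ncell, ncell1D, buffer, inum, nall, ago);
//     // ... launch pair kernels that consume cl.pos / cl.idx / cl.type ...
//     clear_cell_list(cl);
//   }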
lib/gpu/pair_gpu_nbor.cu
@ -1,25 +1,21 @@
-/***************************************************************************
-                               pair_gpu_nbor.cu
-                             -------------------
-                               W. Michael Brown
-
-  Neighbor memory operations for LAMMPS GPU Library
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Tue Aug 4 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-   ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */
 
 #include "pair_gpu_nbor.h"
lib/gpu/pair_gpu_nbor.h
@ -1,25 +1,21 @@
-/***************************************************************************
-                               pair_gpu_nbor.h
-                             -------------------
-                               W. Michael Brown
-
-  Neighbor memory operations for LAMMPS GPU Library
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Tue Aug 4 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-   ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */
 
 #ifndef PAIR_GPU_NBOR_H
 #define PAIR_GPU_NBOR_H
lib/gpu/pair_gpu_texture.h
@ -1,25 +1,21 @@
-/***************************************************************************
-                              pair_gpu_texture.h
-                             -------------------
-                               W. Michael Brown
-
-  Tricks for templating textures
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Tue Jun 23 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-   ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */
 
 #include "nvc_traits.h"
 #include "nvc_memory.h"
lib/gpu/pair_tex_tar.cu
@ -1,26 +1,21 @@
-/***************************************************************************
-                               pair_tex_tar.cu
-                             -------------------
-                               W. Michael Brown
-
-  "Tar" of header and source files that need texture reference definitions
-  within file scope.
-
- __________________________________________________________________________
-    This file is part of the LAMMPS GPU Library
- __________________________________________________________________________
-
-    begin                : Tue Jun 23 2009
-    copyright            : (C) 2009 by W. Michael Brown
-    email                : wmbrown@sandia.gov
- ***************************************************************************/
-
-/* -----------------------------------------------------------------------
-   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-   ----------------------------------------------------------------------- */
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
+                         Peng Wang (Nvidia), penwang@nvidia.com
+                         Paul Crozier (SNL), pscrozi@sandia.gov
+------------------------------------------------------------------------- */
 
 #include "pair_gpu_atom.cu"
 #include "lj_gpu.cu"