From 27b1805916abf6d8697afe56d278d8dfbc54decd Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 10 Aug 2009 20:28:57 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3042 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Makefile | 75 --- lib/gpu/README | 94 ---- lib/gpu/gb_gpu.cu | 518 -------------------- lib/gpu/gb_gpu_extra.h | 337 ------------- lib/gpu/gb_gpu_kernel.h | 861 ---------------------------------- lib/gpu/gb_gpu_memory.cu | 146 ------ lib/gpu/gb_gpu_memory.h | 74 --- lib/gpu/lj_gpu.cu | 234 --------- lib/gpu/lj_gpu_kernel.h | 248 ---------- lib/gpu/lj_gpu_memory.cu | 167 ------- lib/gpu/lj_gpu_memory.h | 88 ---- lib/gpu/nvc_device.cu | 96 ---- lib/gpu/nvc_device.h | 92 ---- lib/gpu/nvc_get_devices.cu | 31 -- lib/gpu/nvc_macros.h | 121 ----- lib/gpu/nvc_memory.h | 447 ------------------ lib/gpu/nvc_timer.h | 83 ---- lib/gpu/pair_gpu_atom.cu | 173 ------- lib/gpu/pair_gpu_atom.h | 151 ------ lib/gpu/pair_gpu_nbor.cu | 80 ---- lib/gpu/pair_gpu_nbor.h | 91 ---- lib/gpu/pair_gpu_texture.h | 292 ------------ src/GPU/Install.csh | 34 -- src/GPU/pair_gayberne_gpu.cpp | 369 --------------- src/GPU/pair_gayberne_gpu.h | 43 -- src/GPU/pair_lj_cut_gpu.cpp | 270 ----------- src/GPU/pair_lj_cut_gpu.h | 40 -- src/GPU/style_gpu.h | 41 -- 28 files changed, 5296 deletions(-) delete mode 100644 lib/gpu/Makefile delete mode 100644 lib/gpu/README delete mode 100644 lib/gpu/gb_gpu.cu delete mode 100644 lib/gpu/gb_gpu_extra.h delete mode 100644 lib/gpu/gb_gpu_kernel.h delete mode 100644 lib/gpu/gb_gpu_memory.cu delete mode 100644 lib/gpu/gb_gpu_memory.h delete mode 100644 lib/gpu/lj_gpu.cu delete mode 100644 lib/gpu/lj_gpu_kernel.h delete mode 100644 lib/gpu/lj_gpu_memory.cu delete mode 100644 lib/gpu/lj_gpu_memory.h delete mode 100644 lib/gpu/nvc_device.cu delete mode 100644 lib/gpu/nvc_device.h delete mode 100644 lib/gpu/nvc_get_devices.cu delete mode 100644 lib/gpu/nvc_macros.h delete mode 100644 lib/gpu/nvc_memory.h delete mode 100644 lib/gpu/nvc_timer.h delete mode 100644 lib/gpu/pair_gpu_atom.cu delete mode 100644 lib/gpu/pair_gpu_atom.h delete mode 100644 lib/gpu/pair_gpu_nbor.cu delete mode 100644 lib/gpu/pair_gpu_nbor.h delete mode 100644 lib/gpu/pair_gpu_texture.h delete mode 100644 src/GPU/Install.csh delete mode 100644 src/GPU/pair_gayberne_gpu.cpp delete mode 100644 src/GPU/pair_gayberne_gpu.h delete mode 100644 src/GPU/pair_lj_cut_gpu.cpp delete mode 100644 src/GPU/pair_lj_cut_gpu.h delete mode 100644 src/GPU/style_gpu.h diff --git a/lib/gpu/Makefile b/lib/gpu/Makefile deleted file mode 100644 index 08e4982a24..0000000000 --- a/lib/gpu/Makefile +++ /dev/null @@ -1,75 +0,0 @@ -#*************************************************************************** -# Makefile -# ------------------- -# W. Michael Brown -# -# _________________________________________________________________________ -# Build for the LAMMPS GPU Force Library -# -# _________________________________________________________________________ -# -# begin : Tue June 23 2009 -# copyright : (C) 2009 by W. Michael Brown -# email : wmbrown@sandia.gov -# ***************************************************************************/ - -BIN_DIR = . -OBJ_DIR = . 
-AR = ar -CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math -CUDA_ARCH = -maxrregcount 128 #-arch=sm_13 -CUDA_PREC = -D_SINGLE_SINGLE -CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB) - -CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC) - -CUDA_LIB = $(OBJ_DIR)/pair_gpu_lib.a - -# Headers for CUDA Stuff -NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h -# Headers for Pair Stuff -PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h - -ALL_H = $(NVC_H) $(PAIR_H) - -EXECS = $(BIN_DIR)/nvc_get_devices -OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/gb_gpu.cu_o \ - $(OBJ_DIR)/gb_gpu_memory.cu_o $(OBJ_DIR)/lj_gpu.cu_o \ - $(OBJ_DIR)/lj_gpu_memory.cu_o $(OBJ_DIR)/pair_gpu_atom.cu_o \ - $(OBJ_DIR)/pair_gpu_nbor.cu_o - -all: $(CUDA_LIB) $(EXECS) - -$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H) - $(CUDA) -o $@ -c nvc_device.cu - -$(OBJ_DIR)/pair_gpu_atom.cu_o: pair_gpu_atom.cu pair_gpu_texture.h pair_gpu_atom.h $(NVC_H) - $(CUDA) -o $@ -c pair_gpu_atom.cu - -$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H) - $(CUDA) -o $@ -c pair_gpu_nbor.cu - -$(OBJ_DIR)/lj_gpu_memory.cu_o: lj_gpu_memory.cu lj_gpu_memory.h $(ALL_H) - $(CUDA) -o $@ -c lj_gpu_memory.cu - -$(OBJ_DIR)/lj_gpu.cu_o: lj_gpu.cu $(ALL_H) lj_gpu_memory.h lj_gpu_kernel.h - $(CUDA) -o $@ -c lj_gpu.cu - -$(OBJ_DIR)/gb_gpu_memory.cu_o: gb_gpu_memory.cu gb_gpu_memory.h $(ALL_H) - $(CUDA) -o $@ -c gb_gpu_memory.cu - -$(OBJ_DIR)/gb_gpu.cu_o: gb_gpu.cu $(ALL_H) gb_gpu_memory.h gb_gpu_kernel.h gb_gpu_extra.h - $(CUDA) -o $@ -c gb_gpu.cu - -$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o - $(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o - -$(CUDA_LIB): $(OBJS) - $(AR) -crusv $(CUDA_LIB) $(OBJS) - -clean: - rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo - -veryclean: clean - rm -rf *~ *.linkinfo - diff --git a/lib/gpu/README b/lib/gpu/README deleted file mode 100644 index 17f6e5457d..0000000000 --- a/lib/gpu/README +++ /dev/null @@ -1,94 +0,0 @@ -/*************************************************************************** - README - ------------------- - W. Michael Brown - - README for building LAMMPS GPU Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - - - GENERAL NOTES - -This library, pair_gpu_lib.a, provides routines for GPGPU acceleration of -LAMMPS pair styles. Currently, only CUDA enabled GPUs are supported. -Compilation of this library requires installing the CUDA GPU driver and -CUDA toolkit for your operating system. In addition to the LAMMPS library, -the binary nvc_get_devices will also be built. This can be used to query -the names and properties of GPU devices on your system. - -NOTE: Installation of the CUDA SDK is not required. 
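For orientation, the kind of device query that nvc_get_devices performs can be sketched with the standard CUDA runtime API. The program below is an illustrative stand-alone sketch, not the library's actual nvc_get_devices.cu source; it reports the same sort of properties (name, processor count, memory, clock) that the library's name strings expose.

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int n = 0;
  // Count CUDA-capable devices; failure or zero means no usable GPU/driver.
  if (cudaGetDeviceCount(&n) != cudaSuccess || n == 0) {
    std::printf("No CUDA-enabled GPU found.\n");
    return 1;
  }
  for (int i = 0; i < n; ++i) {
    cudaDeviceProp p;
    cudaGetDeviceProperties(&p, i);
    // Memory is reported in GB; clockRate is stored in kHz, printed as GHz.
    std::printf("Device %d: %s, %d multiprocessors, %.2f GB, %.2f GHz\n",
                i, p.name, p.multiProcessorCount,
                p.totalGlobalMem / 1073741824.0, p.clockRate / 1.0e6);
  }
  return 0;
}

Compile with something like "nvcc -o device_query device_query.cu" (the file name is arbitrary).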
- -Current pair styles supporting GPU Accelartion: - - 1. lj/cut/gpu - 2. gayberne/gpu - - - - MULTIPLE LAMMPS PROCESSES - -When using GPGPU acceleration, you are restricted to one physical GPU per -LAMMPS process. This can be multiple GPUs on a single node or across multiple -nodes. Intructions on GPU assignment can be found in the LAMMPS documentation. - - - - SPEEDUPS - -The speedups that can be obtained using this library are highly -dependent on the GPU architecture and the computational expense of -the pair potential. When comparing a single precision Tesla C1060 run to a -serial Intel Xeon 5140 2.33 GHz serial run, the speedup is ~4.42x for lj/cut -with a cutoff of 2.5. For gayberne with a cutoff of 7, the speedup is >103x -for 8000 particles. The speedup will improve with an increase in the -number of particles or an increase in the cutoff. - - - - BUILDING AND PRECISION MODES - -To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK files for -your machine. Type make. Additionally, the GPU package must be installed and -compiled for LAMMPS. The library supports 3 precision modes as determined by -the CUDA_PREC variable: - - CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations - CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations - CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double - -NOTE: Double precision is only supported on certain GPUS (with - compute capability>=1.3). - -NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE - package has been installed before installing the GPU package in LAMMPS. - - - - GPU MEMORY - -Upon initialization of the pair style, the library will reserve memory for -64K atoms per GPU or 70% of each cards GPU memory, whichever value is limiting. -The value of 70% can be changed by editing the PERCENT_GPU_MEMORY definition -in the source file. The value of 64K cannot be increased and is the maximum -number of atoms allowed per GPU. Using the 'neigh_modify one' modifier in -your LAMMPS input script can help to increase maximum number of atoms per -GPU for cards with limited memory. - - - - diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu deleted file mode 100644 index cde56d569a..0000000000 --- a/lib/gpu/gb_gpu.cu +++ /dev/null @@ -1,518 +0,0 @@ -/*************************************************************************** - gb_gpu.cu - ------------------- - W. Michael Brown - - Gay-Berne anisotropic potential GPU calcultation - - *** Force decomposition by Atom Version *** - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#include -#include -#include "nvc_macros.h" -#include "nvc_timer.h" -#include "nvc_device.h" -#include "gb_gpu_memory.h" -#include "gb_gpu_kernel.h" - -using namespace std; - -static GB_GPU_Memory GBMF[MAX_GPU_THREADS]; -#define GBMT GB_GPU_Memory - -// --------------------------------------------------------------------------- -// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access -// -- Only pack neighbors matching the specified inclusive range of forms -// -- Only pack neighbors within cutoff -// --------------------------------------------------------------------------- -template -__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch, - const int start, const int inum, - const int *dev_ij, const int form_low, - const int form_high, const int nall) { - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,7); - - int newj=0; - for ( ; list=nall) - j%=nall; - int jtype=_x_(j,7); - - if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) { - // Compute r12; - numtyp rsq=_x_(j,0)-ix; - rsq*=rsq; - numtyp t=_x_(j,1)-iy; - rsq+=t*t; - t=_x_(j,2)-iz; - rsq+=t*t; - - if (rsq< _cutsq_(itype,jtype)) { - *nbor=j; - nbor+=nbor_pitch; - newj++; - } - } - } - *nbor_newj=newj; - } -} - -// --------------------------------------------------------------------------- -// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access -// -- Only pack neighbors matching the specified inclusive range of forms -// -- Only pack neighbors within cutoff -// -- Fast version of routine that uses shared memory for LJ constants -// --------------------------------------------------------------------------- -template -__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch, - const int start, const int inum, - const int *dev_ij, const int form_low, - const int form_high, const int nall) { - - int ii=threadIdx.x; - __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii(itype,jtype); - form[ii]=_form_(itype,jtype); - } - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,7)); - - int newj=0; - for ( ; list=nall) - j%=nall; - int mtype=itype+_x_(j,7); - - if (form[mtype]>=form_low && form[mtype]<=form_high) { - // Compute r12; - numtyp rsq=_x_(j,0)-ix; - rsq*=rsq; - numtyp t=_x_(j,1)-iy; - rsq+=t*t; - t=_x_(j,2)-iz; - rsq+=t*t; - - if (rsq -void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, - const int inum, const int form_low, const int form_high) { - if (gbm.shared_types) - kernel_pack_nbor_fast<<>> - (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, - gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); - else - kernel_pack_nbor<<>> - (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, - gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); -} - -// --------------------------------------------------------------------------- -// Convert something to a string -// --------------------------------------------------------------------------- -#include -template -inline string gb_gpu_toa(const t& in) { - ostringstream o; - o.precision(2); - o << in; - return o.str(); -} - -// 
--------------------------------------------------------------------------- -// Return string with GPU info -// --------------------------------------------------------------------------- -string gb_gpu_name(const int id, const int max_nbors) { - string name=GBMF[0].gpu.name(id)+", "+ - gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+ - gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+ - gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+ - gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id), - max_nbors))+" Atoms"; - return name; -} - -// --------------------------------------------------------------------------- -// Allocate memory on host and device and copy constants to device -// --------------------------------------------------------------------------- -int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int max_nbors, const int thread, const int gpu_id) { - assert(thread -inline void _gb_gpu_atom(PairGPUAtom &atom, double **host_x, - double **host_quat, const int *host_type, - const bool rebuild, cudaStream_t &stream) { - atom.time_atom.start(); - atom.reset_write_buffer(); - - // Rows 1-3 of dev_x are position; rows 4-7 are quaternion - atom.add_atom_data(host_x[0],3); - atom.add_atom_data(host_x[0]+1,3); - atom.add_atom_data(host_x[0]+2,3); - atom.add_atom_data(host_quat[0],4); - atom.add_atom_data(host_quat[0]+1,4); - atom.add_atom_data(host_quat[0]+2,4); - atom.add_atom_data(host_quat[0]+3,4); - - int csize=7; - - // If a rebuild occured, copy type data - if (rebuild) { - atom.add_atom_data(host_type); - csize++; - } - - atom.copy_atom_data(csize,stream); - atom.time_atom.stop(); -} - -void gb_gpu_atom(double **host_x, double **host_quat, - const int *host_type, const bool rebuild, const int thread) { - _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild, - GBMF[thread].pair_stream); -} - -// --------------------------------------------------------------------------- -// Signal that we need to transfer a new neighbor list -// --------------------------------------------------------------------------- -template -int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal, - const int inum, int *ilist, const int *numj, - const int *type, bool &success) { - if (nall>gbm.max_atoms) { - success=false; - return 0; - } - success=true; - - gbm.nbor.time_nbor.start(); - - gbm.atom.nall(nall); - gbm.atom.inum(inum); - - if (gbm.multiple_forms) { - int p=0, acc=0; - for (int i=0; i -void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) { - gbm.nbor.time_nbor.add_to_total(); - // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed - - gbm.nbor.time_nbor.start(); - gbm.nbor.add(num_ij,gbm.pair_stream); - gbm.nbor.time_nbor.stop(); -} - -void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) { - _gb_gpu_nbors(GBMF[thread],num_ij,eflag); -} - -// --------------------------------------------------------------------------- -// Calculate energies, forces, and torques for all ij interactions -// --------------------------------------------------------------------------- -template -void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag, - const bool rebuild) { - // Compute the block size and grid size to keep all 
cores busy - const int BX=BLOCK_1D; - - int GX=static_cast(ceil(static_cast(gbm.atom.inum())/BX)); - - if (gbm.multiple_forms) { - gbm.time_kernel.start(); - if (gbm.last_ellipse>0) { - // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- - GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ - static_cast(BX))); - pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE); - gbm.time_kernel.stop(); - - gbm.time_gayberne.start(); - kernel_gayberne<<>> - (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), - eflag, vflag, gbm.last_ellipse, gbm.atom.nall()); - gbm.time_gayberne.stop(); - - if (gbm.last_ellipse==gbm.atom.inum()) { - gbm.time_kernel2.start(); - gbm.time_kernel2.stop(); - gbm.time_gayberne2.start(); - gbm.time_gayberne2.stop(); - gbm.time_pair.start(); - gbm.time_pair.stop(); - return; - } - - // ------------ SPHERE_ELLIPSE --------------- - - gbm.time_kernel2.start(); - GX=static_cast(ceil(static_cast(gbm.atom.inum()- - gbm.last_ellipse)/BX)); - pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE, - ELLIPSE_SPHERE); - gbm.time_kernel2.stop(); - - gbm.time_gayberne2.start(); - kernel_sphere_gb<<>> - (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), - eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); - gbm.time_gayberne2.stop(); - } else { - gbm.atom.ans.zero(); - gbm.time_kernel.stop(); - gbm.time_gayberne.start(); - gbm.time_gayberne.stop(); - gbm.time_kernel2.start(); - gbm.time_kernel2.stop(); - gbm.time_gayberne2.start(); - gbm.time_gayberne2.stop(); - } - - // ------------ LJ --------------- - gbm.time_pair.start(); - if (gbm.last_ellipse<<>> - (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), - gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(), - gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, - gbm.atom.inum(), gbm.atom.nall()); - else - kernel_lj<<>> - (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), - gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), - eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); - } - gbm.time_pair.stop(); - } else { - gbm.time_kernel.start(); - pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE); - gbm.time_kernel.stop(); - - gbm.time_gayberne.start(); - kernel_gayberne<<>> - (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), - gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), - gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(), - eflag, vflag, gbm.atom.inum(), gbm.atom.nall()); - gbm.time_gayberne.stop(); - } -} - -void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, - const int thread) { - _gb_gpu_gayberne(GBMF[thread],eflag,vflag,rebuild); -} - -// --------------------------------------------------------------------------- -// Get energies, forces, and torques to host -// --------------------------------------------------------------------------- -template -double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial) { - double evdw; - - 
gbm.atom.time_answer.start(); - gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream); - - gbm.atom.time_atom.add_to_total(); - gbm.nbor.time_nbor.add_to_total(); - gbm.time_kernel.add_to_total(); - gbm.time_gayberne.add_to_total(); - if (gbm.multiple_forms) { - gbm.time_kernel2.add_to_total(); - gbm.time_gayberne2.add_to_total(); - gbm.time_pair.add_to_total(); - } - // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed - - evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); - gbm.atom.add_forces(ilist,f); - gbm.atom.add_torques(ilist,tor,gbm.last_ellipse); - gbm.atom.time_answer.stop(); - gbm.atom.time_answer.add_to_total(); - return evdw; -} - -double gb_gpu_forces(double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial, const int thread) { - return _gb_gpu_forces - (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom, - vflag_atom,eatom,vatom,virial); -} - -void gb_gpu_time(const int i) { - cout.precision(4); - cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds() - << " s.\n" - << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds() - << " s.\n" - << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+ - GBMF[i].time_kernel2.total_seconds() << " s.\n" - << "Force calc: " << GBMF[i].time_gayberne.total_seconds()+ - GBMF[i].time_gayberne2.total_seconds()<< " s.\n"; - if (GBMF[i].multiple_forms) - cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n"; - cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds() - << " s.\n"; -} - -int gb_gpu_num_devices() { - return GBMF[0].gpu.num_devices(); -} - -double gb_gpu_bytes() { - return GBMF[0].host_memory_usage(); -} diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h deleted file mode 100644 index c918e07070..0000000000 --- a/lib/gpu/gb_gpu_extra.h +++ /dev/null @@ -1,337 +0,0 @@ -/*************************************************************************** - gb_gpu_extra.h - ------------------- - W. Michael Brown - - Inline GPU kernel routines ala math_extra for the CPU. - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#ifndef GB_GPU_EXTRA_H -#define GB_GPU_EXTRA_H - -#include "math.h" -#include "stdio.h" -#include "string.h" - -/* ---------------------------------------------------------------------- - Atomic update of global memory -------------------------------------------------------------------------- */ -/* -template __device__ -inline void atomicAdd(numtyp *address, numtyp val); - -template <> -__device__ inline void atomicAdd(float *address, float val) -{ - int i_val = __float_as_int(val); - int tmp0 = 0; - int tmp1; - - while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) { - tmp0 = tmp1; - i_val = __float_as_int(val + __int_as_float(tmp1)); - } -}*/ - -/* ---------------------------------------------------------------------- - dot product of 2 vectors -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) -{ - return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2]; -} - -/* ---------------------------------------------------------------------- - cross product of 2 vectors -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_cross3(const numtyp *v1, - const numtyp *v2, numtyp *ans) -{ - ans[0] = v1[1]*v2[2]-v1[2]*v2[1]; - ans[1] = v1[2]*v2[0]-v1[0]*v2[2]; - ans[2] = v1[0]*v2[1]-v1[1]*v2[0]; -} - -/* ---------------------------------------------------------------------- - determinant of a matrix -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ numtyp gpu_det3(const numtyp m[9]) -{ - numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - - m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + - m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; - return ans; -} - -/* ---------------------------------------------------------------------- - diagonal matrix times a full matrix -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9], - numtyp ans[9]) -{ - ans[0] = _well_(i,0)*m[0]; - ans[1] = _well_(i,0)*m[1]; - ans[2] = _well_(i,0)*m[2]; - ans[3] = _well_(i,1)*m[3]; - ans[4] = _well_(i,1)*m[4]; - ans[5] = _well_(i,1)*m[5]; - ans[6] = _well_(i,2)*m[6]; - ans[7] = _well_(i,2)*m[7]; - ans[8] = _well_(i,2)*m[8]; -} - -/* ---------------------------------------------------------------------- - diagonal matrix times a full matrix -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9], - numtyp ans[9]) -{ - ans[0] = _shape_(i,0)*m[0]; - ans[1] = _shape_(i,0)*m[1]; - ans[2] = _shape_(i,0)*m[2]; - ans[3] = _shape_(i,1)*m[3]; - ans[4] = _shape_(i,1)*m[4]; - ans[5] = _shape_(i,1)*m[5]; - ans[6] = _shape_(i,2)*m[6]; - ans[7] = _shape_(i,2)*m[7]; - ans[8] = _shape_(i,2)*m[8]; -} - -/* ---------------------------------------------------------------------- - add two matrices -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_plus3(const numtyp m[9], - const numtyp m2[9], numtyp ans[9]) -{ - ans[0] = m[0]+m2[0]; - ans[1] = m[1]+m2[1]; - ans[2] = m[2]+m2[2]; - ans[3] = m[3]+m2[3]; - ans[4] = m[4]+m2[4]; - ans[5] = m[5]+m2[5]; - ans[6] = m[6]+m2[6]; - ans[7] = m[7]+m2[7]; - ans[8] = m[8]+m2[8]; -} - -/* 
---------------------------------------------------------------------- - multiply the transpose of mat1 times mat2 -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9], - const numtyp m2[9], - numtyp ans[9]) -{ - ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6]; - ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7]; - ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8]; - ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6]; - ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7]; - ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8]; - ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6]; - ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7]; - ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8]; -} - -/* ---------------------------------------------------------------------- - row vector times matrix -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_row_times3(const numtyp *v, - const numtyp m[9], numtyp *ans) -{ - ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6]; - ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7]; - ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2]; -} - -/* ---------------------------------------------------------------------- - solve Ax = b or M ans = v - use gaussian elimination & partial pivoting on matrix - error_flag set to 2 if bad matrix inversion attempted -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_mldivide3(const numtyp m[9], - const numtyp *v, numtyp *ans, - int *error_flag) -{ - // create augmented matrix for pivoting - - numtyp aug[12], t; - - aug[3] = v[0]; - aug[0] = m[0]; - aug[1] = m[1]; - aug[2] = m[2]; - aug[7] = v[1]; - aug[4] = m[3]; - aug[5] = m[4]; - aug[6] = m[5]; - aug[11] = v[2]; - aug[8] = m[6]; - aug[9] = m[7]; - aug[10] = m[8]; - - if (fabs(aug[4]) > fabs(aug[0])) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; - swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; - swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; - swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; - } - if (fabs(aug[8]) > fabs(aug[0])) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; - swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; - swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; - swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; - } - - if (aug[0] != (numtyp)0.0) { - if (0!=0) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt; - swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt; - swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt; - swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt; - } - } else if (aug[4] != (numtyp)0.0) { - if (1!=0) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; - swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; - swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; - swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; - } - } else if (aug[8] != (numtyp)0.0) { - if (2!=0) { - numtyp swapt; - swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; - swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; - swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; - swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; - } - } else - *error_flag=2; - - t = aug[4]/aug[0]; - aug[5]-=t*aug[1]; - aug[6]-=t*aug[2]; - aug[7]-=t*aug[3]; - t = aug[8]/aug[0]; - aug[9]-=t*aug[1]; - aug[10]-=t*aug[2]; - aug[11]-=t*aug[3]; - - if (fabs(aug[9]) > fabs(aug[5])) { - numtyp swapt; - swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; - swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; - swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; - swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; - } - - if (aug[5] != 
(numtyp)0.0) { - if (1!=1) { - numtyp swapt; - swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt; - swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt; - swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt; - swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt; - } - } else if (aug[9] != (numtyp)0.0) { - if (2!=1) { - numtyp swapt; - swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; - swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; - swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; - swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; - } - } - - t = aug[9]/aug[5]; - aug[10]-=t*aug[6]; - aug[11]-=t*aug[7]; - - if (aug[10] == (numtyp)0.0) - *error_flag=2; - - ans[2] = aug[11]/aug[10]; - t = (numtyp)0.0; - t += aug[6]*ans[2]; - ans[1] = (aug[7]-t) / aug[5]; - t = (numtyp)0.0; - t += aug[1]*ans[1]; - t += aug[2]*ans[2]; - ans[0] = (aug[3]-t) / aug[0]; -} - -/* ---------------------------------------------------------------------- - compute rotation matrix from quaternion conjugate - quat = [w i j k] -------------------------------------------------------------------------- */ - -template -static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi, - numtyp mat[9]) -{ - numtyp qi3=_x_(qi,3); - numtyp qi4=_x_(qi,4); - numtyp qi5=_x_(qi,5); - numtyp qi6=_x_(qi,6); - - numtyp w2 = qi3*qi3; - numtyp i2 = qi4*qi4; - numtyp j2 = qi5*qi5; - numtyp k2 = qi6*qi6; - numtyp twoij = 2.0*qi4*qi5; - numtyp twoik = 2.0*qi4*qi6; - numtyp twojk = 2.0*qi5*qi6; - numtyp twoiw = 2.0*qi4*qi3; - numtyp twojw = 2.0*qi5*qi3; - numtyp twokw = 2.0*qi6*qi3; - - mat[0] = w2+i2-j2-k2; - mat[3] = twoij-twokw; - mat[6] = twojw+twoik; - - mat[1] = twoij+twokw; - mat[4] = w2-i2+j2-k2; - mat[7] = twojk-twoiw; - - mat[2] = twoik-twojw; - mat[5] = twojk+twoiw; - mat[8] = w2-i2-j2+k2; -} - -#endif diff --git a/lib/gpu/gb_gpu_kernel.h b/lib/gpu/gb_gpu_kernel.h deleted file mode 100644 index 8ca047e0e1..0000000000 --- a/lib/gpu/gb_gpu_kernel.h +++ /dev/null @@ -1,861 +0,0 @@ -/*************************************************************************** - gb_gpu_kernel.cu - ------------------- - W. Michael Brown - - Routines that actually perform the force/torque computation - - *** Force Decomposition by Atom Version *** - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Jun 23 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#ifndef GB_GPU_KERNEL -#define GB_GPU_KERNEL - -#include "gb_gpu_extra.h" - -template -static __inline__ __device__ void compute_eta_torque(numtyp m[9], - numtyp m2[9], - const int i, - numtyp ans[9]) -{ - numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- - m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- - m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; - den = (numtyp)1.0/den; - - numtyp shapex=_shape_(i,0); - numtyp shapey=_shape_(i,1); - numtyp shapez=_shape_(i,2); - - ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- - m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ - m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- - m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ - m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; - - ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ - (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- - (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- - m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ - m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; - - ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- - m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- - m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ - (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ - m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; - - ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ - m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ - m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- - m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- - m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; - - ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ - (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- - (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ - m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- - m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; - - ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- - m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ - (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ - m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- - (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; - - ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ - (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ - m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- - m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- - m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; - - ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- - (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ - (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- - m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ - m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; - - ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- - m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- - m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ - (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ - m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; -} - -template -__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj, - const int *dev_nbor, const int nbor_pitch, - acctyp *ans, size_t ans_pitch, int *err_flag, - const bool eflag, const bool vflag, - const int inum, const int nall) { - - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x); - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,7); - numtyp a1[9], b1[9], g1[9]; - { - numtyp t[9]; - gpu_quat_to_mat_trans(i,a1); - gpu_shape_times3(itype,a1,t); - gpu_transpose_times3(a1,t,g1); - gpu_well_times3(itype,a1,t); - gpu_transpose_times3(a1,t,b1); - } - - numtyp factor_lj; - for ( ; nbor(j,7); - - // Compute r12 - numtyp r12[3]; - r12[0] = _x_(j,0)-ix; - r12[1] = _x_(j,1)-iy; - r12[2] = _x_(j,2)-iz; - numtyp ir = gpu_dot3(r12,r12); - - ir = rsqrt(ir); - numtyp r = (numtyp)1.0/ir; - - numtyp a2[9]; - gpu_quat_to_mat_trans(j,a2); - - 
numtyp u_r, dUr[3], tUr[3], eta, teta[3]; - { // Compute U_r, dUr, eta, and teta - // Compute g12 - numtyp g12[9]; - { - numtyp g2[9]; - { - gpu_shape_times3(jtype,a2,g12); - gpu_transpose_times3(a2,g12,g2); - gpu_plus3(g1,g2,g12); - } - - { // Compute U_r and dUr - - // Compute kappa - numtyp kappa[3]; - gpu_mldivide3(g12,r12,kappa,err_flag); - - // -- replace r12 with r12 hat - r12[0]*=ir; - r12[1]*=ir; - r12[2]*=ir; - - // -- kappa is now / r - kappa[0]*=ir; - kappa[1]*=ir; - kappa[2]*=ir; - - // energy - - // compute u_r and dUr - numtyp uslj_rsq; - { - // Compute distance of closest approach - numtyp h12, sigma12; - sigma12 = gpu_dot3(r12,kappa); - sigma12 = rsqrt((numtyp)0.5*sigma12); - h12 = r-sigma12; - - // -- kappa is now ok - kappa[0]*=r; - kappa[1]*=r; - kappa[2]*=r; - - numtyp sigma = _sigma_(itype,jtype); - numtyp epsilon = _epsilon_(itype,jtype); - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12); - uslj_rsq = uslj_rsq*ir*ir; - - dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); - dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); - dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); - } - - // torque for particle 1 - { - numtyp tempv[3], tempv2[3]; - tempv[0] = -uslj_rsq*kappa[0]; - tempv[1] = -uslj_rsq*kappa[1]; - tempv[2] = -uslj_rsq*kappa[2]; - gpu_row_times3(kappa,g1,tempv2); - gpu_cross3(tempv,tempv2,tUr); - } - } - } - - // Compute eta - { - eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - - // Compute teta - numtyp temp[9], tempv[3], tempv2[3]; - compute_eta_torque(g12,a1,itype,temp); - numtyp temp1 = -eta*gum[1]; - - tempv[0] = temp1*temp[0]; - tempv[1] = temp1*temp[1]; - tempv[2] = temp1*temp[2]; - gpu_cross3(a1,tempv,tempv2); - teta[0] = tempv2[0]; - teta[1] = tempv2[1]; - teta[2] = tempv2[2]; - - tempv[0] = temp1*temp[3]; - tempv[1] = temp1*temp[4]; - tempv[2] = temp1*temp[5]; - gpu_cross3(a1+3,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - - tempv[0] = temp1*temp[6]; - tempv[1] = temp1*temp[7]; - tempv[2] = temp1*temp[8]; - gpu_cross3(a1+6,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - } - - numtyp chi, dchi[3], tchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b2[9], b12[9]; - { - gpu_well_times3(jtype,a2,b12); - gpu_transpose_times3(a2,b12,b2); - gpu_plus3(b1,b2,b12); - } - - // compute chi_12 - r12[0]*=r; - r12[1]*=r; - r12[2]*=r; - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - r12[0]*=ir; - r12[1]*=ir; - r12[2]*=ir; - chi = gpu_dot3(r12,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12[0]); - dchi[1] = temp2*(iota[1]-temp1*r12[1]); - dchi[2] = temp2*(iota[2]-temp1*r12[2]); - - // compute t_chi - numtyp tempv[3]; - gpu_row_times3(iota,b1,tempv); - gpu_cross3(tempv,iota,tchi); - temp1 = (numtyp)-4.0*ir*ir; - tchi[0] *= temp1; - tchi[1] *= 
temp1; - tchi[2] *= temp1; - } - - numtyp temp2 = factor_lj*eta*chi; - if (eflag) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag) { - r12[0]*=-r; - r12[1]*=-r; - r12[2]*=-r; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - fx+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - fy+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - fz+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - fx+=temp1*dchi[0]-temp2*dUr[0]; - fy+=temp1*dchi[1]-temp2*dUr[1]; - fz+=temp1*dchi[2]-temp2*dUr[2]; - } - - // Torque on 1 - temp1 = -u_r*eta*factor_lj; - temp2 = -u_r*chi*factor_lj; - numtyp temp3 = -chi*eta*factor_lj; - torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; - tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; - torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - - } // for nbor - - // Store answers - acctyp *ap1=ans+ii; - if (eflag) { - *ap1=energy; - ap1+=ans_pitch; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1+=ans_pitch; - } - } - *ap1=fx; - ap1+=ans_pitch; - *ap1=fy; - ap1+=ans_pitch; - *ap1=fz; - ap1+=ans_pitch; - *ap1=torx; - ap1+=ans_pitch; - *ap1=tory; - ap1+=ans_pitch; - *ap1=torz; - - } // if ii -} - -template -__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj, - const int *dev_nbor, const int nbor_pitch, - acctyp *ans, size_t ans_pitch, int *err_flag, - const bool eflag, const bool vflag, - const int start, const int inum, - const int nall) { - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,7); - - numtyp oner=_shape_(itype,0); - numtyp one_well=_well_(itype,0); - - numtyp factor_lj; - for ( ; nbor(j,7); - - // Compute r12 - numtyp r12[3]; - r12[0] = _x_(j,0)-ix; - r12[1] = _x_(j,1)-iy; - r12[2] = _x_(j,2)-iz; - numtyp ir = gpu_dot3(r12,r12); - - ir = rsqrt(ir); - numtyp r = (numtyp)1.0/ir; - - numtyp r12hat[3]; - r12hat[0]=r12[0]*ir; - r12hat[1]=r12[1]*ir; - r12hat[2]=r12[2]*ir; - - numtyp a2[9]; - gpu_quat_to_mat_trans(j,a2); - - numtyp u_r, dUr[3], eta; - { // Compute U_r, dUr, eta, and teta - // Compute g12 - numtyp g12[9]; - { - { - numtyp g2[9]; - gpu_shape_times3(jtype,a2,g12); - gpu_transpose_times3(a2,g12,g2); - g12[0]=g2[0]+oner; - g12[4]=g2[4]+oner; - g12[8]=g2[8]+oner; - g12[1]=g2[1]; - g12[2]=g2[2]; - g12[3]=g2[3]; - g12[5]=g2[5]; - g12[6]=g2[6]; - g12[7]=g2[7]; - } - - { // Compute U_r and dUr - - // Compute kappa - numtyp kappa[3]; - gpu_mldivide3(g12,r12,kappa,err_flag); - - // -- kappa is now / r - kappa[0]*=ir; - kappa[1]*=ir; - kappa[2]*=ir; - - // energy - - // compute u_r and dUr - numtyp uslj_rsq; - { - // Compute distance of closest approach - numtyp h12, sigma12; - sigma12 = gpu_dot3(r12hat,kappa); - sigma12 = rsqrt((numtyp)0.5*sigma12); - h12 = r-sigma12; - - // -- kappa is now ok - kappa[0]*=r; - kappa[1]*=r; - kappa[2]*=r; - - numtyp sigma = _sigma_(itype,jtype); - numtyp epsilon = _epsilon_(itype,jtype); - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12hat); - 
uslj_rsq = uslj_rsq*ir*ir; - - dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]); - dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]); - dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]); - } - } - } - - // Compute eta - { - eta = (numtyp)2.0*_lshape_(itype)*_lshape_(jtype); - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - } - - numtyp chi, dchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b12[9]; - { - numtyp b2[9]; - gpu_well_times3(jtype,a2,b12); - gpu_transpose_times3(a2,b12,b2); - b12[0]=b2[0]+one_well; - b12[4]=b2[4]+one_well; - b12[8]=b2[8]+one_well; - b12[1]=b2[1]; - b12[2]=b2[2]; - b12[3]=b2[3]; - b12[5]=b2[5]; - b12[6]=b2[6]; - b12[7]=b2[7]; - } - - // compute chi_12 - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - chi = gpu_dot3(r12hat,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12hat); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12hat[0]); - dchi[1] = temp2*(iota[1]-temp1*r12hat[1]); - dchi[2] = temp2*(iota[2]-temp1*r12hat[2]); - } - - numtyp temp2 = factor_lj*eta*chi; - if (eflag) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag) { - r12[0]*=-1; - r12[1]*=-1; - r12[2]*=-1; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - fx+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - fy+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - fz+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - fx+=temp1*dchi[0]-temp2*dUr[0]; - fy+=temp1*dchi[1]-temp2*dUr[1]; - fz+=temp1*dchi[2]-temp2*dUr[2]; - } - } // for nbor - - // Store answers - acctyp *ap1=ans+ii; - if (eflag) { - *ap1=energy; - ap1+=ans_pitch; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1+=ans_pitch; - } - } - *ap1=fx; - ap1+=ans_pitch; - *ap1=fy; - ap1+=ans_pitch; - *ap1=fz; - } // if ii -} - -template -__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, - const int *dev_ij, const int nbor_pitch, acctyp *ans, - size_t ans_pitch, int *err_flag, const bool eflag, - const bool vflag, const int start, const int inum, - const int nall) { - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,7); - - numtyp factor_lj; - for ( ; list(j,7); - - // Compute r12 - numtyp delx = ix-_x_(j,0); - numtyp dely = iy-_x_(j,1); - numtyp delz = iz-_x_(j,2); - numtyp r2inv = delx*delx+dely*dely+delz*delz; - - if (r2inv<_cutsq_(itype,jtype) && - _form_(itype,jtype)==SPHERE_SPHERE) { - r2inv=(numtyp)1.0/r2inv; - numtyp r6inv = r2inv*r2inv*r2inv; - numtyp force = r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv- - _lj1_(itype,jtype).y); - force*=factor_lj; - - fx+=delx*force; - fy+=dely*force; - fz+=delz*force; - - if (eflag) { - numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv- - _lj3_(itype,jtype).y); - energy+=factor_lj*(e-_offset_(1,1)); - } - if (vflag) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - - // Store answers 
- acctyp *ap1=ans+ii; - if (eflag) { - *ap1+=energy; - ap1+=ans_pitch; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1+=virial[i]; - ap1+=ans_pitch; - } - } - *ap1+=fx; - ap1+=ans_pitch; - *ap1+=fy; - ap1+=ans_pitch; - *ap1+=fz; - - } // if ii -} - -template -__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor, - const int *dev_ij, const int nbor_pitch, - acctyp *ans, size_t ans_pitch,int *err_flag, - const bool eflag, const bool vflag, - const int start, const int inum, const int nall){ - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - __shared__ numtyp sp_lj[4]; - __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - if (ii(itype,jtype); - form[ii]=_form_(itype,jtype); - lj1[ii]=_lj1_(itype,jtype).x; - lj2[ii]=_lj1_(itype,jtype).y; - if (eflag) { - lj3[ii]=_lj3_(itype,jtype).x; - lj4[ii]=_lj3_(itype,jtype).y; - offset[ii]=_offset_(itype,jtype); - } - } - ii+=INT_MUL(blockIdx.x,blockDim.x)+start; - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,7)); - - numtyp factor_lj; - for ( ; list(j,7); - - // Compute r12 - numtyp delx = ix-_x_(j,0); - numtyp dely = iy-_x_(j,1); - numtyp delz = iz-_x_(j,2); - numtyp r2inv = delx*delx+dely*dely+delz*delz; - - if (r2inv - -template -GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() { - this->atom.atom_fields(8); - this->atom.ans_fields(13); - this->nbor.packing(true); -} - -template -GB_GPU_MemoryT::~GB_GPU_Memory() { - clear(); -} - -template -int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes, - const double gamma, const double upsilon, - const double mu, double **host_shape, - double **host_well, double **host_cutsq, - double **host_sigma, double **host_epsilon, - double *host_lshape, int **h_form, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int max_nbors, - const bool force_d, const int me) { - if (this->allocated) - clear(); - - LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon, - host_lj1, host_lj2, host_lj3, host_lj4, host_offset, - host_special_lj, max_nbors, me); - - host_form=h_form; - - // Initialize timers for the selected GPU - time_kernel.init(); - time_gayberne.init(); - time_kernel2.init(); - time_gayberne2.init(); - - // Use the write buffer from atom for data initialization - NVC_HostT &host_write=this->atom.host_write; - assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); - - // Allocate, cast and asynchronous memcpy of constant data - gamma_upsilon_mu.safe_alloc(3); - host_write[0]=static_cast(gamma); - host_write[1]=static_cast(upsilon); - host_write[2]=static_cast(mu); - gamma_upsilon_mu.copy_from_host(host_write.begin()); - - lshape.safe_alloc(ntypes); - lshape.cast_copy(host_lshape,host_write); - lshape.copy_from_host(host_write.begin()); - - // Copy shape, well, sigma, epsilon, and cutsq onto GPU - shape.safe_alloc(ntypes,3); - shape.cast_copy(host_shape[0],host_write); - well.safe_alloc(ntypes,3); - well.cast_copy(host_well[0],host_write); - - // Copy LJ data onto GPU - int 
lj_types=ntypes; - if (lj_types<=MAX_SHARED_TYPES) - lj_types=MAX_SHARED_TYPES; - form.safe_alloc(lj_types,lj_types); - form.copy_2Dfrom_host(host_form[0],ntypes,ntypes); - - // See if we want fast GB-sphere or sphere-sphere calculations - multiple_forms=false; - for (int i=1; imax_atoms); - - // Bind constant data to textures - lshape_bind_texture(lshape); - shape_bind_texture(shape); - well_bind_texture(well); - form_bind_texture(form); - - return this->nbor.host_ij.begin(); -} - -template -void GB_GPU_MemoryT::clear() { - if (!this->allocated) - return; - - int err_flag; - this->dev_error.copy_to_host(&err_flag); - if (err_flag == 1) - std::cerr << "COLLISION BUFFER OVERFLOW OCCURED. INCREASE COLLISION_N " - << "and RECOMPILE.\n"; - else if (err_flag == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; - - LJ_GPU_MemoryT::clear(); - - shape_unbind_texture(); - well_unbind_texture(); - form_unbind_texture(); - - shape.clear(); - well.clear(); - form.clear(); - lshape.clear(); - gamma_upsilon_mu.clear(); - host_olist.clear(); -} - -template -double GB_GPU_MemoryT::host_memory_usage() { - return this->atom.host_memory_usage(this->max_atoms)+ - this->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(GB_GPU_Memory)+this->max_atoms*sizeof(int); -} - -template class GB_GPU_Memory; diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h deleted file mode 100644 index 8886931c10..0000000000 --- a/lib/gpu/gb_gpu_memory.h +++ /dev/null @@ -1,74 +0,0 @@ -/*************************************************************************** - gb_gpu_memory.h - ------------------- - W. Michael Brown - - Global variables for GPU Gayberne Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#ifndef GB_GPU_MEMORY_H -#define GB_GPU_MEMORY_H - -#define MAX_GPU_THREADS 4 -#include "lj_gpu_memory.h" -#define LJ_GPU_MemoryT LJ_GPU_Memory - -enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; - -template -class GB_GPU_Memory : public LJ_GPU_MemoryT { - public: - GB_GPU_Memory(); - ~GB_GPU_Memory(); - - int* init(const int ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **host_shape, - double **host_well, double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, int **h_form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int max_nbors, const bool force_d, const int me); - - void clear(); - - double host_memory_usage(); - - // ---------------------------- DATA ---------------------------- - - // ilist with particles sorted by type - NVC_HostI host_olist; - - // --------------- Const Data for Atoms - NVC_ConstMatT shape, well; - NVC_ConstMatI form; - NVC_VecT lshape, gamma_upsilon_mu; - - - // --------------- Timing Stuff - NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2; - - // True if we want to use fast GB-sphere or sphere-sphere calculations - bool multiple_forms; - int **host_form; - int last_ellipse; - - private: -}; - -#endif - diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu deleted file mode 100644 index ea5ba440cc..0000000000 --- a/lib/gpu/lj_gpu.cu +++ /dev/null @@ -1,234 +0,0 @@ -/*************************************************************************** - lj_gpu.cu - ------------------- - W. Michael Brown - - Lennard-Jones potential GPU calcultation - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
- ----------------------------------------------------------------------- */ - -#include -#include -#include "nvc_macros.h" -#include "nvc_timer.h" -#include "nvc_device.h" -#include "lj_gpu_memory.h" -#include "lj_gpu_kernel.h" - -using namespace std; - -static LJ_GPU_Memory LJMF; -#define LJMT LJ_GPU_Memory - -// --------------------------------------------------------------------------- -// Convert something to a string -// --------------------------------------------------------------------------- -#include -template -inline string lj_gpu_toa(const t& in) { - ostringstream o; - o.precision(2); - o << in; - return o.str(); -} - -// --------------------------------------------------------------------------- -// Return string with GPU info -// --------------------------------------------------------------------------- -string lj_gpu_name(const int id, const int max_nbors) { - string name=LJMF.gpu.name(id)+", "+ - lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+ - lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+ - lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+ - lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id), - max_nbors))+" Atoms"; - return name; -} - -// --------------------------------------------------------------------------- -// Allocate memory on host and device and copy constants to device -// --------------------------------------------------------------------------- -int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma, - double **epsilon, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, - double *special_lj, const int max_nbors, const int gpu_id) { - if (LJMF.gpu.num_devices()==0) - return 0; - - ij_size=IJ_SIZE; - return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2, - host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id); -} - -// --------------------------------------------------------------------------- -// Clear memory on host and device -// --------------------------------------------------------------------------- -void lj_gpu_clear() { - LJMF.clear(); -} - -// --------------------------------------------------------------------------- -// copy atom positions and optionally types to device -// --------------------------------------------------------------------------- -template -inline void _lj_gpu_atom(PairGPUAtom &atom, double **host_x, - const int *host_type, const bool rebuild, - cudaStream_t &stream) { - atom.time_atom.start(); - atom.reset_write_buffer(); - - // First row of dev_x is x position, second is y, third is z - atom.add_atom_data(host_x[0],3); - atom.add_atom_data(host_x[0]+1,3); - atom.add_atom_data(host_x[0]+2,3); - - int csize=3; - - // If a rebuild occured, copy type data - if (rebuild) { - atom.add_atom_data(host_type); - csize++; - } - - atom.copy_atom_data(csize,stream); - atom.time_atom.stop(); -} - -void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) { - _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream); -} - -// --------------------------------------------------------------------------- -// Signal that we need to transfer a new neighbor list -// --------------------------------------------------------------------------- -template -bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum, - int *ilist, const int *numj) { - if (nall>ljm.max_atoms) - return false; - - ljm.nbor.time_nbor.start(); - - ljm.atom.nall(nall); - ljm.atom.inum(inum); - ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream); - - 
ljm.nbor.time_nbor.stop(); - return true; -} - -bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist, - const int *numj) { - return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj); -} - -// --------------------------------------------------------------------------- -// Copy a set of ij_size ij interactions to device and compute energies, -// forces, and torques for those interactions -// --------------------------------------------------------------------------- -template -void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) { - ljm.nbor.time_nbor.add_to_total(); - // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed - - ljm.nbor.time_nbor.start(); - ljm.nbor.add(num_ij,ljm.pair_stream); - ljm.nbor.time_nbor.stop(); -} - -void lj_gpu_nbors(const int num_ij) { - _lj_gpu_nbors(LJMF,num_ij); -} - -// --------------------------------------------------------------------------- -// Calculate energies and forces for all ij interactions -// --------------------------------------------------------------------------- -template -void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){ - // Compute the block size and grid size to keep all cores busy - const int BX=BLOCK_1D; - - int GX=static_cast(ceil(static_cast(ljm.atom.inum())/BX)); - - ljm.time_pair.start(); - if (ljm.shared_types) - kernel_lj_fast<<>> - (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), - ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), - ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, - vflag, ljm.atom.inum(), ljm.atom.nall()); - else - kernel_lj<<>> - (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), - ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), - ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, - vflag, ljm.atom.inum(), ljm.atom.nall()); - ljm.time_pair.stop(); -} - -void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) { - _lj_gpu(LJMF,eflag,vflag,rebuild); -} - -// --------------------------------------------------------------------------- -// Get energies and forces to host -// --------------------------------------------------------------------------- -template -double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial) { - double evdw; - - ljm.atom.time_answer.start(); - ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream); - - ljm.atom.time_atom.add_to_total(); - ljm.nbor.time_nbor.add_to_total(); - ljm.time_pair.add_to_total(); - // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed - - evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); - ljm.atom.add_forces(ilist,f); - ljm.atom.time_answer.stop(); - ljm.atom.time_answer.add_to_total(); - return evdw; -} - -double lj_gpu_forces(double **f, const int *ilist, const bool eflag, - const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial) { - return _lj_gpu_forces - (LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial); -} - -void lj_gpu_time() { - cout.precision(4); - cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n"; - cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n"; - cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n"; - cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n"; -} - -int lj_gpu_num_devices() { - return LJMF.gpu.num_devices(); -} - 
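Taken together, the entry points defined in this file are meant to be driven once per timestep from the pair style on the CPU: copy atom data, push neighbor data when the list was rebuilt, launch the kernel, then read energies, virials, and forces back. The sketch below only illustrates that inferred call order; compute_step() and its arguments (x, type, f, ilist, numj, ...) are stand-ins for the LAMMPS-side arrays and are not part of this library.

// Hypothetical per-timestep driver using the lj_gpu_* entry points above.
double compute_step(double **x, int *type, double **f, const int nall,
                    const int inum, int *ilist, int *numj, const bool rebuild,
                    const bool eflag, const bool vflag,
                    double *eatom, double **vatom, double *virial) {
  lj_gpu_atom(x, type, rebuild);           // positions (and types on rebuild) to GPU
  if (rebuild) {
    // returns false if nall exceeds the GPU atom limit
    lj_gpu_reset_nbors(nall, inum, ilist, numj);
    // ... refill the host ij buffer returned by lj_gpu_init() in chunks of
    //     ij_size, calling lj_gpu_nbors(num_ij) after each chunk ...
  }
  lj_gpu(eflag, vflag, rebuild);           // launch kernel_lj or kernel_lj_fast
  return lj_gpu_forces(f, ilist, eflag, vflag, /*eflag_atom=*/false,
                       /*vflag_atom=*/false, eatom, vatom, virial);
}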
-double lj_gpu_bytes() { - return LJMF.host_memory_usage(); -} diff --git a/lib/gpu/lj_gpu_kernel.h b/lib/gpu/lj_gpu_kernel.h deleted file mode 100644 index 5f3f0ff7c6..0000000000 --- a/lib/gpu/lj_gpu_kernel.h +++ /dev/null @@ -1,248 +0,0 @@ -/*************************************************************************** - lj_gpu_kernel.cu - ------------------- - W. Michael Brown - - Routines that actually perform the force computation - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef LJ_GPU_KERNEL -#define LJ_GPU_KERNEL - -template -__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, - const int *dev_ij, const int nbor_pitch, acctyp *ans, - size_t ans_pitch, const bool eflag, - const bool vflag, const int inum, const int nall) { - __shared__ numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - if (ii<4) - sp_lj[ii]=special_lj[ii]; - ii+=INT_MUL(blockIdx.x,blockDim.x); - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=_x_(i,3); - - numtyp factor_lj; - for ( ; list(j,3); - - // Compute r12 - numtyp delx = ix-_x_(j,0); - numtyp dely = iy-_x_(j,1); - numtyp delz = iz-_x_(j,2); - numtyp r2inv = delx*delx+dely*dely+delz*delz; - - if (r2inv<_cutsq_(itype,jtype)) { - r2inv=(numtyp)1.0/r2inv; - numtyp r6inv =r2inv*r2inv*r2inv; - numtyp force =factor_lj*r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv- - _lj1_(itype,jtype).y); - - fx+=delx*force; - fy+=dely*force; - fz+=delz*force; - - if (eflag) { - numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv- - _lj3_(itype,jtype).y); - energy+=factor_lj*(e-_offset_(1,1)); - } - if (vflag) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - - // Store answers - acctyp *ap1=ans+ii; - if (eflag) { - *ap1=energy; - ap1+=ans_pitch; - } - if (vflag) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1+=ans_pitch; - } - } - *ap1=fx; - ap1+=ans_pitch; - *ap1=fy; - ap1+=ans_pitch; - *ap1=fz; - - } // if ii -} - -template -__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor, - const int *dev_ij, const int nbor_pitch, - acctyp *ans, size_t ans_pitch,const bool eflag, - const bool vflag, const int inum, - const int nall) { - - // ii indexes the two interacting particles in gi - int ii=threadIdx.x; - __shared__ numtyp sp_lj[4]; - __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; 
- if (ii<4) - sp_lj[ii]=special_lj[ii]; - if (ii(itype,jtype); - lj1[ii]=_lj1_(itype,jtype).x; - lj2[ii]=_lj1_(itype,jtype).y; - if (eflag) { - lj3[ii]=_lj3_(itype,jtype).x; - lj4[ii]=_lj3_(itype,jtype).y; - offset[ii]=_offset_(itype,jtype); - } - } - ii+=INT_MUL(blockIdx.x,blockDim.x); - - if (ii(i,0); - numtyp iy=_x_(i,1); - numtyp iz=_x_(i,2); - int itype=INT_MUL(MAX_SHARED_TYPES,_x_(i,3)); - - numtyp factor_lj; - for ( ; list(j,3); - - // Compute r12 - numtyp delx = ix-_x_(j,0); - numtyp dely = iy-_x_(j,1); - numtyp delz = iz-_x_(j,2); - numtyp r2inv = delx*delx+dely*dely+delz*delz; - - if (r2inv - -template -int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { - return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors); -} - -template -int LJ_GPU_MemoryT::get_max_atoms(const int gpu_bytes, const int max_nbors) { - int matoms=static_cast(PERCENT_GPU_MEMORY*gpu_bytes/ - bytes_per_atom(max_nbors)); - if (matoms>MAX_ATOMS) - matoms=MAX_ATOMS; - return matoms; -} - -template -int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int max_nbors, - const int me) { - if (allocated) - clear(); - - if (me>=gpu.num_devices()) - return 0; - gpu.set(me); - if (gpu.revision()<1.0) - return 0; - - // Initialize timers for the selected GPU - time_pair.init(); - - // Initialize atom and nbor data - max_atoms=get_max_atoms(gpu.bytes(),max_nbors); - atom.init(max_atoms); - nbor.init(ij_size,max_atoms,max_nbors); - - // Get a stream for computing pair potentials - CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream)); - - // Use the write buffer from atom for data initialization - NVC_HostT &host_write=atom.host_write; - assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); - - // Copy data for bonded interactions - special_lj.safe_alloc(4); - special_lj.cast_copy(host_special_lj,host_write); - - // Copy sigma, epsilon, and cutsq onto GPU - sigma.safe_alloc(ntypes,ntypes); - sigma.cast_copy(host_sigma[0],host_write); - epsilon.safe_alloc(ntypes,ntypes); - epsilon.cast_copy(host_epsilon[0],host_write); - cutsq.safe_alloc(ntypes,ntypes); - cutsq.cast_copy(host_cutsq[0],host_write); - - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - if (lj_types<=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; - shared_types=true; - } - offset.safe_alloc(lj_types,lj_types); - offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes); - double *t1=host_lj1[0]; - double *t2=host_lj2[0]; - for (int i=0; i::vec2 *> (host_write.begin()), - ntypes,ntypes); - t1=host_lj3[0]; - t2=host_lj4[0]; - for (int i=0; i::vec2 *> (host_write.begin()), - ntypes,ntypes); - - // Bind constant data to textures - sigma_bind_texture(sigma); - epsilon_bind_texture(epsilon); - cutsq_bind_texture(cutsq); - offset_bind_texture(offset); - lj1_bind_texture::vec2>(lj1); - lj3_bind_texture::vec2>(lj3); - - dev_error.safe_alloc(1); - dev_error.zero(); - - allocated=true; - - return nbor.host_ij.begin(); -} - -template -void LJ_GPU_MemoryT::clear() { - if (!allocated) - return; - allocated=false; - - // Check for any pair style specific errors here - int err_flag; - dev_error.copy_to_host(&err_flag); - - atom.clear(); - nbor.clear(); - - sigma_unbind_texture(); - epsilon_unbind_texture(); - cutsq_unbind_texture(); - offset_unbind_texture(); - 
lj1_unbind_texture::vec2>(); - lj3_unbind_texture::vec2>(); - - CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream)); - - dev_error.clear(); - sigma.clear(); - epsilon.clear(); - special_lj.clear(); - cutsq.clear(); - offset.clear(); - lj1.clear(); - lj3.clear(); -} - -template -double LJ_GPU_MemoryT::host_memory_usage() const { - return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+ - sizeof(LJ_GPU_Memory); -} - -template class LJ_GPU_Memory; diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h deleted file mode 100644 index a3b4358192..0000000000 --- a/lib/gpu/lj_gpu_memory.h +++ /dev/null @@ -1,88 +0,0 @@ -/*************************************************************************** - lj_gpu_memory.h - ------------------- - W. Michael Brown - - Global variables for GPU Lennard-Jones Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef LJ_GPU_MEMORY_H -#define LJ_GPU_MEMORY_H - -#include "nvc_device.h" -#include "pair_gpu_atom.h" -#include "pair_gpu_nbor.h" - -#define BLOCK_1D 64 -#define MAX_SHARED_TYPES 8 -#define PERCENT_GPU_MEMORY 0.7 - -template -class LJ_GPU_Memory { - public: - LJ_GPU_Memory() : allocated(false) {} - ~LJ_GPU_Memory() { clear(); } - - /// Allocate memory on host and device - int* init(const int ij_size, const int ntypes, double **host_cutsq, - double **host_sigma, double **host_epsilon, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int max_nbors, const int me); - /// Free any memory on host and device - void clear(); - - /// Returns memory usage on GPU per atom - int bytes_per_atom(const int max_nbors) const; - /// Maximum number of atoms that can be stored on GPU - int get_max_atoms(const int gpu_bytes, const int max_nbors); - /// Total host memory used by library - double host_memory_usage() const; - - // ------------------------- DATA ----------------------------- - - // Device Properties - NVCDevice gpu; - // Device Error Flag - NVC_VecI dev_error; - // Stream for asynchronous work - cudaStream_t pair_stream; - - // Atom Data - PairGPUAtom atom; - // Neighbor Data - PairGPUNbor nbor; - - // --------------- Const Data for Atoms - NVC_ConstMatT sigma, epsilon, cutsq, offset; - NVC_ConstMat< typename cu_vec_traits::vec2 > lj1, lj3; - NVC_VecT special_lj; - - size_t max_atoms; - - // Timing for pair calculation - NVCTimer time_pair; - - // If atom type constants fit in shared memory, use fast kernels - bool shared_types; - - protected: - bool allocated; -}; - -#endif diff --git a/lib/gpu/nvc_device.cu b/lib/gpu/nvc_device.cu deleted file mode 100644 index 01a2db1bd7..0000000000 --- a/lib/gpu/nvc_device.cu +++ /dev/null @@ -1,96 +0,0 @@ -/*************************************************************************** - nvc_device.cu - 
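One detail worth noting in init() above: the lj1/lj2 and lj3/lj4 coefficient tables are interleaved element by element on the host before the copy, so each table pair becomes a single two-component texture fetch (_lj1_ and _lj3_) in the kernels. The fragment below is only a stripped-down sketch of that packing; pack_pair_coeffs() and its pointer arguments are illustrative, and a plain float2 stands in for cu_vec_traits<numtyp>::vec2.

// Sketch: interleave two ntypes*ntypes coefficient tables into one float2 table
// so the kernel reads both coefficients with a single texture fetch.
#include <cuda_runtime.h>

void pack_pair_coeffs(const double *t1, const double *t2, float2 *packed,
                      const int ntypes) {
  for (int i = 0; i < ntypes * ntypes; ++i) {
    packed[i].x = static_cast<float>(t1[i]);   // e.g. lj1 = 48*eps*sigma^12
    packed[i].y = static_cast<float>(t2[i]);   // e.g. lj2 = 24*eps*sigma^6
  }
}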
------------------- - W. Michael Brown - - Utilities for dealing with cuda devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Wed Jan 28 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#include -#include -#include -#include "nvc_macros.h" -#include "nvc_device.h" - -// Grabs the properties for all devices -NVCDevice::NVCDevice() { - CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices)); - for (int dev=0; dev<_num_devices; ++dev) { - cudaDeviceProp deviceProp; - CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev)); - if (deviceProp.major == 9999 && deviceProp.minor == 9999) - break; - _properties.push_back(deviceProp); - } - _device=0; -} - -// Set the CUDA device to the specified device number -void NVCDevice::set(int num) { - if (_device==num) - return; - cudaThreadExit(); - CUDA_SAFE_CALL(cudaSetDevice(num)); - _device=num; -} - -// List all devices along with all properties -void NVCDevice::print_all(ostream &out) { - if (num_devices() == 0) - printf("There is no device supporting CUDA\n"); - for (int i=0; i= 2000 - printf(" Number of multiprocessors: %d\n", - _properties[i].multiProcessorCount); - printf(" Number of cores: %d\n",cores(i)); - #endif - printf(" Total amount of constant memory: %u bytes\n", - _properties[i].totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", - _properties[i].sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", - _properties[i].regsPerBlock); - printf(" Warp size: %d\n", - _properties[i].warpSize); - printf(" Maximum number of threads per block: %d\n", - _properties[i].maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", - _properties[i].maxThreadsDim[0], - _properties[i].maxThreadsDim[1], - _properties[i].maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", - _properties[i].maxGridSize[0], - _properties[i].maxGridSize[1], - _properties[i].maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", - _properties[i].memPitch); - printf(" Texture alignment: %u bytes\n", - _properties[i].textureAlignment); - printf(" Clock rate: %.2f GHz\n", - clock_rate(i)); - #if CUDART_VERSION >= 2000 - printf(" Concurrent copy and execution: %s\n", - _properties[i].deviceOverlap ? "Yes" : "No"); - #endif - } -} - diff --git a/lib/gpu/nvc_device.h b/lib/gpu/nvc_device.h deleted file mode 100644 index 1ba5f2bc4c..0000000000 --- a/lib/gpu/nvc_device.h +++ /dev/null @@ -1,92 +0,0 @@ -/*************************************************************************** - nvc_device.h - ------------------- - W. 
Michael Brown - - Utilities for dealing with cuda devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Wed Jan 28 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef NVC_DEVICE -#define NVC_DEVICE - -#include -#include -#include - -using namespace std; - -/// Class for looking at device properties -/** \note Calls to change the device outside of the class results in incorrect - * behavior - * \note There is no error checking for indexing past the number of devices **/ -class NVCDevice { - public: - /// Grabs the properties for all devices - NVCDevice(); - - /// Return the number of devices that support CUDA - inline int num_devices() { return _properties.size(); } - - /// Set the CUDA device to the specified device number - void set(int num); - - /// Get the current device number - inline int device_num() { return _device; } - - /// Get the current CUDA device name - inline string name() { return name(_device); } - /// Get the CUDA device name - inline string name(const int i) { return string(_properties[i].name); } - - /// Get the number of cores in the current device - inline unsigned cores() { return cores(_device); } - /// Get the number of cores - inline unsigned cores(const int i) - { return _properties[i].multiProcessorCount*8; } - - /// Get the gigabytes of global memory in the current device - inline double gigabytes() { return gigabytes(_device); } - /// Get the gigabytes of global memory - inline double gigabytes(const int i) - { return static_cast(_properties[i].totalGlobalMem)/1073741824; } - - /// Get the bytes of global memory in the current device - inline size_t bytes() { return bytes(_device); } - /// Get the bytes of global memory - inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; } - - /// Return the GPGPU revision number for current device - inline double revision() { return revision(_device); } - /// Return the GPGPU revision number - inline double revision(const int i) - { return static_cast(_properties[i].minor)/10+_properties[i].major;} - - /// Clock rate in GHz for current device - inline double clock_rate() { return clock_rate(_device); } - /// Clock rate in GHz - inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;} - - /// List all devices along with all properties - void print_all(ostream &out); - - private: - int _device, _num_devices; - vector _properties; -}; - -#endif diff --git a/lib/gpu/nvc_get_devices.cu b/lib/gpu/nvc_get_devices.cu deleted file mode 100644 index a1d35232e6..0000000000 --- a/lib/gpu/nvc_get_devices.cu +++ /dev/null @@ -1,31 +0,0 @@ -/*************************************************************************** - nvc_get_devices.h - ------------------- - W. 
Michael Brown - - List properties of cuda devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Wed Jan 28 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#include "nvc_device.h" - -int main(int argc, char** argv) { - NVCDevice gpu; - gpu.print_all(cout); - return 0; -} - diff --git a/lib/gpu/nvc_macros.h b/lib/gpu/nvc_macros.h deleted file mode 100644 index 56bb5c64ce..0000000000 --- a/lib/gpu/nvc_macros.h +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef NVC_MACROS_H -#define NVC_MACROS_H - -#include -#include "math_constants.h" -#define INT_MUL(x,y) (__mul24(x,y)) -//#define INT_MUL(x,y) ((x)*(y)) - -template -static __inline__ __device__ numbr cuda_inf() { return CUDART_INF_F; } - -#ifdef CUDA_DOUBLE -template <> -static __inline__ __device__ double cuda_inf() { return CUDART_INF; } -#endif - -template -static __inline__ __device__ numbr cuda_zero() { return 0.0; } - -template <> -static __inline__ __device__ float cuda_zero() { return 0.0f; } - -#ifdef DEBUG - -# define CU_SAFE_CALL_NO_SYNC( call ) do { \ - CUresult err = call; \ - if( CUDA_SUCCESS != err) { \ - fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ - err, __FILE__, __LINE__ ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CU_SAFE_CALL( call ) do { \ - CU_SAFE_CALL_NO_SYNC(call); \ - CUresult err = cuCtxSynchronize(); \ - if( CUDA_SUCCESS != err) { \ - fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \ - err, __FILE__, __LINE__ ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUDA_SAFE_CALL_NO_SYNC( call) do { \ - cudaError err = call; \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ - __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUDA_SAFE_CALL( call) do { \ - CUDA_SAFE_CALL_NO_SYNC(call); \ - cudaError err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ - __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUFFT_SAFE_CALL( call) do { \ - cufftResult err = call; \ - if( CUFFT_SUCCESS != err) { \ - fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } } while (0) - -# define CUT_SAFE_CALL( call) \ - if( CUTTrue != call) { \ - fprintf(stderr, "Cut error in file '%s' in line %i.\n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } - - //! 
Check for CUDA error -# define CUT_CHECK_ERROR(errorMessage) do { \ - cudaError_t err = cudaGetLastError(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } \ - err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ - } } while (0) - - //! Check for malloc error -# define CUT_SAFE_MALLOC( mallocCall ) do{ \ - if( !(mallocCall)) { \ - fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } } while(0); - - //! Check if conditon is true (flexible assert) -# define CUT_CONDITION( val) \ - if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) { \ - exit(EXIT_FAILURE); \ - } - -#else // not DEBUG - -#define CUT_BANK_CHECKER( array, index) array[index] - - // void macros for performance reasons -# define CUT_CHECK_ERROR(errorMessage) -# define CUT_CHECK_ERROR_GL() -# define CUT_CONDITION( val) -# define CU_SAFE_CALL_NO_SYNC( call) call -# define CU_SAFE_CALL( call) call -# define CUDA_SAFE_CALL_NO_SYNC( call) call -# define CUDA_SAFE_CALL( call) call -# define CUT_SAFE_CALL( call) call -# define CUFFT_SAFE_CALL( call) call -# define CUT_SAFE_MALLOC( mallocCall ) mallocCall - -#endif - -#endif diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h deleted file mode 100644 index a83178985c..0000000000 --- a/lib/gpu/nvc_memory.h +++ /dev/null @@ -1,447 +0,0 @@ -/*************************************************************************** - nvc_memory.h - ------------------- - W. Michael Brown - - Routines for memory management on CUDA devices - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Thu Jun 25 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
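The wrappers in nvc_macros.h above give every CUDA call two behaviors from a single spelling: with DEBUG defined, each call is followed by a synchronize and an error check that aborts with file and line, while in an optimized build the macro reduces to the bare call. A small usage sketch follows; the function and buffer names are arbitrary, and only the CUDA_SAFE_CALL / CUT_CHECK_ERROR macros come from the header.

// Sketch: the same source lines are error-checked in DEBUG builds and free otherwise.
#include <cuda_runtime.h>
#include "nvc_macros.h"

void alloc_and_zero(float **dev_buf, const size_t n) {
  CUDA_SAFE_CALL(cudaMalloc((void **)dev_buf, n * sizeof(float)));
  CUDA_SAFE_CALL(cudaMemset(*dev_buf, 0, n * sizeof(float)));
  CUT_CHECK_ERROR("alloc_and_zero failed");   // no-op unless compiled with DEBUG
}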
- ----------------------------------------------------------------------- */ - -#ifndef NVC_MEMORY_H -#define NVC_MEMORY_H - -#include - -#define NVC_HostT NVC_Host -#define NVC_HostD NVC_Host -#define NVC_HostS NVC_Host -#define NVC_HostI NVC_Host - -#define NVC_VecT NVC_Vec -#define NVC_VecD NVC_Vec -#define NVC_VecS NVC_Vec -#define NVC_VecI NVC_Vec -#define NVC_VecI2 NVC_Vec -#define NVC_VecU2 NVC_Vec - -#define NVC_MatT NVC_Mat -#define NVC_MatD NVC_Mat -#define NVC_MatS NVC_Mat -#define NVC_MatI NVC_Mat - -#define NVC_ConstMatT NVC_ConstMat -#define NVC_ConstMatD NVC_ConstMat -#define NVC_ConstMatS NVC_ConstMat -#define NVC_ConstMatI NVC_ConstMat -#define NVC_ConstMatD2 NVC_ConstMat - -namespace NVC { - -// Get a channel for float array -template -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); -} - -// Get a channel for float2 array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); -} - -// Get a channel for double array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned); -} - -// Get a channel for double array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned); -} - -// Get a channel for int array -template <> -inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { - channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned); -} - -} - -/// Page-locked Row Vector on Host -template -class NVC_Host { - public: - NVC_Host() { _cols=0; } - ~NVC_Host() { if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); } - - // Allocate page-locked memory with fast write/slow read on host - inline void safe_alloc_w(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes, - cudaHostAllocWriteCombined)); - _end=_array+cols; - } - - // Allocate page-locked memory with fast read/write on host - inline void safe_alloc_rw(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes)); - _end=_array+cols; - } - - /// Free any memory associated with device - inline void clear() - { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } } - - /// Set each element to zero - inline void zero() { memset(_array,0,row_bytes()); } - - /// Set first n elements to zero - inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); } - - inline numtyp * begin() { return _array; } - inline const numtyp * begin() const { return _array; } - inline numtyp * end() { return _end; } - inline const numtyp * end() const { return _end; } - - inline size_t numel() const { return _cols; } - inline size_t rows() const { return 1; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _cols; } - inline size_t row_bytes() const { return _row_bytes; } - - inline numtyp & operator[](const int i) { return _array[i]; } - inline const numtyp & operator[](const int i) const { return _array[i]; } - - /// Copy from device (numel is not bytes) - inline void copy_from_device(const numtyp *device_p, size_t numel) { - CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp), - cudaMemcpyDeviceToHost)); - } - - /// Copy 
to device (numel is not bytes) - inline void copy_to_device(numtyp *device_p, size_t numel) { - CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp), - cudaMemcpyHostToDevice)); - } - - /// Copy to 2D matrix on device (numel is not bytes) - inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, - const size_t rows, const size_t cols) { - CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp), - _array,cols*sizeof(numtyp), - cols*sizeof(numtyp),rows, - cudaMemcpyHostToDevice)); - } - - /// Asynchronous copy from device (numel is not bytes) - inline void copy_from_device(const numtyp *device_p, size_t numel, - cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp), - cudaMemcpyDeviceToHost,stream)); - } - - /// Asynchronous copy to device (numel is not bytes) - inline void copy_to_device(numtyp *device_p, size_t numel, - cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp), - cudaMemcpyHostToDevice,stream)); - } - - /// Asynchronous copy to 2D matrix on device (numel is not bytes) - inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, - const size_t rows, const size_t cols, - cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp), - _array,cols*sizeof(numtyp), - cols*sizeof(numtyp),rows, - cudaMemcpyHostToDevice,stream)); - } - - private: - numtyp *_array, *_end; - size_t _row_bytes, _row_size, _rows, _cols; -}; - -/// Row vector on device -template -class NVC_Vec { - public: - NVC_Vec() { _cols=0; } - ~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); } - - // Row vector on device - inline void safe_alloc(const size_t cols) { - _cols=cols; - _row_bytes=cols*sizeof(numtyp); - CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes)); - _end=_array+cols; - } - - /// Free any memory associated with device - inline void clear() - { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } } - - /// Set each element to zero - inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); } - - inline numtyp * begin() { return _array; } - inline const numtyp * begin() const { return _array; } - inline numtyp * end() { return _end; } - inline const numtyp * end() const { return _end; } - - inline size_t numel() const { return _cols; } - inline size_t rows() const { return 1; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _cols; } - inline size_t row_bytes() const { return _row_bytes; } - - /// Copy from host - inline void copy_from_host(const numtyp *host_p) - { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(), - cudaMemcpyHostToDevice)); } - - /// Asynchronous copy from host - inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) - { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(), - cudaMemcpyHostToDevice, stream)); } - - /// Copy to host - inline void copy_to_host(numtyp *host_p) - { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(), - cudaMemcpyDeviceToHost)); } - - /// Copy n elements to host - inline void copy_to_host(numtyp *host_p, const int n) - { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp), - cudaMemcpyDeviceToHost)); } - - /// Cast and then copy to device - template - inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { - for (int i=0; i(buffer[i]); - copy_from_host(host_write.begin()); - } - - /// Bind to texture - template - inline void bind_texture(texture &texi, cudaChannelFormatDesc 
&channel) { - NVC::cuda_gb_get_channel(channel); - texi.addressMode[0] = cudaAddressModeClamp; - texi.addressMode[1] = cudaAddressModeClamp; - texi.filterMode = cudaFilterModePoint; - texi.normalized = false; - CUDA_SAFE_CALL(cudaBindTexture(NULL,&texi,_array,&channel)); - } - - /// Output the vector (debugging) - inline void print(std::ostream &out) { print (out, numel()); } - - // Output first n elements of vector - inline void print(std::ostream &out, const int n) { - numtyp *t=new numtyp[n]; - copy_to_host(t,n); - for (int i=0; i -class NVC_Mat { - public: - NVC_Mat() { _rows=0; } - ~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); } - - // Row major matrix on device - // - Coalesced access using adjacent cols on same row - // - NVC_Mat(row,col) given by array[row*row_size()+col] - inline void safe_alloc(const size_t rows, const size_t cols) { - _rows=rows; - _cols=cols; - CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch, - cols*sizeof(numtyp),rows)); - _row_size=_pitch/sizeof(numtyp); - _end=_array+_row_size*cols; - } - - /// Free any memory associated with device - inline void clear() - { if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } } - - /// Set each element to zero - inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0, _pitch*_rows)); } - - inline numtyp * begin() { return _array; } - inline const numtyp * begin() const { return _array; } - inline numtyp * end() { return _end; } - inline const numtyp * end() const { return _end; } - - - inline size_t numel() const { return _cols*_rows; } - inline size_t rows() const { return _rows; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _row_size; } - inline size_t row_bytes() const { return _pitch; } - - /// Copy from host (elements not bytes) - inline void copy_from_host(const numtyp *host_p, const size_t numel) - { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp), - cudaMemcpyHostToDevice)); } - - /// Asynchronous copy from host (elements not bytes) - inline void copy_from_host(const numtyp *host_p, const size_t numel, - cudaStream_t &stream) - { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp), - cudaMemcpyHostToDevice, stream)); } - - /// Asynchronous Copy from Host - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, - const size_t cols, cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp), - cols*sizeof(numtyp), rows, - cudaMemcpyHostToDevice,stream)); - } - - private: - numtyp *_array, *_end; - size_t _pitch, _row_size, _rows, _cols; -}; - -/// Const 2D Matrix on device (requires texture binding) -template -class NVC_ConstMat { - public: - NVC_ConstMat() { _rows=0; } - ~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); } - - /// Row major matrix on device - inline void safe_alloc(const size_t rows, const size_t cols) { - _rows=rows; - _cols=cols; - - NVC::cuda_gb_get_channel(_channel); - CUDA_SAFE_CALL(cudaMallocArray(&_array, &_channel, cols, rows)); - } - - /// Bind to texture - template - inline void bind_texture(texture &texi) { - texi.addressMode[0] = cudaAddressModeClamp; - texi.addressMode[1] = cudaAddressModeClamp; - texi.filterMode = cudaFilterModePoint; - texi.normalized = false; - CUDA_SAFE_CALL(cudaBindTextureToArray(&texi,_array,&_channel)); - } - - /// Free any memory associated with device - inline void clear() - { if (_rows>0) { 
_rows=0; CUDA_SAFE_CALL(cudaFreeArray(_array)); } } - - inline size_t numel() const { return _cols*_rows; } - inline size_t rows() const { return _rows; } - inline size_t cols() const { return _cols; } - inline size_t row_size() const { return _cols; } - inline size_t row_bytes() const { return _cols*sizeof(numtyp); } - - /// Copy from Host - inline void copy_from_host(const numtyp *host_p) { - CUDA_SAFE_CALL(cudaMemcpyToArray(_array, 0, 0, host_p, - numel()*sizeof(numtyp), - cudaMemcpyHostToDevice)); - } - - /// Copy from Host - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, - const size_t cols) { - CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array, 0, 0, host_p, - cols*sizeof(numtyp), cols*sizeof(numtyp), rows, - cudaMemcpyHostToDevice)); - } - - /// Asynchronous Copy from Host - inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p, - numel()*sizeof(numtyp), - cudaMemcpyHostToDevice,stream)); - } - - /// Asynchronous Copy from Host - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, - const size_t cols, cudaStream_t &stream) { - CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p, - cols*sizeof(numtyp), cols*sizeof(numtyp), rows, - cudaMemcpyHostToDevice,stream)); - } - - /// Cast buffer to numtyp in host_write and copy to array - template - inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { - int n=numel(); - for (int i=0; i(*buffer); buffer++; - } - copy_from_host(host_write.begin()); - } - - /// Cast buffer to numtyp in host_write and copy to array - /** \note Used when the number of columns/rows allocated on host smaller than - * on device **/ - template - inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write, - const size_t rows, const size_t cols) { - int n=rows*cols; - for (int i=0; i(*buffer); buffer++; - } - copy_2Dfrom_host(host_write.begin(),rows,cols); - } - - /// Cast buffer to numtyp in host_write and copy to array asynchronously - template - inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write, - cudaStream_t &stream) { - int n=numel(); - for (int i=0; i(*buffer); buffer++; - } - copy_from_host(host_write.begin(),stream); - } - - private: - size_t _rows, _cols; - cudaArray *_array; - cudaChannelFormatDesc _channel; -}; - -#endif diff --git a/lib/gpu/nvc_timer.h b/lib/gpu/nvc_timer.h deleted file mode 100644 index 12346b5ea2..0000000000 --- a/lib/gpu/nvc_timer.h +++ /dev/null @@ -1,83 +0,0 @@ -/*************************************************************************** - nvc_timer.h - ------------------- - W. Michael Brown - - Class for timing CUDA routines - - __________________________________________________________________________ - This file is part of the NVC Library - __________________________________________________________________________ - - begin : Tue Feb 3 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. 
This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef NVC_TIMER_H -#define NVC_TIMER_H - -#include "nvc_macros.h" - -#define cudaEventDestroy(a) - -/// Class for timing CUDA events -class NVCTimer { - public: - NVCTimer() : _total_time(0.0f), initialized(false) { } - - ~NVCTimer() { - if (initialized) - { cudaEventDestroy(start_event); cudaEventDestroy(stop_event); } - } - - inline void init() { - if (!initialized) { - initialized=true; - CUDA_SAFE_CALL( cudaEventCreate(&start_event) ); - CUDA_SAFE_CALL( cudaEventCreate(&stop_event) ); - } - } - - /// Start timing - inline void start() { cudaEventRecord(start_event,0); } - - /// Stop timing and store event time - inline void stop() { cudaEventRecord(stop_event,0); } - - /// Set the time elapsed to zero (not the total_time) - inline void zero() - { cudaEventRecord(start_event,0); cudaEventRecord(stop_event,0); } - - /// Add time from previous start and stop to total - /** Forces synchronization **/ - inline void add_to_total() { _total_time+=time(); } - - /// Return the time (ms) of last start to stop - Forces synchronization - inline double time() { - float timer; - cudaEventSynchronize(stop_event); - CUDA_SAFE_CALL( cudaEventElapsedTime(&timer,start_event,stop_event) ); - return timer; - } - - /// Return the total time in ms - inline double total_time() { return _total_time; } - - /// Return the total time in seconds - inline double total_seconds() { return _total_time/1000.0; } - - private: - cudaEvent_t start_event, stop_event; - double _total_time; - bool initialized; -}; - -#endif diff --git a/lib/gpu/pair_gpu_atom.cu b/lib/gpu/pair_gpu_atom.cu deleted file mode 100644 index 94617a5c11..0000000000 --- a/lib/gpu/pair_gpu_atom.cu +++ /dev/null @@ -1,173 +0,0 @@ -/*************************************************************************** - pair_gpu_atom.cu - ------------------- - W. Michael Brown - - Memory routines for moving atom and force data between host and gpu - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
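The timer above is a thin wrapper around a pair of CUDA events; the call pattern used throughout the library is init() once per timer, start()/stop() around each piece of GPU work, add_to_total() when the elapsed time is wanted (this synchronizes on the stop event), and total_seconds() for the final report. A compact usage sketch follows; time_loop() and its buffer are arbitrary, with cudaMemset standing in for a real kernel or copy.

// Sketch: accumulate GPU timing across repeated operations with NVCTimer.
#include <cstdio>
#include <cuda_runtime.h>
#include "nvc_timer.h"

void time_loop(float *dev_buf, const size_t bytes, const int nsteps) {
  NVCTimer t;
  t.init();                           // create the CUDA events once
  for (int i = 0; i < nsteps; ++i) {
    t.start();                        // record the start event
    cudaMemset(dev_buf, 0, bytes);    // stand-in for the timed GPU work
    t.stop();                         // record the stop event
    t.add_to_total();                 // synchronize and accumulate milliseconds
  }
  printf("GPU work: %.4f s.\n", t.total_seconds());
}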
- ----------------------------------------------------------------------- */ - -#include "pair_gpu_atom.h" -#define PairGPUAtomT PairGPUAtom - -template -int PairGPUAtomT::bytes_per_atom() const { - return atom_fields()*sizeof(numtyp)+ans_fields()*sizeof(acctyp); -} - -template -void PairGPUAtomT::init(const int max_atoms) { - if (allocated) - clear(); - - _max_atoms=max_atoms; - - // Initialize timers for the selected GPU - time_atom.init(); - time_answer.init(); - - // Device matrices for atom and force data - dev_x.safe_alloc(atom_fields(),max_atoms); - x_bind_texture(dev_x); - ans.safe_alloc(ans_fields(),max_atoms); - - // Get a host write only buffer - host_write.safe_alloc_w(max_atoms*4); - // Get a host read/write buffer - host_read.safe_alloc_rw(ans.row_size()*ans_fields()); - - allocated=true; -} - -template -void PairGPUAtomT::clear() { - if (!allocated) - return; - allocated=false; - - x_unbind_texture(); - ans.clear(); - host_write.clear(); - host_read.clear(); - dev_x.clear(); -} - -template -double PairGPUAtomT::host_memory_usage(const int max_atoms) const { - return max_atoms*atom_fields()*sizeof(numtyp)+ - ans_fields()*(max_atoms)*sizeof(acctyp)+ - sizeof(PairGPUAtom); -} - -template -void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, - cudaStream_t &s) { - _eflag=eflag; - _vflag=vflag; - - int csize=ans_fields(); - if (!eflag) - csize--; - if (!vflag) - csize-=6; - - host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s); -} - -template -double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom, - const bool vflag_atom, double *eatom, - double **vatom, double *virial) { - double evdwl=0.0; - int gap=ans.row_size()-_inum; - - acctyp *ap=host_read.begin(); - if (_eflag) { - if (eflag_atom) { - for (int i=0; i<_inum; i++) { - evdwl+=*ap; - eatom[ilist[i]]+=*ap*0.5; - ap++; - } - } else - for (int i=0; i<_inum; i++) { - evdwl+=*ap; - ap++; - } - ap+=gap; - evdwl*=0.5; - } - _read_loc=ap; - gap=ans.row_size(); - if (_vflag) { - if (vflag_atom) { - for (int ii=0; ii<_inum; ii++) { - int i=ilist[ii]; - ap=_read_loc+ii; - for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=gap; - } - } - } else { - for (int ii=0; ii<_inum; ii++) { - ap=_read_loc+ii; - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=gap; - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - _read_loc+=gap*6; - } - - return evdwl; -} - -template -void PairGPUAtomT::add_forces(const int *ilist, double **f) { - int gap=ans.row_size(); - for (int ii=0; ii<_inum; ii++) { - acctyp *ap=_read_loc+ii; - int i=ilist[ii]; - f[i][0]+=*ap; - ap+=gap; - f[i][1]+=*ap; - ap+=gap; - f[i][2]+=*ap; - } -} - -template -void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) { - int gap=ans.row_size(); - _read_loc+=gap*3; - for (int ii=0; ii; diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h deleted file mode 100644 index a5796db45a..0000000000 --- a/lib/gpu/pair_gpu_atom.h +++ /dev/null @@ -1,151 +0,0 @@ -/*************************************************************************** - pair_gpu_atom.h - ------------------- - W. Michael Brown - - Memory routines for moving atom and force data between host and gpu - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. 
Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - ----------------------------------------------------------------------- */ - -#ifndef PAIR_GPU_ATOM_H -#define PAIR_GPU_ATOM_H - -// PRECISION - Precision for rsq, energy, force, and torque calculation -// ACC_PRECISION - Precision for accumulation of energies, forces, and torques -#ifdef _SINGLE_DOUBLE -#define PRECISION float -#define ACC_PRECISION double -#endif - -#ifdef _DOUBLE_DOUBLE -#define PRECISION double -#define ACC_PRECISION double -#endif - -#ifndef PRECISION -#define PRECISION float -#define ACC_PRECISION double -#endif - -#define MAX_ATOMS 65536 -#include "nvc_timer.h" -#include "pair_gpu_texture.h" - -template -class PairGPUAtom { - public: - PairGPUAtom() : _atom_fields(4), _ans_fields(10), allocated(false) {} - ~PairGPUAtom() { clear(); } - - // Accessors - inline int atom_fields() const { return _atom_fields; } - inline int ans_fields() const { return _ans_fields; } - inline int max_atoms() const { return _max_atoms; } - inline int nall() const { return _nall; } - inline int inum() const { return _inum; } - - /// Set number of atoms for future copy operations - inline void nall(const int n) { _nall=n; } - /// Set number of inum for future copy operations - inline void inum(const int n) { _inum=n; } - /// Set the number of atom fields (x, y, z, type, etc) - inline void atom_fields(const int n) { _atom_fields=n; } - /// Set the number of answer fields (energy, virial, force, etc.) 
- inline void ans_fields(const int n) { _ans_fields=n; } - - /// Memory usage per atom in this class - /** \note atom_fields and ans_fields should be set for correct answer **/ - int bytes_per_atom() const; - - /// Must be called once to allocate host and device memory - /** \note atom_fields and ans_fields should be set first if not default **/ - void init(const int max_atoms); - - /// Free all memory on host and device - void clear(); - - /// Return the total amount of host memory used by class - double host_memory_usage(const int max_atoms) const; - - - // -------------------------COPY TO GPU ---------------------------------- - - /// Reset the write buffer pointer (Start copying new atom data) - inline void reset_write_buffer() { _write_loc=host_write.begin(); } - - /// Add a row to write buffer with unit stride - /** Copies nall() elements **/ - template - inline void add_atom_data(const cpytyp *host_ptr) - { for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i]; _write_loc++; } } - - /// Add a row to write buffer with non-unit stride - /** Copies nall() elements **/ - template - inline void add_atom_data(const cpytyp *hostptr, const int stride) { - int t=_nall*stride; - for (int i=0; i ans; - - // Buffer for moving floating point data to GPU - NVC_HostT host_write; - // Buffer for moving floating point data to CPU - NVC_Host host_read; - - // Timing Stuff - NVCTimer time_atom, time_answer; - - private: - bool allocated, _eflag, _vflag; - int _atom_fields, _ans_fields; - int _max_atoms, _nall, _inum; - numtyp * _write_loc; - acctyp * _read_loc; -}; - -#endif diff --git a/lib/gpu/pair_gpu_nbor.cu b/lib/gpu/pair_gpu_nbor.cu deleted file mode 100644 index e3fc2b28d8..0000000000 --- a/lib/gpu/pair_gpu_nbor.cu +++ /dev/null @@ -1,80 +0,0 @@ -/*************************************************************************** - pair_gpu_nbor.cu - ------------------- - W. Michael Brown - - Neighbor memory operations for LAMMPS GPU Library - - __________________________________________________________________________ - This file is part of the LAMMPS GPU Library - __________________________________________________________________________ - - begin : Tue Aug 4 2009 - copyright : (C) 2009 by W. Michael Brown - email : wmbrown@sandia.gov - ***************************************************************************/ - -/* ----------------------------------------------------------------------- - Copyright (2009) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. 
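A note on the answer layout that copy_answers(), energy_virial(), and add_forces() above walk: the device stores one field per row with a pitch of ans.row_size(), writing the energy row first (only if eflag), then the six virial rows (only if vflag), then fx, fy, fz. The helper below is only a sketch of that indexing for reading a force component out of the copied-back host buffer; force_x(), ans_host, and pitch stand for host_read.begin() and ans.row_size(), and ACC_PRECISION is taken as double, its default.

// Sketch: locate atom ii's x-force in the buffer filled by copy_answers().
// Row order: [energy if eflag][6 virial rows if vflag][fx][fy][fz].
inline double force_x(const double *ans_host, const size_t pitch, const int ii,
                      const bool eflag, const bool vflag) {
  size_t row = 0;
  if (eflag) row += 1;     // skip the energy row
  if (vflag) row += 6;     // skip the six virial rows
  return ans_host[row * pitch + ii];
}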
- ----------------------------------------------------------------------- */ - -#include "pair_gpu_nbor.h" - -int PairGPUNbor::bytes_per_atom(const int max_nbors) const { - if (_use_packing) - return (max_nbors*2+4)*sizeof(int); - else - return (max_nbors+3)*sizeof(int); -} - -void PairGPUNbor::init(const int ij_size, const int max_atoms, - const int max_nbors) { - if (allocated) - clear(); - - // Initialize timers for the selected GPU - time_nbor.init(); - - if (_use_packing) - dev_nbor.safe_alloc(max_nbors+4,max_atoms); - else - dev_nbor.safe_alloc(3,max_atoms); - - ij.safe_alloc(max_nbors*max_atoms); - host_ij.safe_alloc_w(ij_size); - - allocated=true; -} - -void PairGPUNbor::clear() { - if (!allocated) - return; - allocated=false; - - ij.clear(); - host_ij.clear(); - dev_nbor.clear(); -} - -double PairGPUNbor::host_memory_usage() const { - return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor); -} - -void PairGPUNbor::reset(const int inum, int *ilist, const int *numj, - cudaStream_t &s) { - ij_total=0; - - dev_nbor.copy_from_host(ilist,inum); - int acc=0; - for (int i=0; i class cu_vec_traits; -template <> class cu_vec_traits { public: typedef float2 vec2; }; -template <> class cu_vec_traits { public: typedef double2 vec2; }; - -// ------------------------------- x ------------------------------------ - -static texture x_float_tex; -static texture x_double_tex; -template inline void x_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(x_float_tex); } - -template <> inline void x_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(x_double_tex); } -template inline void x_unbind_texture() - { cudaUnbindTexture(x_float_tex); } -template <> inline void x_unbind_texture() - { cudaUnbindTexture(x_double_tex); } -template -static __inline__ __device__ numtyp _x_(const int i, const int j) { - return tex2D(x_float_tex,i,j); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _x_(const int i,const int j) { - int2 t=tex2D(x_double_tex,i,j); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- form ------------------------------------ - -static texture form_tex; -inline void form_bind_texture(NVC_ConstMatI &mat) - { mat.bind_texture(form_tex); } -inline void form_unbind_texture() - { cudaUnbindTexture(form_tex); } -static __inline__ __device__ int _form_(const int i, const int j) { - return tex2D(form_tex,i,j); -} - -// ------------------------------- lshape ------------------------------------ - -static texture lshape_float_tex; -static texture lshape_double_tex; -static cudaChannelFormatDesc channel_lshape; -template inline void lshape_bind_texture(NVC_VecT &vec) - { vec.bind_texture(lshape_float_tex,channel_lshape); } -template <> inline void lshape_bind_texture(NVC_VecD &vec) - { vec.bind_texture(lshape_double_tex,channel_lshape); } -template inline void lshape_unbind_texture() - { cudaUnbindTexture(lshape_float_tex); } -template <> inline void lshape_unbind_texture() - { cudaUnbindTexture(lshape_double_tex); } -template -static __inline__ __device__ numtyp _lshape_(const int i) - { return tex1Dfetch(lshape_float_tex,i); } -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _lshape_(const int i) { - int2 t=tex1Dfetch(lshape_double_tex,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- shape ------------------------------------ - -static texture shape_float_tex; -static texture shape_double_tex; -template inline void shape_bind_texture(NVC_ConstMatT &mat) - { 
mat.bind_texture(shape_float_tex); } -template <> inline void shape_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(shape_double_tex); } -template inline void shape_unbind_texture() - { cudaUnbindTexture(shape_float_tex); } -template <> inline void shape_unbind_texture() - { cudaUnbindTexture(shape_double_tex); } -template -static __inline__ __device__ numtyp _shape_(const int i, const int j) { - return tex2D(shape_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _shape_(const int i, const int j) { - int2 t=tex2D(shape_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- well ------------------------------------ - -static texture well_float_tex; -static texture well_double_tex; -template inline void well_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(well_float_tex); } -template <> inline void well_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(well_double_tex); } -template inline void well_unbind_texture() - { cudaUnbindTexture(well_float_tex); } -template <> inline void well_unbind_texture() - { cudaUnbindTexture(well_double_tex); } -template -static __inline__ __device__ numtyp _well_(const int i, const int j) - { return tex2D(well_float_tex,j,i); } -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _well_(const int i,const int j) { - int2 t=tex2D(well_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- sigma ------------------------------------ - -static texture sigma_float_tex; -static texture sigma_double_tex; -template inline void sigma_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(sigma_float_tex); } -template <> inline void sigma_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(sigma_double_tex); } -template inline void sigma_unbind_texture() - { cudaUnbindTexture(sigma_float_tex); } -template <> inline void sigma_unbind_texture() - { cudaUnbindTexture(sigma_double_tex); } -template -static __inline__ __device__ numtyp _sigma_(const int i, const int j) { - return tex2D(sigma_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _sigma_(const int i,const int j) { - int2 t=tex2D(sigma_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- epsilon ------------------------------------ - -static texture epsilon_float_tex; -static texture epsilon_double_tex; -template inline void epsilon_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(epsilon_float_tex); } -template <> inline void epsilon_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(epsilon_double_tex); } -template inline void epsilon_unbind_texture() - { cudaUnbindTexture(epsilon_float_tex); } -template <> inline void epsilon_unbind_texture() - { cudaUnbindTexture(epsilon_double_tex); } -template -static __inline__ __device__ numtyp _epsilon_(const int i, const int j) { - return tex2D(epsilon_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _epsilon_(const int i,const int j) { - int2 t=tex2D(epsilon_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- cutsq ------------------------------------ - -static texture cutsq_float_tex; -static texture cutsq_double_tex; -template inline void cutsq_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(cutsq_float_tex); } -template <> inline void cutsq_bind_texture(NVC_ConstMatD &mat) - { 
mat.bind_texture(cutsq_double_tex); } -template inline void cutsq_unbind_texture() - { cudaUnbindTexture(cutsq_float_tex); } -template <> inline void cutsq_unbind_texture() - { cudaUnbindTexture(cutsq_double_tex); } -template -static __inline__ __device__ numtyp _cutsq_(const int i, const int j) { - return tex2D(cutsq_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _cutsq_(const int i,const int j) { - int2 t=tex2D(cutsq_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -// ------------------------------- lj1 ------------------------------------ - -static texture lj1_float_tex; -static texture lj1_double_tex; -template inline void lj1_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(lj1_float_tex); } -template <> inline void lj1_bind_texture(NVC_ConstMatD2 &mat) - { mat.bind_texture(lj1_double_tex); } -template inline void lj1_unbind_texture() - { cudaUnbindTexture(lj1_float_tex); } -template <> inline void lj1_unbind_texture() - { cudaUnbindTexture(lj1_double_tex); } -template -static __inline__ __device__ -typename cu_vec_traits::vec2 _lj1_(const int i, const int j) { - return tex2D(lj1_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double2 _lj1_(const int i,const int j) { - int4 t=tex2D(lj1_double_tex,j,i); - double2 ans; - ans.x=__hiloint2double(t.y, t.x); - ans.y=__hiloint2double(t.w, t.z); - return ans; -} -#endif - -// ------------------------------- lj3 ------------------------------------ - -static texture lj3_float_tex; -static texture lj3_double_tex; -template inline void lj3_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(lj3_float_tex); } -template <> inline void lj3_bind_texture(NVC_ConstMatD2 &mat) - { mat.bind_texture(lj3_double_tex); } -template inline void lj3_unbind_texture() - { cudaUnbindTexture(lj3_float_tex); } -template <> inline void lj3_unbind_texture() - { cudaUnbindTexture(lj3_double_tex); } -template -static __inline__ __device__ -typename cu_vec_traits::vec2 _lj3_(const int i, const int j) { - return tex2D(lj3_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double2 _lj3_(const int i,const int j) { - int4 t=tex2D(lj3_double_tex,j,i); - double2 ans; - ans.x=__hiloint2double(t.y, t.x); - ans.y=__hiloint2double(t.w, t.z); - return ans; -} -#endif - -// ------------------------------- offset ------------------------------------ - -static texture offset_float_tex; -static texture offset_double_tex; -template inline void offset_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(offset_float_tex); } -template <> inline void offset_bind_texture(NVC_ConstMatD &mat) - { mat.bind_texture(offset_double_tex); } -template inline void offset_unbind_texture() - { cudaUnbindTexture(offset_float_tex); } -template <> inline void offset_unbind_texture() - { cudaUnbindTexture(offset_double_tex); } -template -static __inline__ __device__ numtyp _offset_(const int i, const int j) { - return tex2D(offset_float_tex,j,i); -} -#ifdef GB_GPU_DOUBLE -template <> -static __inline__ __device__ double _offset_(const int i,const int j) { - int2 t=tex2D(offset_double_tex,j,i); - return __hiloint2double(t.y, t.x); -} -#endif - -#endif diff --git a/src/GPU/Install.csh b/src/GPU/Install.csh deleted file mode 100644 index d5d521911f..0000000000 --- a/src/GPU/Install.csh +++ /dev/null @@ -1,34 +0,0 @@ -# Install/unInstall package classes in LAMMPS -# edit Makefile.package to include/exclude GPU library - -if ($1 == 1) then - -# sed -i 's/\S*gpu //' 
../Makefile.package -# sed -i 's|^PKGPATH =\s*|&-L../../lib/gpu |' ../Makefile.package -# sed -i 's|^PKGLIB =\s*|&-lpair_gpu_lib |' ../Makefile.package - - cp -f pair_lj_cut_gpu.h ../ - cp -f pair_lj_cut_gpu.cpp ../ - - if (-e ../pair_gayberne.cpp) then - cp -f style_gpu.h ../ - cp -f pair_gayberne_gpu.h ../ - cp -f pair_gayberne_gpu.cpp ../ - else - grep -v erne style_gpu.h > ../style_gpu.h - endif - -else if ($1 == 0) then - -# sed -i 's/\S*gpu //' ../Makefile.package - - rm -f ../style_gpu.h - touch ../style_gpu.h - - rm -f ../pair_lj_cut_gpu.h - rm -f ../pair_lj_cut_gpu.cpp - - rm -f ../pair_gayberne_gpu.h - rm -f ../pair_gayberne_gpu.cpp - -endif diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp deleted file mode 100644 index 23ddff98dd..0000000000 --- a/src/GPU/pair_gayberne_gpu.cpp +++ /dev/null @@ -1,369 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - cetain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing author: Mike Brown (SNL) -------------------------------------------------------------------------- */ - -#include "math.h" -#include "stdio.h" -#include "stdlib.h" -#include "string.h" -#include "pair_gayberne_gpu.h" -#include "math_extra.h" -#include "atom.h" -#include "atom_vec.h" -#include "comm.h" -#include "force.h" -#include "neighbor.h" -#include "neigh_list.h" -#include "integrate.h" -#include "memory.h" -#include "error.h" -#include "neigh_request.h" -#include "universe.h" - -#ifdef GB_GPU_OMP -#include "omp.h" -#endif - -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) - -// External functions from cuda library for atom decomposition -int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int max_nbors, const int thread, const int gpu_id); -void gb_gpu_clear(const int thread); -int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, - int *ilist, const int *numj, const int *type, - const int thread, bool &success); -void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread); -void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type, - const bool rebuild, const int thread); -void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, - const int thread); -double gb_gpu_forces(double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial, const int thread); -std::string gb_gpu_name(const int i, const int max_nbors); -void gb_gpu_time(const int thread); -int gb_gpu_num_devices(); -double gb_gpu_bytes(); - -using namespace LAMMPS_NS; - -/* ---------------------------------------------------------------------- */ - -PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0), - omp_chunk(0), nthreads(1), - multi_gpu_mode(ONE_NODE), - multi_gpu_param(0) -{ - ij_new[0]=NULL; -} - -/* ---------------------------------------------------------------------- - free all arrays -------------------------------------------------------------------------- */ - -PairGayBerneGPU::~PairGayBerneGPU() -{ - if (comm->me == 0 && screen) { - printf("\n\n-------------------------------------"); - printf("--------------------------------\n"); - printf(" GPU Time Stamps: "); - printf("\n-------------------------------------"); - printf("--------------------------------\n"); - gb_gpu_time(my_thread); - std::cout << "Procs: " << universe->nprocs << std::endl; - printf("-------------------------------------"); - printf("--------------------------------\n\n"); - } - #pragma omp parallel - { - #ifdef GB_GPU_OMP - int my_thread=omp_get_thread_num(); - #endif - gb_gpu_clear(my_thread); - if (ij_new[my_thread]!=NULL) { - ij_new[my_thread]=NULL; - delete [] ij_new[my_thread]; - } - } -} - -/* ---------------------------------------------------------------------- */ - -void PairGayBerneGPU::compute(int eflag, int vflag) -{ - if (eflag || vflag) ev_setup(eflag,vflag); - else evflag = vflag_fdotr = 0; - if (vflag_atom) - error->all("Per-atom virial not available with GPU Gay-Berne."); - - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int inum = list->inum; - - bool rebuild=false; - if (neighbor->ncalls > last_neighbor) { - last_neighbor=neighbor->ncalls; - rebuild=true; - } - - #pragma omp parallel - { - - bool success=true; - #ifdef GB_GPU_OMP - int my_thread=omp_get_thread_num(); - if (rebuild) { - omp_chunk=static_cast(ceil(static_cast(inum)/nthreads)); - if (my_thread==nthreads-1) - thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk; - else - thread_inum[my_thread]=omp_chunk; - olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, - thread_inum[my_thread], - list->ilist+omp_chunk*my_thread, - list->numneigh, atom->type, my_thread, - success); - } - #else - if 
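In the OpenMP branch above the local neighbor list of inum atoms is divided into equal chunks, one GPU thread per chunk, with the last thread absorbing the remainder. The same split in isolation (illustrative helper, not part of the library):

#include <cmath>

// Number of list entries handled by thread 'tid' out of 'nthreads'.
// Example: inum=10, nthreads=4 -> chunk=3, threads 0-2 get 3, thread 3 gets 1.
static int chunk_count(const int inum, const int nthreads, const int tid) {
  int chunk = static_cast<int>(std::ceil(static_cast<double>(inum) / nthreads));
  if (tid == nthreads - 1)
    return inum - (nthreads - 1) * chunk;
  return chunk;
}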
(rebuild) - olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist, - list->numneigh, atom->type, my_thread, - success); - #endif - if (!success) - error->one("Total # of atoms exceeds maximum allowed per GPGPU.\n"); - - // copy atom data to GPU - gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread); - - int i,j,ii,jj,jnum; - double factor_lj; - int *jlist; - - if (rebuild==true) { - int num_ij = 0; - - // loop over neighbors of my atoms - int *ijp=ij_new[my_thread]; - #ifdef GB_GPU_OMP - int mgo=my_thread*omp_chunk; - int mgot=mgo+thread_inum[my_thread]; - #else - int mgo=0, mgot=inum; - #endif - for (ii = mgo; iifirstneigh[i]; - jnum = list->numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - - *ijp=j; - ijp++; - num_ij++; - - if (num_ij==ij_size) { - memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int)); - gb_gpu_nbors(num_ij,eflag,my_thread); - ijp=ij_new[my_thread]; - num_ij=0; - } - } - } - if (num_ij>0) { - memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int)); - gb_gpu_nbors(num_ij,eflag,my_thread); - } - } - - gb_gpu_gayberne(eflag,vflag,rebuild,my_thread); - double lvirial[6]; - for (int i=0; i<6; i++) lvirial[i]=0.0; - double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag, - eflag_atom, vflag_atom, eatom, vatom, lvirial, - my_thread); - #pragma omp critical - { - eng_vdwl+=my_eng; - virial[0]+=lvirial[0]; - virial[1]+=lvirial[1]; - virial[2]+=lvirial[2]; - virial[3]+=lvirial[3]; - virial[4]+=lvirial[4]; - virial[5]+=lvirial[5]; - } - - } //End parallel - - if (vflag_fdotr) virial_compute(); -} - -/* ---------------------------------------------------------------------- - global settings -------------------------------------------------------------------------- */ - -void PairGayBerneGPU::settings(int narg, char **arg) -{ - if (narg != 4 && narg != 6) error->all("Illegal pair_style command"); - - // Set multi_gpu_mode to one_node for multiple gpus on 1 node - // -- param is starting gpu id - // Set multi_gpu_mode to one_gpu to select the same gpu id on every node - // -- param is id of gpu - // Set multi_gpu_mode to multi_gpu to get ma - // -- param is number of gpus per node - multi_gpu_mode=ONE_NODE; - multi_gpu_param=0; - if (narg==6) { - multi_gpu_param=atoi(arg[5]); - if (strcmp("one_node",arg[4])==0) - multi_gpu_mode=ONE_NODE; - else if (strcmp("one_gpu",arg[4])==0) - multi_gpu_mode=ONE_GPU; - else if (strcmp("multi_gpu",arg[4])==0) { - multi_gpu_mode=MULTI_GPU; - if (multi_gpu_param<1) - error->all("Illegal pair_style command"); - } else - error->all("Illegal pair_style command"); - narg-=2; - } - - PairGayBerne::settings(narg,arg); -} - -/* ---------------------------------------------------------------------- - init specific to this pair style -------------------------------------------------------------------------- */ - -void PairGayBerneGPU::init_style() -{ - // Set the GPU ID - int my_gpu=comm->me+multi_gpu_param; - int ngpus=universe->nprocs; - if (multi_gpu_mode==ONE_GPU) { - my_gpu=multi_gpu_param; - ngpus=1; - } else if (multi_gpu_mode==MULTI_GPU) { - ngpus=multi_gpu_param; - my_gpu=comm->me%ngpus; - if (ngpus>universe->nprocs) - ngpus=universe->nprocs; - } - - if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type) - error->all("Pair gayberne requires atom attributes quat, torque, shape"); - if (atom->radius_flag) - error->all("Pair gayberne cannot be used with atom attribute diameter"); - - int irequest = neighbor->request(this); - - // per-type shape precalculations - - 
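The rebuild branch above streams neighbor indices to the GPU through a fixed-size staging array: indices accumulate in ij_new and are shipped with memcpy plus gb_gpu_nbors whenever ij_size entries are reached, then once more for the remainder. A simplified sketch of that pattern (flush_to_gpu stands in for the library call; names are illustrative):

// Append one atom's neighbors into a staging buffer of capacity 'cap',
// flushing to the device whenever the buffer fills.
void pack_neighbors(const int *jlist, const int jnum, int *staging,
                    const int cap, int &count, void (*flush_to_gpu)(int)) {
  for (int jj = 0; jj < jnum; jj++) {
    staging[count++] = jlist[jj];
    if (count == cap) {          // buffer full: copy to device, start over
      flush_to_gpu(count);
      count = 0;
    }
  }
}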
for (int i = 1; i <= atom->ntypes; i++) { - if (setwell[i]) { - double *one = atom->shape[i]; - shape[i][0] = one[0]*one[0]; - shape[i][1] = one[1]*one[1]; - shape[i][2] = one[2]*one[2]; - lshape[i] = (one[0]*one[1]+one[2]*one[2])*sqrt(one[0]*one[1]); - } - } - - // Repeat cutsq calculation because done after call to init_style - double cut; - for (int i = 1; i <= atom->ntypes; i++) - for (int j = i; j <= atom->ntypes; j++) { - cut = init_one(i,j); - cutsq[i][j] = cutsq[j][i] = cut*cut; - } - - // If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads - #ifdef GB_GPU_OMP - if (multi_gpu_mode!=ONE_GPU) - nthreads=ngpus=gb_gpu_num_devices(); - else - nthreads=ngpus=1; - if (nthreads>MAX_GPU_THREADS) - nthreads=MAX_GPU_THREADS; - omp_set_num_threads(nthreads); - #endif - - #pragma omp parallel firstprivate(my_gpu) - { - #ifdef GB_GPU_OMP - int my_thread = omp_get_thread_num(); - if (multi_gpu_mode!=ONE_GPU) - my_gpu=my_thread; - if (multi_gpu_mode==ONE_NODE) - my_gpu+=multi_gpu_param; - #endif - - ij[my_thread]=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu, - shape, well, cutsq, sigma, epsilon, lshape, form, - lj1, lj2, lj3, lj4, offset, force->special_lj, - neighbor->oneatom, my_thread, my_gpu); - if (ij[my_thread]==0) - error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU."); - - if (ij_new[my_thread]!=NULL) - delete [] ij_new[my_thread]; - ij_new[my_thread]=new int[ij_size]; - } - - last_neighbor = -1; - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; - if (force->newton_pair) - error->all("Cannot use newton with GPU GayBerne pair style."); - - if (comm->me == 0 && screen) { - printf("\n-------------------------------------"); - printf("-------------------------------------\n"); - printf("- Using GPGPU acceleration for Gay-Berne:\n"); - printf("-------------------------------------"); - printf("-------------------------------------\n"); - - for (int i=0; ioneatom); - printf("GPU %d: %s\n",gpui,gpu_string.c_str()); - } - printf("-------------------------------------"); - printf("-------------------------------------\n\n"); - } -} - -/* ---------------------------------------------------------------------- */ - -double PairGayBerneGPU::memory_usage() -{ - double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int); - return bytes+gb_gpu_bytes(); -} diff --git a/src/GPU/pair_gayberne_gpu.h b/src/GPU/pair_gayberne_gpu.h deleted file mode 100644 index 7be34c3531..0000000000 --- a/src/GPU/pair_gayberne_gpu.h +++ /dev/null @@ -1,43 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
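To make the per-type precalculation in init_style above concrete: a type with semi-axes (1, 1, 3) gets shape = (1, 1, 9) and, by the formula shown, lshape = (1*1 + 3*3)*sqrt(1*1) = 10; these are the squared axes and shape scalar handed to the Gay-Berne GPU kernels via gb_gpu_init.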
-------------------------------------------------------------------------- */ - -#ifndef PAIR_GPU_H -#define PAIR_GPU_H - -#include "pair_gayberne.h" -#define MAX_GPU_THREADS 4 - -namespace LAMMPS_NS { - -class PairGayBerneGPU : public PairGayBerne { - public: - PairGayBerneGPU(LAMMPS *lmp); - ~PairGayBerneGPU(); - void compute(int, int); - void settings(int, char **); - void init_style(); - double memory_usage(); - - enum { ONE_NODE, ONE_GPU, MULTI_GPU }; - - private: - int ij_size; - int *ij[MAX_GPU_THREADS], *ij_new[MAX_GPU_THREADS], *olist[MAX_GPU_THREADS]; - - int my_thread, nthreads, thread_inum[MAX_GPU_THREADS], omp_chunk; - - int last_neighbor, multi_gpu_mode, multi_gpu_param; -}; - -} -#endif diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp deleted file mode 100644 index d82f99e87d..0000000000 --- a/src/GPU/pair_lj_cut_gpu.cpp +++ /dev/null @@ -1,270 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - cetain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing author: Mike Brown (SNL) -------------------------------------------------------------------------- */ - -#include "math.h" -#include "stdio.h" -#include "stdlib.h" -#include "string.h" -#include "pair_lj_cut_gpu.h" -#include "math_extra.h" -#include "atom.h" -#include "atom_vec.h" -#include "comm.h" -#include "force.h" -#include "neighbor.h" -#include "neigh_list.h" -#include "integrate.h" -#include "memory.h" -#include "error.h" -#include "neigh_request.h" -#include "universe.h" - -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) - -// External functions from cuda library for force decomposition -int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq, - double **sigma, double **epsilon, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int max_nbors, - const int gpu_id); -void lj_gpu_clear(); -bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist, - const int *numj); -void lj_gpu_nbors(const int num_ij); -void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild); -void lj_gpu(const bool eflag, const bool vflag, const bool rebuild); -double lj_gpu_forces(double **f, const int *ilist, const bool eflag, - const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial); -std::string lj_gpu_name(const int gpu_id, const int max_nbors); -void lj_gpu_time(); -int lj_gpu_num_devices(); -double lj_gpu_bytes(); - -using namespace LAMMPS_NS; - -/* ---------------------------------------------------------------------- */ - -PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0) -{ - ij_new=NULL; - respa_enable = 0; -} - -/* ---------------------------------------------------------------------- - free all arrays -------------------------------------------------------------------------- */ - -PairLJCutGPU::~PairLJCutGPU() -{ - if (comm->me == 0 && screen) { - printf("\n\n-------------------------------------"); - printf("--------------------------------\n"); - printf(" GPU Time Stamps: "); - printf("\n-------------------------------------"); - printf("--------------------------------\n"); - lj_gpu_time(); - std::cout << "Procs: " << universe->nprocs << std::endl; - printf("-------------------------------------"); - printf("--------------------------------\n\n"); - } - lj_gpu_clear(); - if (ij_new!=NULL) { - ij_new=NULL; - delete [] ij_new; - } -} - -/* ---------------------------------------------------------------------- */ - -void PairLJCutGPU::compute(int eflag, int vflag) -{ - if (eflag || vflag) ev_setup(eflag,vflag); - else evflag = vflag_fdotr = 0; - if (vflag_atom) - error->all("Per-atom virial not available with GPU Gay-Berne."); - - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int inum = list->inum; - int *ilist = list->ilist; - - bool rebuild=false; - if (neighbor->ncalls > last_neighbor) { - last_neighbor=neighbor->ncalls; - rebuild=true; - } - - // copy nbors to GPU - if (rebuild) - if (!lj_gpu_reset_nbors(nall, inum, ilist, list->numneigh)) - error->one("Total # of atoms exceed maximum allowed per GPGPU.\n"); - - // copy atom data to GPU - lj_gpu_atom(atom->x,atom->type,rebuild); - - int i,j,ii,jj,jnum; - double factor_lj; - int *jlist; - - if (rebuild==true) { - int num_ij = 0; - - // loop over neighbors of my atoms - int *ijp=ij_new; - for (ii = 0; iifirstneigh[i]; - jnum = list->numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - - *ijp=j; - ijp++; - num_ij++; - - if (num_ij==ij_size) { - memcpy(ij,ij_new,num_ij*sizeof(int)); - lj_gpu_nbors(num_ij); - ijp=ij_new; - num_ij=0; - } - } - } - if (num_ij>0) { - memcpy(ij,ij_new,num_ij*sizeof(int)); - lj_gpu_nbors(num_ij); - } - } - - lj_gpu(eflag,vflag,rebuild); - eng_vdwl=lj_gpu_forces(atom->f,ilist,eflag,vflag, eflag_atom, vflag_atom, - eatom, vatom, virial); - - if (vflag_fdotr) virial_compute(); -} - -/* ---------------------------------------------------------------------- - global settings 
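Both GPU pair styles use the same rebuild test seen above: cache neighbor->ncalls and treat any increase as a signal that the neighbor list changed and must be re-sent to the device. In isolation:

// Returns true exactly once per neighbor-list rebuild (illustrative helper).
static bool neighbor_rebuilt(const int ncalls, int &last_neighbor) {
  if (ncalls > last_neighbor) {
    last_neighbor = ncalls;
    return true;
  }
  return false;
}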
-------------------------------------------------------------------------- */ - -void PairLJCutGPU::settings(int narg, char **arg) -{ - if (narg != 1 && narg != 3) error->all("Illegal pair_style command"); - - // Set multi_gpu_mode to one_node for multiple gpus on 1 node - // -- param is starting gpu id - // Set multi_gpu_mode to one_gpu to select the same gpu id on every node - // -- param is id of gpu - // Set multi_gpu_mode to multi_gpu to get ma - // -- param is number of gpus per node - multi_gpu_mode=ONE_NODE; - multi_gpu_param=0; - if (narg==3) { - multi_gpu_param=atoi(arg[2]); - if (strcmp("one_node",arg[1])==0) - multi_gpu_mode=ONE_NODE; - else if (strcmp("one_gpu",arg[1])==0) - multi_gpu_mode=ONE_GPU; - else if (strcmp("multi_gpu",arg[1])==0) { - multi_gpu_mode=MULTI_GPU; - if (multi_gpu_param<1) - error->all("Illegal pair_style command"); - } else - error->all("Illegal pair_style command"); - narg-=2; - } - - PairLJCut::settings(narg,arg); -} - -/* ---------------------------------------------------------------------- - init specific to this pair style -------------------------------------------------------------------------- */ - -void PairLJCutGPU::init_style() -{ - // Set the GPU ID - int my_gpu=comm->me+multi_gpu_param; - int ngpus=universe->nprocs; - if (multi_gpu_mode==ONE_GPU) { - my_gpu=multi_gpu_param; - ngpus=1; - } else if (multi_gpu_mode==MULTI_GPU) { - ngpus=multi_gpu_param; - my_gpu=comm->me%ngpus; - if (ngpus>universe->nprocs) - ngpus=universe->nprocs; - } - - int irequest = neighbor->request(this); - cut_respa=NULL; - - // Repeat cutsq calculation because done after call to init_style - double cut; - for (int i = 1; i <= atom->ntypes; i++) - for (int j = i; j <= atom->ntypes; j++) { - cut = init_one(i,j); - cutsq[i][j] = cutsq[j][i] = cut*cut; - } - - ij=lj_gpu_init(ij_size, atom->ntypes+1, cutsq, sigma, epsilon, lj1, lj2, lj3, - lj4, offset, force->special_lj, neighbor->oneatom, my_gpu); - if (ij==0) - error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU."); - - if (ij_new!=NULL) - delete [] ij_new; - ij_new=new int[ij_size]; - - last_neighbor = -1; - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; - if (force->newton_pair) - error->all("Cannot use newton with GPU LJCut pair style."); - - if (comm->me == 0 && screen) { - printf("\n-------------------------------------"); - printf("-------------------------------------\n"); - printf("- Using GPGPU acceleration for LJ-Cut:\n"); - printf("-------------------------------------"); - printf("-------------------------------------\n"); - - for (int i=0; ioneatom); - printf("GPU %d: %s\n",gpui,gpu_string.c_str()); - } - printf("-------------------------------------"); - printf("-------------------------------------\n\n"); - } -} - -/* ---------------------------------------------------------------------- */ - -double PairLJCutGPU::memory_usage() -{ - double bytes=Pair::memory_usage()+ij_size*sizeof(int); - return bytes+lj_gpu_bytes(); -} diff --git a/src/GPU/pair_lj_cut_gpu.h b/src/GPU/pair_lj_cut_gpu.h deleted file mode 100644 index 2774e723ed..0000000000 --- a/src/GPU/pair_lj_cut_gpu.h +++ /dev/null @@ -1,40 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. 
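The device selection at the top of init_style above maps an MPI rank to a GPU according to the pair_style mode: one_node offsets the rank by a starting id, one_gpu pins every rank to one device, and multi_gpu round-robins ranks over a per-node GPU count. A condensed sketch (the enum matches the one in the class headers; the clamp of ngpus used only for the startup report is omitted):

enum { ONE_NODE, ONE_GPU, MULTI_GPU };

// me = MPI rank, param = trailing pair_style argument
static int select_gpu(const int mode, const int me, const int param) {
  if (mode == ONE_GPU)   return param;        // same device id on every node
  if (mode == MULTI_GPU) return me % param;   // round-robin over param GPUs
  return me + param;                          // ONE_NODE: rank plus starting id
}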
Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -#ifndef PAIR_LJ_CUT_GPU_H -#define PAIR_LJ_CUT_GPU_H - -#include "pair_lj_cut.h" - -namespace LAMMPS_NS { - -class PairLJCutGPU : public PairLJCut { - public: - PairLJCutGPU(LAMMPS *lmp); - ~PairLJCutGPU(); - void compute(int, int); - void settings(int, char **); - void init_style(); - double memory_usage(); - - enum { ONE_NODE, ONE_GPU, MULTI_GPU }; - - private: - int ij_size; - int *ij, *ij_new; - - int last_neighbor, multi_gpu_mode, multi_gpu_param; -}; - -} -#endif diff --git a/src/GPU/style_gpu.h b/src/GPU/style_gpu.h deleted file mode 100644 index 1be999ccdb..0000000000 --- a/src/GPU/style_gpu.h +++ /dev/null @@ -1,41 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -#ifdef AtomInclude -#endif - -#ifdef AtomClass -# endif - -#ifdef ComputeInclude -#endif - -#ifdef ComputeClass -#endif - -#ifdef FixInclude -#endif - -#ifdef FixClass -#endif - -#ifdef PairInclude -#include "pair_lj_cut_gpu.h" -#include "pair_gayberne_gpu.h" -#endif - -#ifdef PairClass -PairStyle(lj/cut/gpu,PairLJCutGPU) -PairStyle(gayberne/gpu,PairGayBerneGPU) -#endif -
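style_gpu.h follows the usual LAMMPS style-header layout: package headers are pulled in under PairInclude and mapped to style names under PairClass. A further GPU pair style would be registered the same way, for example (hypothetical style, shown only to illustrate the convention):

#ifdef PairInclude
#include "pair_morse_gpu.h"          // hypothetical new GPU pair style
#endif

#ifdef PairClass
PairStyle(morse/gpu,PairMorseGPU)    // exposes "pair_style morse/gpu"
#endif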