diff --git a/src/USER-CUDA/Install.sh b/src/USER-CUDA/Install.sh
index 3dc143471f..56f0dc80b9 100755
--- a/src/USER-CUDA/Install.sh
+++ b/src/USER-CUDA/Install.sh
@@ -82,6 +82,12 @@ if (test $1 = 1) then
     cp pair_eam_alloy_cuda.h ..
     cp pair_eam_cuda.h ..
     cp pair_eam_fs_cuda.h ..
+    cp pair_sw_cuda.h ..
+    cp pair_sw_cuda.cpp ..
+    cp pair_tersoff_cuda.h ..
+    cp pair_tersoff_cuda.cpp ..
+    cp pair_tersoff_zbl_cuda.h ..
+    cp pair_tersoff_zbl_cuda.cpp ..
   fi
 
   if (test -e ../pair_gran_hooke.cpp) then
@@ -193,12 +199,9 @@ if (test $1 = 1) then
   cp verlet_cuda.h ..
 
   cp cuda.h ..
-  cp cuda_common.h ..
   cp cuda_data.h ..
   cp cuda_modify_flags.h ..
   cp cuda_neigh_list.h ..
-  cp cuda_precision.h ..
-  cp cuda_shared.h ..
 
 elif (test $1 = 0) then
 
@@ -341,12 +344,15 @@ elif (test $1 = 0) then
   rm -f ../pppm_cuda.h
   rm -f ../verlet_cuda.h
 
+  rm -f ../pair_sw_cuda.h 
+  rm -f ../pair_sw_cuda.cpp
+  rm -f ../pair_tersoff_cuda.h 
+  rm -f ../pair_tersoff_cuda.cpp 
+  rm -f ../pair_tersoff_zbl_cuda.h 
+  rm -f ../pair_tersoff_zbl_cuda.cpp   
+  
   rm -f ../cuda.h
-  rm -f ../cuda_common.h
   rm -f ../cuda_data.h
   rm -f ../cuda_modify_flags.h
   rm -f ../cuda_neigh_list.h
-  rm -f ../cuda_precision.h
-  rm -f ../cuda_shared.h
-
 fi
diff --git a/src/USER-CUDA/comm_cuda.cpp b/src/USER-CUDA/comm_cuda.cpp
index 8e75f93ba5..ea4a4ee6a6 100644
--- a/src/USER-CUDA/comm_cuda.cpp
+++ b/src/USER-CUDA/comm_cuda.cpp
@@ -41,6 +41,9 @@ using namespace LAMMPS_NS;
 #define BUFFACTOR 1.5
 #define BUFMIN 1000
 #define BUFEXTRA 1000
+
+
+
 #define BIG 1.0e20
 
 enum{SINGLE,MULTI};
@@ -137,6 +140,7 @@ void CommCuda::init()
 
 void CommCuda::setup()
 {
+  if(cuda->shared_data.pair.neighall) cutghostuser = MAX(2.0*neighbor->cutneighmax,cutghostuser);
   Comm::setup();
    
   //upload changed geometry to device
diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp
index 39261bd7c0..819357bc16 100644
--- a/src/USER-CUDA/cuda.cpp
+++ b/src/USER-CUDA/cuda.cpp
@@ -46,6 +46,8 @@
 
 using namespace LAMMPS_NS;
 
+
+
 Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp)
 {
 	cuda_exists=true;
@@ -309,6 +311,7 @@ void Cuda::setSharedDataZero()
 	shared_data.pair.special_lj = 0;
 	shared_data.pair.special_coul = 0;
 	
+	shared_data.pair.neighall = false;
 	
 	shared_data.pppm.cudable_force = 0;
 	
diff --git a/src/USER-CUDA/cuda_common.h b/src/USER-CUDA/cuda_common.h
deleted file mode 100644
index d4687ebd06..0000000000
--- a/src/USER-CUDA/cuda_common.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov 
-
-   See the README file in the top-level LAMMPS directory. 
-
-   ----------------------------------------------------------------------- 
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/ 
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany 
-
-   See the README file in the USER-CUDA directory. 
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifndef _CUDA_COMMON_H_
-#define _CUDA_COMMON_H_
-
-//#include "cutil.h"
-#include "cuda_precision.h"
-#include "cuda_wrapper_cu.h"
-
-#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
-//this can not be arbitrarly large, since constant space is limited. 
-//in principle one could alter potentials to use global memory for parameters, some du that already since the first examples I encountered had a high number (20+) of atom types
-//Christian   
-#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
-#define CUDA_MAX_NSPECIAL 25
-
-// define some easy-to-use debug and emulation macros
-#ifdef _DEBUG
-#define MYDBG(a) a
-#else
-#define MYDBG(a) 
-#endif
-
-#if __DEVICE_EMULATION__
-#define MYEMU(a) a
-#else
-#define MYEMU(a) 
-#endif
-
-#define MYEMUDBG(a) MYEMU(MYDBG(a))
-
-// Add Prefix (needed as workaround, same constant's names in different files causes conflict)
-#define MY_ADD_PREFIX(prefix, var) prefix##_##var
-#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
-#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
-
-#define MY_VAR_TO_STR(var) #var
-#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
-#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
-
-#define CUDA_USE_TEXTURE
-#define CUDA_USE_FLOAT4
-
-//constants used by many classes
-
-//domain
-#define _boxhi       MY_AP(boxhi)
-#define _boxlo       MY_AP(boxlo)
-#define _subhi       MY_AP(subhi)
-#define _sublo       MY_AP(sublo)
-#define _box_size    MY_AP(box_size)
-#define _prd         MY_AP(prd)
-#define _periodicity MY_AP(periodicity)
-#define _triclinic	 MY_AP(triclinic)
-#define _boxhi_lamda MY_AP(boxhi_lamda)
-#define _boxlo_lamda MY_AP(boxlo_lamda)
-#define _prd_lamda   MY_AP(prd_lamda)
-#define _h		 	 MY_AP(h)
-#define _h_inv	 	 MY_AP(h_inv)
-#define _h_rate		 MY_AP(h_rate)
-__device__ __constant__ X_FLOAT _boxhi[3];
-__device__ __constant__ X_FLOAT _boxlo[3];
-__device__ __constant__ X_FLOAT _subhi[3];
-__device__ __constant__ X_FLOAT _sublo[3];
-__device__ __constant__ X_FLOAT _box_size[3];
-__device__ __constant__ X_FLOAT _prd[3];
-__device__ __constant__ int _periodicity[3];
-__device__ __constant__ int _triclinic;
-__device__ __constant__ X_FLOAT _boxhi_lamda[3];
-__device__ __constant__ X_FLOAT _boxlo_lamda[3];
-__device__ __constant__ X_FLOAT _prd_lamda[3];
-__device__ __constant__ X_FLOAT _h[6];
-__device__ __constant__ X_FLOAT _h_inv[6];
-__device__ __constant__ V_FLOAT _h_rate[6];
-
-
-//atom properties
-#define _x           MY_AP(x)
-#define _v           MY_AP(v)
-#define _f           MY_AP(f)
-#define _tag         MY_AP(tag)
-#define _type        MY_AP(type)
-#define _mask        MY_AP(mask)
-#define _image       MY_AP(image)
-#define _q           MY_AP(q)
-#define _mass        MY_AP(mass)
-#define _rmass       MY_AP(rmass)
-#define _rmass_flag  MY_AP(rmass_flag)
-#define _eatom       MY_AP(eatom)
-#define _vatom       MY_AP(vatom)
-#define _x_type      MY_AP(x_type)
-#define _radius      MY_AP(radius)
-#define _density     MY_AP(density)
-#define _omega       MY_AP(omega)
-#define _torque      MY_AP(torque)
-#define _special     MY_AP(special)
-#define _maxspecial  MY_AP(maxspecial)
-#define _nspecial    MY_AP(nspecial)
-#define _special_flag  MY_AP(special_flag)
-#define _molecule    MY_AP(molecule)
-#define _v_radius    MY_AP(v_radius)
-#define _omega_rmass MY_AP(omega_rmass)
-#define _freeze_group_bit MY_AP(freeze_group_bit)
-#define _map_array   MY_AP(map_array)
-__device__ __constant__ X_FLOAT* _x;  //holds pointer to positions
-__device__ __constant__ V_FLOAT* _v;
-__device__ __constant__ F_FLOAT* _f;
-__device__ __constant__ int* _tag;
-__device__ __constant__ int* _type;
-__device__ __constant__ int* _mask;
-__device__ __constant__ int* _image;
-__device__ __constant__ V_FLOAT* _mass;
-__device__ __constant__ F_FLOAT* _q;
-__device__ __constant__ V_FLOAT* _rmass;
-__device__ __constant__ int _rmass_flag;
-__device__ __constant__ ENERGY_FLOAT* _eatom;
-__device__ __constant__ ENERGY_FLOAT* _vatom;
-__device__ __constant__ X_FLOAT4* _x_type;  //holds pointer to positions
-__device__ __constant__ X_FLOAT* _radius;  
-__device__ __constant__ F_FLOAT* _density;  
-__device__ __constant__ V_FLOAT* _omega;  
-__device__ __constant__ F_FLOAT* _torque;  
-__device__ __constant__ int* _special;
-__device__ __constant__ int _maxspecial;
-__device__ __constant__ int* _nspecial;
-__device__ __constant__ int _special_flag[4];
-__device__ __constant__ int* _molecule;
-__device__ __constant__ V_FLOAT4* _v_radius;  //holds pointer to positions
-__device__ __constant__ V_FLOAT4* _omega_rmass;  //holds pointer to positions
-__device__ __constant__ int _freeze_group_bit;
-__device__ __constant__ int* _map_array;
-
-#ifdef CUDA_USE_TEXTURE
-	
-	#define _x_tex         MY_AP(x_tex)
-	#if X_PRECISION == 1
-	texture<float> _x_tex;
-	#else
-	texture<int2,1> _x_tex;
-	#endif
-	
-	#define _type_tex         MY_AP(type_tex)
-	texture<int> _type_tex;
-	
-	#define _x_type_tex         MY_AP(x_type_tex)
-	#if X_PRECISION == 1
-	texture<float4,1> _x_type_tex;
-	#else
-	texture<int4,1> _x_type_tex;
-	#endif
-	
-	#define _v_radius_tex         MY_AP(v_radius_tex)
-	#if V_PRECISION == 1
-	texture<float4,1> _v_radius_tex;
-	#else
-	texture<int4,1> _v_radius_tex;
-	#endif
-	
-	#define _omega_rmass_tex         MY_AP(omega_rmass_tex)
-	#if V_PRECISION == 1
-	texture<float4,1> _omega_rmass_tex;
-	#else
-	texture<int4,1> _omega_rmass_tex;
-	#endif
-
-	#define _q_tex         MY_AP(q_tex)
-	#if F_PRECISION == 1
-	texture<float> _q_tex;
-	#else
-	texture<int2,1> _q_tex;
-	#endif
-
-#endif
-
-//neighbor
-#ifdef IncludeCommonNeigh
-#define _inum        	MY_AP(inum)
-#define _inum_border    MY_AP(inum_border)
-#define _ilist       	MY_AP(ilist)
-#define _ilist_border 	MY_AP(ilist_border)
-#define _numneigh    	MY_AP(numneigh)
-#define _numneigh_border 	MY_AP(numneigh_border)
-#define _numneigh_inner		MY_AP(numneigh_inner)
-#define _firstneigh  	MY_AP(firstneigh)
-#define _neighbors 	MY_AP(neighbors)
-#define _neighbors_border 	MY_AP(neighbors_border)
-#define _neighbors_inner  	MY_AP(neighbors_inner)
-#define _reneigh_flag 	MY_AP(reneigh_flag)
-#define _triggerneighsq MY_AP(triggerneighsq)
-#define _xhold       	MY_AP(xhold)
-#define _maxhold     	MY_AP(maxhold)
-#define _dist_check     MY_AP(dist_check)
-#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
-#define _maxneighbors   MY_AP(maxneighbors)
-#define _overlap_comm   MY_AP(overlap_comm)
-__device__ __constant__ int _inum;
-__device__ __constant__ int* _inum_border;
-__device__ __constant__ int* _ilist;
-__device__ __constant__ int* _ilist_border;
-__device__ __constant__ int* _numneigh;
-__device__ __constant__ int* _numneigh_border;
-__device__ __constant__ int* _numneigh_inner;
-__device__ __constant__ int** _firstneigh;
-__device__ __constant__ int* _neighbors;
-__device__ __constant__ int* _neighbors_border;
-__device__ __constant__ int* _neighbors_inner;
-__device__ __constant__ int* _reneigh_flag;
-__device__ __constant__ X_FLOAT _triggerneighsq;
-__device__ __constant__ X_FLOAT* _xhold;  //holds pointer to positions
-__device__ __constant__ int _maxhold;
-__device__ __constant__ int _dist_check;
-__device__ __constant__ int _neighbor_maxlocal;
-__device__ __constant__ int _maxneighbors;
-__device__ __constant__ int _overlap_comm;
-#endif
-
-//system properties
-#define _nall        MY_AP(nall)
-#define _nghost      MY_AP(nghost)
-#define _nlocal      MY_AP(nlocal)
-#define _nmax        MY_AP(nmax)
-#define _cuda_ntypes MY_AP(cuda_ntypes)
-#define _dtf         MY_AP(dtf)
-#define _dtv         MY_AP(dtv)
-#define _factor      MY_AP(factor)
-#define _virial      MY_AP(virial)
-#define _eng_vdwl    MY_AP(eng_vdwl)
-#define _eng_coul    MY_AP(eng_coul)
-#define _molecular   MY_AP(molecular)
-__device__ __constant__ unsigned _nall;
-__device__ __constant__ unsigned _nghost;
-__device__ __constant__ unsigned _nlocal;
-__device__ __constant__ unsigned _nmax;
-__device__ __constant__ unsigned _cuda_ntypes;
-__device__ __constant__ V_FLOAT _dtf;
-__device__ __constant__ X_FLOAT _dtv;
-__device__ __constant__ V_FLOAT _factor;
-__device__ __constant__ ENERGY_FLOAT* _virial;
-__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
-__device__ __constant__ ENERGY_FLOAT* _eng_coul;
-__device__ __constant__ int _molecular;
-
-//other general constants
-#define _buffer      MY_AP(buffer)
-#define _flag		 MY_AP(flag)
-#define _debugdata   MY_AP(debugdata)
-__device__ __constant__ void* _buffer;
-__device__ __constant__ int* _flag;
-__device__ __constant__ int* _debugdata;
-
-// pointers to data fields on GPU are hold in constant space 
-// -> reduces register usage and number of parameters for kernelcalls 
-// will be variables of file scope in cuda files
-
-
-
-
-// maybe used to output cudaError_t
-#define MY_OUTPUT_RESULT(result) \
-	switch(result) \
-	{ \
-		case cudaSuccess: printf(" => cudaSuccess\n"); break; \
-		case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
-		case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
-		case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
-		case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
-		default: printf(" => unknown\n"); break; \
-	}
-
-#ifdef _DEBUG
-#  define CUT_CHECK_ERROR(errorMessage) {                                    \
-    cudaError_t err = cudaGetLastError();                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    err = cudaThreadSynchronize();                                           \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    }
-#else
-#  define CUT_CHECK_ERROR(errorMessage) {                                    \
-    cudaError_t err = cudaGetLastError();                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    }
-#endif
-
-#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
-    cudaError err = call;                                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
-                __FILE__, __LINE__, cudaGetErrorString( err) );              \
-        exit(EXIT_FAILURE);                                                  \
-    } }
-
-#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
-
-#define X_MASK 1
-#define V_MASK 2
-#define F_MASK 4
-#define TAG_MASK 8
-#define TYPE_MASK 16
-#define MASK_MASK 32
-#define IMAGE_MASK 64
-#define Q_MASK 128
-#define MOLECULE_MASK 256
-#define RMASS_MASK 512
-#define RADIUS_MASK 1024
-#define DENSITY_MASK 2048
-#define OMEGA_MASK 4096
-#define TORQUE_MASK 8192
-
-
-
-#endif // #ifdef _CUDA_COMMON_H_
diff --git a/src/USER-CUDA/cuda_precision.h b/src/USER-CUDA/cuda_precision.h
deleted file mode 100644
index 5b7d6a6843..0000000000
--- a/src/USER-CUDA/cuda_precision.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov 
-
-   See the README file in the top-level LAMMPS directory. 
-
-   ----------------------------------------------------------------------- 
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/ 
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany 
-
-   See the README file in the USER-CUDA directory. 
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifndef CUDA_PRECISION_H_
-#define CUDA_PRECISION_H_
-/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA.
- * Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation).
- * ***_FLOAT: type definition of given property
- * ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F)
- */
-
-#ifdef CUDA_USE_BINNING
-#define CUDA_IF_BINNING(a) a
-#else
-#define CUDA_IF_BINNING(a) 
-#endif
-
-//GLOBAL
-
-#ifdef CUDA_PRECISION
-  #if CUDA_PRECISION == 1
-    #define CUDA_FLOAT float
-    #define CUDA_F(x) x##f
-  #endif 
-  #if CUDA_PRECISION == 2
-    #define CUDA_FLOAT double
-    #define CUDA_F(x) x
-  #endif
-#endif
- 
-#ifndef CUDA_PRECISION
-  #define CUDA_FLOAT double
-  #define CUDA_F(x) x
-  #define CUDA_PRECISION 2
-#endif
-//--------------------------------
-//-----------FFT-----------------
-//--------------------------------
-
-#ifdef FFT_PRECISION_CU
-  #if FFT_PRECISION_CU == 1
-    #define FFT_FLOAT float
-    #define FFT_F(x) x##f 
-  #endif 
-  #if FFT_PRECISION_CU == 2
-    #define FFT_FLOAT double
-    #define FFT_F(x) x 
-  #endif
-#endif
-
-#ifndef FFT_PRECISION_CU
-  #define FFT_FLOAT CUDA_FLOAT
-  #define FFT_F(x) CUDA_F(x)
-  #define FFT_PRECISION_CU CUDA_PRECISION 
-#endif
-
-//--------------------------------
-//-----------PPPM-----------------
-//--------------------------------
-
-#ifdef PPPM_PRECISION
-  #if PPPM_PRECISION == 1
-    #define PPPM_FLOAT float
-    #define PPPM_F(x) x##f 
-  #endif 
-  #if PPPM_PRECISION == 2
-    #define PPPM_FLOAT double
-    #define PPPM_F(x) x 
-  #endif
-#endif
-
-#ifndef PPPM_PRECISION
-  #define PPPM_FLOAT CUDA_FLOAT
-  #define PPPM_F(x) CUDA_F(x) 
-  #define PPPM_PRECISION CUDA_PRECISION 
-#endif
-
-//--------------------------------
-//-----------FORCE-----------------
-//--------------------------------
-
-
-#ifdef F_PRECISION
-  #if F_PRECISION == 1
-    #define F_FLOAT float
-    #define F_F(x) x##f 
-  #endif 
-  #if F_PRECISION == 2
-    #define F_FLOAT double
-    #define F_F(x) x 
-  #endif
-#endif
-
-#ifndef F_PRECISION
-  #define F_FLOAT CUDA_FLOAT
-  #define F_F(x) CUDA_F(x) 
-  #define F_PRECISION CUDA_PRECISION 
-#endif
-
-#if F_PRECISION == 1
-#define _SQRT_ sqrtf
-#define _RSQRT_ rsqrtf
-#define _EXP_ expf
-#else
-#define _SQRT_ sqrt
-#define _RSQRT_ rsqrt
-#define _EXP_ exp
-#endif
-
-#if F_PRECISION == 2
-struct F_FLOAT2
-{
-  F_FLOAT x;
-  F_FLOAT y;
-};
-struct F_FLOAT3
-{
-  F_FLOAT x;
-  F_FLOAT y;
-  F_FLOAT z;
-};
-struct F_FLOAT4
-{
-  F_FLOAT x;
-  F_FLOAT y;
-  F_FLOAT z;
-  F_FLOAT w;
-};
-#else
-#define F_FLOAT2 float2
-#define F_FLOAT3 float3
-#define F_FLOAT4 float4
-#endif
-//--------------------------------
-//-----------ENERGY-----------------
-//--------------------------------
-
-#ifndef ENERGY_PRECISION
-  #define ENERGY_FLOAT CUDA_FLOAT
-  #define ENERGY_F(x) CUDA_F(x) 
-#endif
-
-#ifdef ENERGY_PRECISION
-  #if ENERGY_PRECISION == 1
-    #define ENERGY_FLOAT float
-    #define ENERGY_F(x) x##f 
-  #endif 
-  #if ENERGY_PRECISION == 2
-    #define ENERGY_FLOAT double
-    #define ENERGY_F(x) x 
-  #endif
-#endif
-
-#ifndef ENERGY_PRECISION
-  #define ENERGY_FLOAT CUDA_FLOAT
-  #define ENERGY_F(x) CUDA_F(x) 
-  #define ENERGY_PRECISION CUDA_PRECISION 
-#endif
-
-//--------------------------------
-//-----------POSITIONS------------
-//--------------------------------
-
-#ifdef X_PRECISION
-  #if X_PRECISION == 1
-    #define X_FLOAT float
-    #define X_F(x) x##f 
-  #endif 
-  #if X_PRECISION == 2
-    #define X_FLOAT double
-    #define X_F(x) x 
-  #endif
-#endif
-
-#ifndef X_PRECISION
-  #define X_FLOAT CUDA_FLOAT
-  #define X_F(x) CUDA_F(x) 
-  #define X_PRECISION CUDA_PRECISION 
-#endif
-
-#if X_PRECISION == 2
-struct X_FLOAT2
-{
-  X_FLOAT x;
-  X_FLOAT y;
-};
-struct X_FLOAT3
-{
-  X_FLOAT x;
-  X_FLOAT y;
-  X_FLOAT z;
-};
-struct X_FLOAT4
-{
-  X_FLOAT x;
-  X_FLOAT y;
-  X_FLOAT z;
-  X_FLOAT w;
-};
-#else
-#define X_FLOAT2 float2
-#define X_FLOAT3 float3
-#define X_FLOAT4 float4
-#endif
-
-//--------------------------------
-//-----------velocities-----------
-//--------------------------------
-
-#ifdef V_PRECISION
-  #if V_PRECISION == 1
-    #define V_FLOAT float
-    #define V_F(x) x##f  
-  #endif 
-  #if V_PRECISION == 2
-    #define V_FLOAT double
-    #define V_F(x) x  
-  #endif
-#endif
-
-#ifndef V_PRECISION
-  #define V_FLOAT CUDA_FLOAT
-  #define V_F(x) CUDA_F(x) 
-  #define V_PRECISION CUDA_PRECISION 
-#endif
-
-#if V_PRECISION == 2
-struct V_FLOAT4
-{
-  V_FLOAT x;
-  V_FLOAT y;
-  V_FLOAT z;
-  V_FLOAT w;
-};
-#else
-#define V_FLOAT4 float4
-#endif
-
-#ifdef NO_PREC_TIMING
-struct timespec_2
-{
-	unsigned int tv_sec;
-	unsigned int tv_nsec;
-};
-
-#define timespec timespec_2
-#define clock_gettime(a,b) 
-#endif
-#endif /*CUDA_PRECISION_H_*/
diff --git a/src/USER-CUDA/cuda_shared.h b/src/USER-CUDA/cuda_shared.h
deleted file mode 100644
index f7983fff05..0000000000
--- a/src/USER-CUDA/cuda_shared.h
+++ /dev/null
@@ -1,378 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov 
-
-   See the README file in the top-level LAMMPS directory. 
-
-   ----------------------------------------------------------------------- 
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/ 
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany 
-
-   See the README file in the USER-CUDA directory. 
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifndef _CUDA_SHARED_H_
-#define _CUDA_SHARED_H_
-#include "cuda_precision.h"
-
-#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int)
-
-struct dev_array
-{
-	void* dev_data;			// pointer to memory address on cuda device
-	unsigned dim[3];		// array dimensions
-};
-
-struct cuda_shared_atom		// relevent data from atom class
-{
-	dev_array dx; 			// cumulated distance for binning settings
-	dev_array x;			// position
-	dev_array v;			// velocity
-	dev_array f;			// force
-	dev_array tag;
-	dev_array type; 		// global ID number, there are ghosttype = ntypes  (ntypescuda=ntypes+1)   
-	dev_array mask;
-	dev_array image; 		
-	dev_array q;			// charges
-	dev_array mass;			// per-type masses
-	dev_array rmass;		// per-atom masses
-	dev_array radius;		// per-atom radius
-	dev_array density;
-	dev_array omega;
-	dev_array torque;
-	dev_array molecule;
-	
-	dev_array special;
-	int maxspecial;
-	dev_array nspecial;
-	int* special_flag;
-	int molecular;
-	
-	dev_array eatom;		// per-atom energy
-	dev_array vatom;		// per-atom virial
-	int need_eatom;
-	int need_vatom;
-	
-	dev_array x_type;		// position + type in X_FLOAT4 struct
-	dev_array v_radius;		// velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style
-	dev_array omega_rmass;		// velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style
-
-	double* mass_host;		// remember per-type host pointer to masses
-	//int natoms;				// total # of atoms in system, could be 0
-	int nghost;				// and ghost atoms on this proc
-	int nlocal;				// # of owned
-	int nall;			    // total # of atoms in this proc
-	int nmax;				// max # of owned+ghost in arrays on this proc 	
-	int ntypes;
-	int q_flag;				// do we have charges?
-	int rmass_flag;			// do we have per-atom masses?
-	int firstgroup;
-	int nfirst;
-	
- 	int update_nlocal;
- 	int update_nmax;
- 	
-	dev_array xhold;	    // position at last neighboring
- 	X_FLOAT triggerneighsq;		// maximum square movement before reneighboring
- 	int reneigh_flag;		// is reneighboring necessary
- 	int maxhold;			// size of xhold
- 	int dist_check; 		//perform distance check for reneighboring
- 	dev_array binned_id;    //id of each binned atom (not tag!!)
- 	dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]]
-    float bin_extraspace;	
-	int bin_dim[3];
-	int bin_nmax;
-	dev_array map_array;
-};
-
-struct cuda_shared_pair		// relevent data from pair class
-{
-	char cudable_force;		// check for (cudable_force!=0)
-	X_FLOAT cut_global;
-	X_FLOAT cut_inner_global;
-	X_FLOAT cut_coul_global;
-	double** cut;			// type-type cutoff 
-	double** cutsq;			// type-type cutoff 
-	double** cut_inner;			// type-type cutoff for coul
-	double** cut_coul;			// type-type cutoff for coul
-	double** coeff1;		// tpye-type pair parameters
-	double** coeff2;
-	double** coeff3;
-	double** coeff4;
-	double** coeff5;		
-	double** coeff6;		
-	double** coeff7;		
-	double** coeff8;		
-	double** coeff9;		
-	double** coeff10;		
-	double** offset;
-	double* special_lj;
-	double* special_coul;
-	dev_array virial; // ENERGY_FLOAT
-	dev_array eng_vdwl; // ENERGY_FLOAT
-	dev_array eng_coul; // ENERGY_FLOAT
-	X_FLOAT cut_coulsq_global;
-	F_FLOAT g_ewald,kappa;
-	int freeze_group_bit;
-	
-	dev_array coeff1_gm;
-	dev_array coeff2_gm;
-	dev_array coeff3_gm;
-	dev_array coeff4_gm;
-	dev_array coeff5_gm;
-	dev_array coeff6_gm;
-	dev_array coeff7_gm;
-	dev_array coeff8_gm;
-	dev_array coeff9_gm;
-	dev_array coeff10_gm;
-	
-	int lastgridsize;
-	int n_energy_virial;
-	int collect_forces_later;
-	int use_block_per_atom;
-	int override_block_per_atom;
-	
-};
-
-struct cuda_shared_domain	// relevent data from domain class
-{
-	X_FLOAT sublo[3];			// orthogonal box -> sub-box bounds on this proc
-	X_FLOAT subhi[3];
-	X_FLOAT boxlo[3];
-	X_FLOAT boxhi[3];
-	X_FLOAT prd[3];
-	int periodicity[3];		// xyz periodicity as array
-
-	int triclinic;
-	X_FLOAT xy;
-	X_FLOAT xz;
-	X_FLOAT yz;
-	X_FLOAT boxlo_lamda[3];
-	X_FLOAT boxhi_lamda[3];
-	X_FLOAT prd_lamda[3];
-	X_FLOAT h[6];
-	X_FLOAT h_inv[6];
-	V_FLOAT h_rate[6];
-	int update;
-};
-
-struct cuda_shared_pppm
-{
-   char cudable_force;
-#ifdef FFT_CUFFT  
-   FFT_FLOAT* work1;
-   FFT_FLOAT* work2;
-   FFT_FLOAT* work3;
-   PPPM_FLOAT* greensfn;
-   PPPM_FLOAT* fkx;
-   PPPM_FLOAT* fky;
-   PPPM_FLOAT* fkz;
-   PPPM_FLOAT* vg;
-#endif
-   int* part2grid;
-   PPPM_FLOAT* density_brick;
-   int* density_brick_int;
-   PPPM_FLOAT density_intScale;
-   PPPM_FLOAT* vdx_brick;
-   PPPM_FLOAT* vdy_brick;
-   PPPM_FLOAT* vdz_brick;
-   PPPM_FLOAT* density_fft;
-   ENERGY_FLOAT* energy;
-   ENERGY_FLOAT* virial;
-   int nxlo_in;
-   int nxhi_in;
-   int nxlo_out;
-   int nxhi_out;
-   int nylo_in;
-   int nyhi_in;
-   int nylo_out;
-   int nyhi_out;
-   int nzlo_in;
-   int nzhi_in;
-   int nzlo_out;
-   int nzhi_out;
-   int nx_pppm;
-   int ny_pppm;
-   int nz_pppm;
-   PPPM_FLOAT qqrd2e;
-   int order;
-  // float3 sublo;
-   PPPM_FLOAT* rho_coeff;
-   int nmax;
-   int nlocal;
-   PPPM_FLOAT* debugdata;
-   PPPM_FLOAT delxinv;
-   PPPM_FLOAT delyinv;
-   PPPM_FLOAT delzinv;
-   int nlower;
-   int nupper;
-   PPPM_FLOAT shiftone;
-   
-};
-
-struct cuda_shared_comm
-{
-   int maxswap;
-   int maxlistlength;
-   dev_array pbc;
-   dev_array slablo;
-   dev_array slabhi;
-   dev_array multilo;
-   dev_array multihi;
-   dev_array sendlist;
-   int grow_flag;
-   int comm_phase;
-   
-   int nsend;
-   int* nsend_swap;
-   int* send_size;
-   int* recv_size; 
-   double** buf_send;
-   void** buf_send_dev;
-   double** buf_recv;
-   void** buf_recv_dev;
-   void* buffer;
-   int buffer_size;
-   double overlap_split_ratio;
-};
-
-struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data
-{
-	int maxlocal;
-	int inum;                // # of I atoms neighbors are stored for local indices of I atoms
-	int inum_border2;
-	dev_array inum_border;         // # of atoms which interact with border atoms
-	dev_array ilist;
-	dev_array ilist_border;
-	dev_array numneigh;
-	dev_array numneigh_inner;
-	dev_array numneigh_border;
-	dev_array firstneigh;
-	dev_array neighbors;
-	dev_array neighbors_border;
-	dev_array neighbors_inner;
-	int maxpage;
-	dev_array page_pointers;
-	dev_array* pages;
-	int maxneighbors;
-	int neigh_lists_per_page;
-	double** cutneighsq;
-	CUDA_FLOAT* cu_cutneighsq;
-	int* binned_id;
-	int* bin_dim;
-	int bin_nmax;
-	float bin_extraspace;
-	double maxcut;
-	dev_array ex_type;
-	int nex_type;
-	dev_array ex1_bit;
-	dev_array ex2_bit;
-	int nex_group;
-	dev_array ex_mol_bit;
-	int nex_mol;
-	
-};
-
-struct cuda_compile_settings		// this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files
-{
-    int prec_glob;
-    int prec_x;
-    int prec_v;
-    int prec_f;
-    int prec_pppm;
-    int prec_fft;
-    int cufft;
-    int arch;
-};
-
-struct cuda_timings_struct
-{
-	//Debug:
-	double test1;
-	double test2;
-	//transfers
-	double transfer_upload_tmp_constr;
-	double transfer_download_tmp_deconstr;
-	
-	//communication
-	double comm_forward_total;
-	double comm_forward_mpi_upper;
-	double comm_forward_mpi_lower;
-	double comm_forward_kernel_pack;
-	double comm_forward_kernel_unpack;
-	double comm_forward_kernel_self;
-	double comm_forward_upload;
-	double comm_forward_download;
-
-	double comm_exchange_total;
-	double comm_exchange_mpi;
-	double comm_exchange_kernel_pack;
-	double comm_exchange_kernel_unpack;
-	double comm_exchange_kernel_fill;
-	double comm_exchange_cpu_pack;
-	double comm_exchange_upload;
-	double comm_exchange_download;
-
-	double comm_border_total;
-	double comm_border_mpi;
-	double comm_border_kernel_pack;
-	double comm_border_kernel_unpack;
-	double comm_border_kernel_self;
-	double comm_border_kernel_buildlist;
-	double comm_border_upload;
-	double comm_border_download;
-	
-	//pair forces
-	double pair_xtype_conversion;
-	double pair_kernel;
-	double pair_virial;
-	double pair_force_collection;
-	
-	//neighbor
-	double neigh_bin;
-	double neigh_build;
-	double neigh_special;
-	
-	//PPPM
- 	double pppm_particle_map; 
-    double pppm_make_rho; 
-    double pppm_brick2fft; 
-    double pppm_poisson; 
-    double pppm_fillbrick; 
-    double pppm_fieldforce; 
-    double pppm_compute; 
-	
-};
-
-struct cuda_shared_data		// holds space for all relevent data from the different classes
-{
-	void* buffer; //holds temporary GPU data [data used in subroutines, which has not to be consistend outside of that routine]
-	int buffersize; //maxsize of buffer
-	int buffer_new; //should be 1 if the pointer to buffer has changed
-	void* flag;
-	void* debugdata;  //array for easily collecting debugdata from device class cuda contains the corresponding cu_debugdata and host array
-	cuda_shared_atom atom;
-	cuda_shared_pair pair;	
-	cuda_shared_domain domain;
-	cuda_shared_pppm pppm;
-	cuda_shared_comm comm;
-	cuda_compile_settings compile_settings;
-	cuda_timings_struct cuda_timings;
-	int exchange_dim;
-	int me; //mpi rank
-	unsigned int datamask;
-	int overlap_comm;
-};
-
-
-#endif // #ifndef _CUDA_SHARED_H_
diff --git a/src/USER-CUDA/neighbor_cuda.cpp b/src/USER-CUDA/neighbor_cuda.cpp
index 99bf2dce3c..dc5af9f2f8 100644
--- a/src/USER-CUDA/neighbor_cuda.cpp
+++ b/src/USER-CUDA/neighbor_cuda.cpp
@@ -26,6 +26,9 @@
 
 using namespace LAMMPS_NS;
 
+
+
+
 enum{NSQ,BIN,MULTI};     // also in neigh_list.cpp
 
 /* ---------------------------------------------------------------------- */
@@ -56,9 +59,9 @@ void NeighborCuda::choose_build(int index, NeighRequest *rq)
 {
   Neighbor::choose_build(index,rq);
   
-  if (rq->full && style == NSQ && rq->ghost == 0 && rq->cudable)
+  if (rq->full && style == NSQ && rq->cudable)
     pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_nsq_cuda;
-  else if (rq->full && style == BIN && rq->ghost == 0 && rq->cudable)
+  else if (rq->full && style == BIN && rq->cudable)
     pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_bin_cuda;
 }
 
diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp
index 0c3f675fae..fbaa1800a5 100644
--- a/src/USER-CUDA/verlet_cuda.cpp
+++ b/src/USER-CUDA/verlet_cuda.cpp
@@ -21,6 +21,7 @@
    This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */
 
+
 #include <cstdlib>
 #include <cstdio>
 #include <cstring>
@@ -56,6 +57,7 @@ using namespace LAMMPS_NS;
 
 #define MAKETIMEING
 
+
 VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, arg) {	
   cuda = lmp->cuda;
    if(cuda == NULL)
@@ -132,20 +134,19 @@ void VerletCuda::setup()
   cuda->uploadAll();
   neighbor->build();
   neighbor->ncalls = 0;
-  cuda->uploadAllNeighborLists();
+
   if(atom->mass)
   cuda->cu_mass->upload();
  
   if(cuda->cu_map_array)
   cuda->cu_map_array->upload();
- 
+
   // compute all forces
 
   ev_set(update->ntimestep);
   if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
   if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
   if(elist_atom||vlist_atom) cuda->checkResize();
-  
 
   int test_BpA_vs_TpA = true;
   timespec starttime;